In [5]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import re
import spacy
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [6]:
# Load data from Google Sheets link
data_url = "https://docs.google.com/spreadsheets/d/117X6i53dKiO7w6kuA1g1TpdTlv1173h_dPlJt5cNNMU/export?format=csv"
dataset_original = pd.read_csv(data_url)

In [7]:
# Inspecting & Cleaning the dataset
dataset_cleaned_temp = dataset_original.copy()

In [8]:
# Checking missing data
print(dataset_cleaned_temp.isnull().sum())

id              0
job_title       0
location        0
connection      0
fit           104
dtype: int64


In [9]:
# Checking duplicates
print(dataset_cleaned_temp.duplicated().sum())

0


In [10]:
# Remove unnecessary words & Replace abbreviations
spacy_nlp = spacy.load('en_core_web_sm')
stemmer = PorterStemmer()
abbreviations_to_replace = {
    'GPHR': 'Global Professional in Human Resources',
    'CSR': 'Corporate Social Responsibility',
    'MES': 'Manufacturing Execution Systems',
    'SPHR': 'Senior Professional in Human Resources',
    'SVP': 'Senior Vice President',
    'GIS': 'Geographic Information System',
    'RRP': 'Reduced Risk Products',
    'CHRO': 'Chief Human Resources Officer',
    'HRIS': 'Human resources information system',
    'HR': 'Human resources',
}

def replace_abbreviations(sentence):
    replaced_sentence = sentence
    for abbreviation, replacement in abbreviations_to_replace.items():
        pattern = r'\b{}\b'.format(re.escape(abbreviation))
        replaced_sentence = re.sub(pattern, replacement, replaced_sentence, flags=re.IGNORECASE)
    return replaced_sentence

def clean_sentence(sentence):
    new_sentence = re.sub(r'[+*,.|(){}&\-\']', '', sentence)
    new_sentence = replace_abbreviations(new_sentence)
    words = new_sentence.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_sentence = " ".join([token.lemma_ for token in spacy_nlp(" ".join(stemmed_words)) if not token.is_stop])
    return lemmatized_sentence

dataset_cleaned_temp['job_title_cleaned'] = dataset_cleaned_temp['job_title'].apply(clean_sentence)

In [11]:
# Preprocessing
dataset_preprocessed = dataset_cleaned_temp.copy()

In [12]:
# Setup BERT & Utils
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

def get_bert_embeddings(sentences):
    embeddings = []
    for sentence in sentences:
        encoded_inputs = bert_tokenizer(sentence, padding=True, truncation=True, return_tensors='tf')
        outputs = bert_model(encoded_inputs)
        embeddings.append(tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy().reshape(-1))
    return np.array(embeddings)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [13]:
# Setup Doc2Vec
tagged_data = [TaggedDocument(words=clean_sentence(job_title).split(), tags=[str(i)]) for i, job_title in enumerate(dataset_preprocessed['job_title_cleaned'])]
doc2vec_model = Doc2Vec(vector_size=768, window=2, min_count=1, workers=4, epochs=40)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

def get_doc2vec_embeddings(sentences):
    embeddings = []
    for sentence in sentences:
        embeddings.append(doc2vec_model.infer_vector(sentence.split()))
    return np.array(embeddings)

In [14]:
# Encode and get similarity
def encode_and_get_similarity(data, queries, search_columns, output_columns):
    data = data.copy()
    bert_embeddings = {}
    doc2vec_embeddings = {}
    queries_embeddings = []
    doc2vec_queries_embeddings = []

    for index, query in enumerate(queries):
        query_cleaned = replace_abbreviations(query)
        query_cleaned = clean_sentence(query_cleaned)
        queries_embeddings.append(get_bert_embeddings([query_cleaned]))
        doc2vec_queries_embeddings.append(get_doc2vec_embeddings([query_cleaned]))

    queries_embeddings_mean = np.mean(queries_embeddings, axis=0)
    doc2vec_queries_embeddings_mean = np.mean(doc2vec_queries_embeddings, axis=0)

    for index, column in enumerate(search_columns):
        sentences = dataset_preprocessed[column].tolist()
        bert_embeddings[column] = get_bert_embeddings(sentences)
        doc2vec_embeddings[column] = get_doc2vec_embeddings(sentences)
        
        bert_cosine_similarities = cosine_similarity(queries_embeddings_mean, bert_embeddings[column])
        doc2vec_cosine_similarities = cosine_similarity(doc2vec_queries_embeddings_mean, doc2vec_embeddings[column])
        
        data[output_columns[0]] = bert_cosine_similarities[0]
        data[output_columns[1]] = doc2vec_cosine_similarities[0]
    
    return data

In [15]:
# Search queries/keywords
queries = [
    'aspiring human resources',
    'seeking human resources'
]

In [16]:
# Get embeddings & similarities
dataset_preprocessed = encode_and_get_similarity(dataset_preprocessed, queries, ['job_title_cleaned'], ['bert_similarity', 'doc2vec_similarity'])

In [17]:
# Calculate mean of BERT and Doc2Vec similarities
dataset_preprocessed['mean_score'] = dataset_preprocessed[['bert_similarity', 'doc2vec_similarity']].mean(axis=1)

In [18]:
# Sort the dataframe based on the new mean_score in descending order
dataset_preprocessed = dataset_preprocessed.sort_values(by='mean_score', ascending=False)

In [19]:
# First Rank for BERT and Doc2Vec using mean_score
first_rank_bert = dataset_preprocessed.sort_values(by='bert_similarity', ascending=False).head(20)
first_rank_doc2vec = dataset_preprocessed.sort_values(by='doc2vec_similarity', ascending=False).head(20)

In [20]:
# Starred Candidates
# Mark them as favorite/bookmark
starred_ids = [int(item) for item in input("Enter the ids of the candidates you want to star (separate by spaces): ").split()]

Enter the ids of the candidates you want to star (separate by spaces): 


In [21]:
# Second Rank (Re-Rank)
dataset_preprocessed.loc[dataset_preprocessed['id'].isin(starred_ids), 'is_starred'] = 1
dataset_preprocessed.loc[~dataset_preprocessed['id'].isin(starred_ids), 'is_starred'] = 0

def get_starred_score(data):
    data = data.copy()
    queries = data[data['is_starred'] == 1]['job_title_cleaned']
    similarities = []
    for query in queries:
        print('START: ' + query)
        data = encode_and_get_similarity(data, [query], ['job_title_cleaned'], ['starred_similarity'])
        similarities.append(data['starred_similarity'])
        
    starred_similarity = np.mean(similarities, axis=0)
    return starred_similarity

dataset_preprocessed['starred_similarity'] = get_starred_score(dataset_preprocessed)
dataset_preprocessed['mean_similarity_bert'] = dataset_preprocessed[['bert_similarity', 'starred_similarity']].mean(axis=1)
dataset_preprocessed['mean_similarity_doc2vec'] = dataset_preprocessed[['doc2vec_similarity', 'starred_similarity']].mean(axis=1)

final_rank_bert = dataset_preprocessed[['job_title', 'is_starred', 'bert_similarity', 'starred_similarity', 'mean_similarity_bert']].sort_values(by=['mean_similarity_bert', 'is_starred'], ascending=False).head(20)
final_rank_doc2vec = dataset_preprocessed[['job_title', 'is_starred', 'doc2vec_similarity', 'starred_similarity', 'mean_similarity_doc2vec']].sort_values(by=['mean_similarity_doc2vec', 'is_starred'], ascending=False).head(20)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [22]:
print("Final Rank using BERT embeddings:")
print(final_rank_bert)

print("Final Rank using Doc2Vec embeddings:")
print(final_rank_doc2vec)

Final Rank using BERT embeddings:
                                             job_title  is_starred  \
59                 Aspiring Human Resources Specialist         0.0   
35                 Aspiring Human Resources Specialist         0.0   
5                  Aspiring Human Resources Specialist         0.0   
48                 Aspiring Human Resources Specialist         0.0   
23                 Aspiring Human Resources Specialist         0.0   
98                    Seeking Human Resources Position         0.0   
67             Human Resources Specialist at Luxottica         0.0   
87                    Human Resources Management Major         0.0   
100              Human Resources Generalist at Loparex         0.0   
96               Aspiring Human Resources Professional         0.0   
45               Aspiring Human Resources Professional         0.0   
16               Aspiring Human Resources Professional         0.0   
57               Aspiring Human Resources Professional  

In [23]:
print("Top candidates sorted by mean similarity score:")
print(dataset_preprocessed[['job_title', 'bert_similarity', 'doc2vec_similarity', 'mean_score']].head(20))  # Display top sorted candidates

Top candidates sorted by mean similarity score:
                                             job_title  bert_similarity  \
59                 Aspiring Human Resources Specialist         0.918639   
35                 Aspiring Human Resources Specialist         0.918639   
5                  Aspiring Human Resources Specialist         0.918639   
98                    Seeking Human Resources Position         0.899539   
67             Human Resources Specialist at Luxottica         0.890844   
87                    Human Resources Management Major         0.883634   
48                 Aspiring Human Resources Specialist         0.918639   
23                 Aspiring Human Resources Specialist         0.918639   
100              Human Resources Generalist at Loparex         0.867990   
96               Aspiring Human Resources Professional         0.860986   
45               Aspiring Human Resources Professional         0.860986   
16               Aspiring Human Resources Profession