In [None]:
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import torch
import re
import pandas as pd
import lightgbm as lgb
import numpy as np
from lightgbm import LGBMRanker
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
df = pd.read_csv("/content/potential-talents - Aspiring human resources - seeking human resources.csv")

In [None]:
# Function to convert "500+" to "500" and leave the rest unchanged
def convert_connection(connection):
    if '+' in str(connection):
        return int(str(connection).replace('+', ''))
    else:
        return int(connection)

In [None]:
# Apply the conversion to the "connection" column
df['connection'] = df['connection'].apply(convert_connection)

In [None]:
# Scale the "connection" column to the range [0, 1]
scaler = MinMaxScaler()
df['scaled_connection'] = scaler.fit_transform(df[['connection']])

In [None]:
# Function for text preprocessing
def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    porter_stemmer = PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [None]:
# Apply preprocessing to each job title
df['processed_job_title'] = df['job_title'].apply(preprocess_text)

In [None]:
# Function to tokenize and tag documents
def tag_documents(df):
    tagged_data = []
    for i, row in df.iterrows():
        tagged_data.append(TaggedDocument(words=row['processed_job_title'], tags=[str(row.name)]))
    return tagged_data

In [None]:
# Tokenize and tag documents
tagged_data = tag_documents(df)
print(tagged_data)

[TaggedDocument(words=['2019', 'bauer', 'colleg', 'busi', 'graduat', 'magna', 'cum', 'laud', 'aspir', 'human', 'resourc', 'profession'], tags=['0']), TaggedDocument(words=['nativ', 'english', 'teacher', 'epik', 'english', 'program', 'korea'], tags=['1']), TaggedDocument(words=['aspir', 'human', 'resourc', 'profession'], tags=['2']), TaggedDocument(words=['peopl', 'develop', 'coordin', 'ryan'], tags=['3']), TaggedDocument(words=['advisori', 'board', 'member', 'celal', 'bayar', 'univers'], tags=['4']), TaggedDocument(words=['aspir', 'human', 'resourc', 'specialist'], tags=['5']), TaggedDocument(words=['student', 'humber', 'colleg', 'aspir', 'human', 'resourc', 'generalist'], tags=['6']), TaggedDocument(words=['hr', 'senior', 'specialist'], tags=['7']), TaggedDocument(words=['student', 'humber', 'colleg', 'aspir', 'human', 'resourc', 'generalist'], tags=['8']), TaggedDocument(words=['seek', 'human', 'resourc', 'hri', 'generalist', 'posit'], tags=['9']), TaggedDocument(words=['student', 'c

In [None]:
# Train a Doc2Vec model
model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Get the embedding for "Aspiring human resources" (w1)
w1 = 'Aspiring human resources'
vector_w1 = model.infer_vector(preprocess_text(w1))

In [None]:
# Add another word "seeking human resources" (w2)
w2 = 'Seeking human resources'
vector_w2 = model.infer_vector(preprocess_text(w2))

In [None]:
# Calculate cosine similarity between w1 and w2
similarity_w1_w2 = cosine_similarity([vector_w1], [vector_w2])[0][0]

In [None]:
# Print the cosine similarity between w1 and w2
print(f'Cosine Similarity between "{w1}" and "{w2}": {similarity_w1_w2}')

Cosine Similarity between "Aspiring human resources" and "Seeking human resources": 0.9877694845199585


In [None]:
 # Calculate cosine similarity between w1, w2, and each job title
similarities_w1 = []
similarities_w2 = []
for i, row in df.iterrows():
    title = row['job_title']
    vector_title = model.infer_vector(preprocess_text(title))
    similarity_w1 = cosine_similarity([vector_w1], [vector_title])[0][0]
    similarity_w2 = cosine_similarity([vector_w2], [vector_title])[0][0]
    similarities_w1.append(similarity_w1)
    similarities_w2.append(similarity_w2)

In [None]:
# Add cosine similarities to the DataFrame
df['cosine_similarity_w1'] = similarities_w1
df['cosine_similarity_w2'] = similarities_w2

In [None]:
# Weighted sum of cosine_similarity and scaled_connection
weight_cosine = 0.8
weight_connection = 0.2
df['ranking_w1'] = weight_cosine * df['cosine_similarity_w1'] + weight_connection * df['scaled_connection']
df['ranking_w2'] = weight_cosine * df['cosine_similarity_w2'] + weight_connection * df['scaled_connection']

In [None]:
# Sort the DataFrame based on the "ranking_w1" column in descending order
df_sorted = df.sort_values(by='ranking_w1', ascending=False)

In [None]:
# Print the sorted DataFrame
print(df_sorted[['job_title', 'scaled_connection', 'cosine_similarity_w1', 'cosine_similarity_w2', 'ranking_w1', 'ranking_w2']])

                                             job_title  scaled_connection  \
84   RRP Brand Portfolio Executive at JTI (Japan To...           1.000000   
70     Human Resources Generalist at ScottMadden, Inc.           1.000000   
100              Human Resources Generalist at Loparex           1.000000   
77              Human Resources Generalist at Schwan's           1.000000   
67             Human Resources Specialist at Luxottica           1.000000   
..                                                 ...                ...   
0    2019 C.T. Bauer College of Business Graduate (...           0.168337   
30   2019 C.T. Bauer College of Business Graduate (...           0.168337   
14   2019 C.T. Bauer College of Business Graduate (...           0.168337   
18   2019 C.T. Bauer College of Business Graduate (...           0.168337   
56   2019 C.T. Bauer College of Business Graduate (...           0.168337   

     cosine_similarity_w1  cosine_similarity_w2  ranking_w1  ranking_w2  
8

### Adding Bert Embeddings_Doctovec

In [None]:
pip install transformers



In [None]:
# Tokenize and tag documents for Doc2Vec
tagged_data_doc2vec = tag_documents(df)

In [None]:
# Train a Doc2Vec model
model_doc2vec = Doc2Vec(vector_size=50, window=2, min_count=1, workers=4, epochs=100)
model_doc2vec.build_vocab(tagged_data_doc2vec)
model_doc2vec.train(tagged_data_doc2vec, total_examples=model_doc2vec.corpus_count, epochs=model_doc2vec.epochs)

In [None]:
# Get the embeddings for "Aspiring human resources" and "Seeking human resources"
w1_doc2vec = model_doc2vec.infer_vector(preprocess_text('Aspiring human resources'))
w2_doc2vec = model_doc2vec.infer_vector(preprocess_text('Seeking human resources'))

In [None]:
# Calculate cosine similarity between w1_doc2vec and w2_doc2vec
cosine_similarity_doc2vec = cosine_similarity([w1_doc2vec], [w2_doc2vec])[0][0]

In [None]:
# Add cosine similarities to the DataFrame
df['cosine_similarity_doc2vec'] = cosine_similarity_doc2vec

In [None]:
df['scaled_connection_doc2vec'] = MinMaxScaler().fit_transform(df[['connection']])
df['ranking_doc2vec'] = 0.8 * df['cosine_similarity_doc2vec'] + 0.2 * df['scaled_connection_doc2vec']
df_sorted_doc2vec = df.sort_values(by='ranking_doc2vec', ascending=False)

In [None]:
# Display the DataFrame with Doc2Vec embeddings
print("DataFrame with Doc2Vec embeddings:")
print(df_sorted_doc2vec[['job_title', 'connection', 'cosine_similarity_doc2vec', 'ranking_doc2vec']])

DataFrame with Doc2Vec embeddings:
                                            job_title  connection  \
52  Seeking Human Resources HRIS and Generalist Po...         500   
58             People Development Coordinator at Ryan         500   
34    Advisory Board Member at Celal Bayar University         500   
37                               HR Senior Specialist         500   
39  Seeking Human Resources HRIS and Generalist Po...         500   
..                                                ...         ...   
48                Aspiring Human Resources Specialist           1   
35                Aspiring Human Resources Specialist           1   
59                Aspiring Human Resources Specialist           1   
5                 Aspiring Human Resources Specialist           1   
23                Aspiring Human Resources Specialist           1   

    cosine_similarity_doc2vec  ranking_doc2vec  
52                   0.987769         0.990216  
58                   0.987769         

In [None]:
# For BERT embeddings, need a pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Function to get BERT embeddings
def get_bert_embedding(text):
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        output = model_bert(**tokens)
    return output.pooler_output.numpy().squeeze()

In [None]:
# Get BERT embeddings for "Aspiring human resources" and "Seeking human resources"
w1_bert = get_bert_embedding('Aspiring human resources')
w2_bert = get_bert_embedding('Seeking human resources')

In [None]:
# Calculate cosine similarity between w1_bert and w2_bert
cosine_similarity_bert = cosine_similarity([w1_bert], [w2_bert])[0][0]

In [None]:
# Add cosine similarities to the DataFrame
df['cosine_similarity_bert'] = cosine_similarity_bert

In [None]:
df['scaled_connection_bert'] = MinMaxScaler().fit_transform(df[['connection']])
df['ranking_bert'] = 0.8 * df['cosine_similarity_bert'] + 0.2 * df['scaled_connection_bert']
df_sorted_bert = df.sort_values(by='ranking_bert', ascending=False)

In [None]:
# Display the DataFrame with BERT embeddings
print("\nDataFrame with BERT embeddings:")
print(df_sorted_bert[['job_title', 'connection', 'cosine_similarity_bert', 'ranking_bert']])


DataFrame with BERT embeddings:
                                            job_title  connection  \
52  Seeking Human Resources HRIS and Generalist Po...         500   
58             People Development Coordinator at Ryan         500   
34    Advisory Board Member at Celal Bayar University         500   
37                               HR Senior Specialist         500   
39  Seeking Human Resources HRIS and Generalist Po...         500   
..                                                ...         ...   
48                Aspiring Human Resources Specialist           1   
35                Aspiring Human Resources Specialist           1   
59                Aspiring Human Resources Specialist           1   
5                 Aspiring Human Resources Specialist           1   
23                Aspiring Human Resources Specialist           1   

    cosine_similarity_bert  ranking_bert  
52                0.990385      0.992308  
58                0.990385      0.992308  
34       

###Bert Embeddings for sentencetovec

In [None]:
pip install sentence-transformers



In [None]:
# BERT Model and Tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
 #Function to get BERT embeddings for a sentence
def get_bert_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model(**inputs)
    return outputs['last_hidden_state'].mean(dim=1).squeeze().detach().numpy()

In [None]:
# Function to get BERT embeddings for a given text
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Using mean of hidden states as embeddings
    return embeddings.numpy()

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# BERT model for sentence embeddings
model = SentenceTransformer('distilbert-base-uncased')



In [None]:
# Function for calculating cosine similarity between two sentences
def calculate_cosine_similarity(sentence1, sentence2):
    embeddings = model.encode([sentence1, sentence2], convert_to_tensor=True)
    similarity = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]
    return similarity

In [None]:
# Get BERT embeddings for "Aspiring human resources" and "Seeking human resources"
w1 = 'Aspiring human resources'
w2 = 'Seeking human resources'
bert_embedding_w1 = model.encode(w1, convert_to_tensor=True)
bert_embedding_w2 = model.encode(w2, convert_to_tensor=True)

In [None]:
# Calculate cosine similarity between w1, w2, and each job title
similarities_w1 = []
similarities_w2 = []
for i, row in df.iterrows():
    title = row['job_title']
    bert_embedding_title = model.encode(title, convert_to_tensor=True)
    similarity_w1 = cosine_similarity(bert_embedding_w1.reshape(1, -1), bert_embedding_title.reshape(1, -1))[0][0]
    similarity_w2 = cosine_similarity(bert_embedding_w2.reshape(1, -1), bert_embedding_title.reshape(1, -1))[0][0]
    similarities_w1.append(similarity_w1)
    similarities_w2.append(similarity_w2)

In [None]:
# Add cosine similarities to the DataFrame
df['cosine_similarity_bert_w1'] = similarities_w1
df['cosine_similarity_bert_w2'] = similarities_w2

In [None]:
# Print the DataFrame with job title, connection, and cosine similarities
print(df[['job_title', 'connection', 'cosine_similarity_bert_w1', 'cosine_similarity_bert_w2']])

                                             job_title  connection  \
0    2019 C.T. Bauer College of Business Graduate (...          85   
1    Native English Teacher at EPIK (English Progra...         500   
2                Aspiring Human Resources Professional          44   
3               People Development Coordinator at Ryan         500   
4      Advisory Board Member at Celal Bayar University         500   
..                                                 ...         ...   
99   Aspiring Human Resources Manager | Graduating ...         103   
100              Human Resources Generalist at Loparex         500   
101   Business Intelligence and Analytics at Travelers          49   
102                     Always set them up for Success         500   
103   Director Of Administration at Excellence Logging         500   

     cosine_similarity_bert_w1  cosine_similarity_bert_w2  
0                     0.700951                   0.642038  
1                     0.621738         

In [None]:
# Define weights for different components (adjust based on our preferences)
weight_connection = 0.12
weight_cosine_w1 = 0.22
weight_cosine_w2 = 0.22

In [None]:
# Add cosine similarities to the DataFrame
df['cosine_similarity_w1'] = similarities_w1
df['cosine_similarity_w2'] = similarities_w2

In [None]:
# Calculate fitness score
df['fitness_score'] = (
    weight_connection * df['scaled_connection'] +
    weight_cosine_w1 * df['cosine_similarity_bert_w1'] +
    weight_cosine_w2 * df['cosine_similarity_bert_w2'] +
    weight_cosine_w1 * df['cosine_similarity_w1'] +
    weight_cosine_w2 * df['cosine_similarity_w2']
)

In [None]:
# Rank candidates based on fitness score
df['rank'] = df['fitness_score'].rank(ascending=False)

In [None]:
# Sort the DataFrame based on fitness score in descending order
df_sorted = df.sort_values(by='fitness_score', ascending=False)

In [None]:
# Print the sorted DataFrame with fitness score and rank
print(df_sorted[['job_title', 'connection', 'cosine_similarity_bert_w1', 'cosine_similarity_bert_w2', 'fitness_score', 'rank']])

                                            job_title  connection  \
29              Seeking Human Resources Opportunities         390   
27              Seeking Human Resources Opportunities         390   
9   Seeking Human Resources HRIS and Generalist Po...         500   
39  Seeking Human Resources HRIS and Generalist Po...         500   
61  Seeking Human Resources HRIS and Generalist Po...         500   
..                                                ...         ...   
90       Lead Official at Western Illinois University          39   
89  Undergraduate Research Assistant at Styczynski...         155   
92  Admissions Representative at Community medical...           9   
95  Student at Indiana University Kokomo - Busines...          19   
86  Bachelor of Science in Biology from Victoria U...          40   

    cosine_similarity_bert_w1  cosine_similarity_bert_w2  fitness_score   rank  
29                   0.929482                   0.961083       0.925396    1.5  
27       

In [None]:
# Suppose candidate with index 1 is starred, update its scores

starred_candidate_index = 1
df.at[starred_candidate_index, 'connection'] = 600  # Update connection score
df.at[starred_candidate_index, 'cosine_similarity_bert_w1'] = 0.9  # Update cosine similarity score

In [None]:
# Recalculate fitness score
df.at[starred_candidate_index, 'fitness_score'] = (
    weight_connection * df.at[starred_candidate_index, 'connection'] +
    weight_cosine_w1 * df.at[starred_candidate_index, 'cosine_similarity_bert_w1'] +
    weight_cosine_w2 * df.at[starred_candidate_index, 'cosine_similarity_bert_w2']
)

In [None]:
# Recalculate rank
df['rank'] = df['fitness_score'].rank(ascending=False)

In [None]:
# Re-sort the DataFrame based on fitness score in descending order
df = df.sort_values(by='fitness_score', ascending=False)


In [None]:
df = df.reset_index(drop=True)

##Applying LambdaMART with LightGBM



In [None]:
df['rank']= 0

In [None]:
df.loc[:9, 'rank'] = 1

In [None]:
df.head()

Unnamed: 0,id,job_title,location,connection,fit,scaled_connection,processed_job_title,cosine_similarity_w1,cosine_similarity_w2,ranking_w1,...,cosine_similarity_doc2vec,scaled_connection_doc2vec,ranking_doc2vec,cosine_similarity_bert,scaled_connection_bert,ranking_bert,cosine_similarity_bert_w1,cosine_similarity_bert_w2,fitness_score,rank
0,2,Native English Teacher at EPIK (English Progra...,Kanada,600,,1.0,"[nativ, english, teacher, epik, english, progr...",0.621738,0.585436,0.900059,...,0.987769,1.0,0.990216,0.990385,1.0,0.992308,0.9,0.585436,72.326796,1
1,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.779559,"[seek, human, resourc, opportun]",0.929482,0.961083,0.94586,...,0.987769,0.779559,0.946127,0.990385,0.779559,0.948219,0.929482,0.961083,0.925396,1
2,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.779559,"[seek, human, resourc, opportun]",0.929482,0.961083,0.946539,...,0.987769,0.779559,0.946127,0.990385,0.779559,0.948219,0.929482,0.961083,0.925396,1
3,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500,,1.0,"[seek, human, resourc, hri, generalist, posit]",0.829019,0.809323,0.982802,...,0.987769,1.0,0.990216,0.990385,1.0,0.992308,0.829019,0.809323,0.84087,1
4,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500,,1.0,"[seek, human, resourc, hri, generalist, posit]",0.829019,0.809323,0.985377,...,0.987769,1.0,0.990216,0.990385,1.0,0.992308,0.829019,0.809323,0.84087,1


In [None]:
# Features and labels
X = df[['cosine_similarity_w1', 'cosine_similarity_w2', 'cosine_similarity_bert_w1', 'cosine_similarity_bert_w2']]
y = df['rank']


In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

# Convert data to LightGBM Dataset format
train_data = lgb.Dataset(X_train, label=y_train, group=[len(X_train)])
test_data = lgb.Dataset(X_test, label=y_test, group=[len(X_test)])

In [None]:
# Define parameters for LambdaMART
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'min_data_in_leaf': 1,
    'num_iterations': 100,
    'verbose': 0
}


In [None]:
# Train the LambdaMART model
ranker_model = lgb.train(params, train_data, valid_sets=[test_data])

# Save the model (optional)
ranker_model.save_model('lambdamart_model.txt')

# Predict on the test set
predictions = ranker_model.predict(X_test)

# Display the predictions
print("Predictions:", predictions)

Predictions: [-6.05184504 -6.05346867 -6.05565758 -6.0562113  -6.05064861 -6.05588588
 -6.05494591 -2.2464472   0.74724703 -6.05346867 -4.11553845 -6.05183129
 -6.05494591 -6.05588588 -6.05248139  5.24031901 -6.05373587  5.13807928
  0.74724703 -2.87643046 -6.05494591]


