In [None]:
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
df = pd.read_csv("/content/potential-talents - Aspiring human resources - seeking human resources.csv")

In [None]:
# Function to convert "500+" to "500" and leave the rest unchanged
def convert_connection(connection):
    if '+' in str(connection):
        return int(str(connection).replace('+', ''))
    else:
        return int(connection)

In [None]:
# Apply the conversion to the "connection" column
df['connection'] = df['connection'].apply(convert_connection)

In [None]:
# Scale the "connection" column to the range [0, 1]
scaler = MinMaxScaler()
df['scaled_connection'] = scaler.fit_transform(df[['connection']])


In [None]:
# Function for text preprocessing
def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    porter_stemmer = PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words


In [None]:
# Apply preprocessing to each job title
df['processed_job_title'] = df['job_title'].apply(preprocess_text)

In [None]:
# Function to tokenize and tag documents
def tag_documents(df):
    tagged_data = []
    for i, row in df.iterrows():
        tagged_data.append(TaggedDocument(words=row['processed_job_title'], tags=[str(row.name)]))
    return tagged_data

In [None]:
# Tokenize and tag documents
tagged_data = tag_documents(df)
print(tagged_data)


[TaggedDocument(words=['2019', 'bauer', 'colleg', 'busi', 'graduat', 'magna', 'cum', 'laud', 'aspir', 'human', 'resourc', 'profession'], tags=['0']), TaggedDocument(words=['nativ', 'english', 'teacher', 'epik', 'english', 'program', 'korea'], tags=['1']), TaggedDocument(words=['aspir', 'human', 'resourc', 'profession'], tags=['2']), TaggedDocument(words=['peopl', 'develop', 'coordin', 'ryan'], tags=['3']), TaggedDocument(words=['advisori', 'board', 'member', 'celal', 'bayar', 'univers'], tags=['4']), TaggedDocument(words=['aspir', 'human', 'resourc', 'specialist'], tags=['5']), TaggedDocument(words=['student', 'humber', 'colleg', 'aspir', 'human', 'resourc', 'generalist'], tags=['6']), TaggedDocument(words=['hr', 'senior', 'specialist'], tags=['7']), TaggedDocument(words=['student', 'humber', 'colleg', 'aspir', 'human', 'resourc', 'generalist'], tags=['8']), TaggedDocument(words=['seek', 'human', 'resourc', 'hri', 'generalist', 'posit'], tags=['9']), TaggedDocument(words=['student', 'c

In [None]:
# Train a Doc2Vec model
model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)


In [None]:
# Get the embedding for "Aspiring human resources" (w1)
w1 = 'Aspiring human resources'
vector_w1 = model.infer_vector(preprocess_text(w1))

In [None]:
# Add another word "seeking human resources" (w2)
w2 = 'Seeking human resources'
vector_w2 = model.infer_vector(preprocess_text(w2))

In [None]:
# Calculate cosine similarity between w1 and w2
similarity_w1_w2 = cosine_similarity([vector_w1], [vector_w2])[0][0]

In [None]:
# Print the cosine similarity between w1 and w2
print(f'Cosine Similarity between "{w1}" and "{w2}": {similarity_w1_w2}')

Cosine Similarity between "Aspiring human resources" and "Seeking human resources": 0.9881563186645508


In [None]:
 # Calculate cosine similarity between w1, w2, and each job title

similarities_w1 = []
similarities_w2 = []
for i, row in df.iterrows():
    title = row['job_title']
    vector_title = model.infer_vector(preprocess_text(title))
    similarity_w1 = cosine_similarity([vector_w1], [vector_title])[0][0]
    similarity_w2 = cosine_similarity([vector_w2], [vector_title])[0][0]
    similarities_w1.append(similarity_w1)
    similarities_w2.append(similarity_w2)

In [None]:
# Add cosine similarities to the DataFrame
df['cosine_similarity_w1'] = similarities_w1
df['cosine_similarity_w2'] = similarities_w2

In [None]:
abc

NameError: name 'abc' is not defined

In [None]:
# Weighted sum of cosine_similarity and scaled_connection
weight_cosine = 0.8
weight_connection = 0.2
df['ranking_w1'] = weight_cosine * df['cosine_similarity_w1'] + weight_connection * df['scaled_connection']
df['ranking_w2'] = weight_cosine * df['cosine_similarity_w2'] + weight_connection * df['scaled_connection']

In [None]:
# Sort the DataFrame based on the "ranking_w1" column in descending order
df_sorted = df.sort_values(by='ranking_w1', ascending=False)

In [None]:
# Print the sorted DataFrame
print(df_sorted[['job_title', 'scaled_connection', 'cosine_similarity_w1', 'cosine_similarity_w2', 'ranking_w1', 'ranking_w2']])

                                             job_title  scaled_connection  \
74   Nortia Staffing is seeking Human Resources, Pa...           1.000000   
66   Human Resources, Staffing and Recruiting Profe...           1.000000   
103   Director Of Administration at Excellence Logging           1.000000   
77              Human Resources Generalist at Schwan's           1.000000   
102                     Always set them up for Success           1.000000   
..                                                 ...                ...   
0    2019 C.T. Bauer College of Business Graduate (...           0.168337   
30   2019 C.T. Bauer College of Business Graduate (...           0.168337   
14   2019 C.T. Bauer College of Business Graduate (...           0.168337   
18   2019 C.T. Bauer College of Business Graduate (...           0.168337   
56   2019 C.T. Bauer College of Business Graduate (...           0.168337   

     cosine_similarity_w1  cosine_similarity_w2  ranking_w1  ranking_w2  
7

### Adding Bert Embeddings

In [None]:
pip install sentence-transformers




In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
import re

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [None]:
# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
df = pd.read_csv("/content/potential-talents - Aspiring human resources - seeking human resources.csv")

In [None]:
# BERT model for sentence embeddings
model = SentenceTransformer('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Function for calculating cosine similarity between two sentences
def calculate_cosine_similarity(sentence1, sentence2):
    embeddings = model.encode([sentence1, sentence2], convert_to_tensor=True)
    similarity = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]
    return similarity

In [None]:
# Get BERT embeddings for "Aspiring human resources" and "Seeking human resources"
w1 = 'Aspiring human resources'
w2 = 'Seeking human resources'
bert_embedding_w1 = model.encode(w1, convert_to_tensor=True)
bert_embedding_w2 = model.encode(w2, convert_to_tensor=True)

In [None]:
# Calculate cosine similarity between w1, w2, and each job title
similarities_w1 = []
similarities_w2 = []
for i, row in df.iterrows():
    title = row['job_title']
    bert_embedding_title = model.encode(title, convert_to_tensor=True)
    similarity_w1 = cosine_similarity(bert_embedding_w1.reshape(1, -1), bert_embedding_title.reshape(1, -1))[0][0]
    similarity_w2 = cosine_similarity(bert_embedding_w2.reshape(1, -1), bert_embedding_title.reshape(1, -1))[0][0]
    similarities_w1.append(similarity_w1)
    similarities_w2.append(similarity_w2)

In [None]:
# Add cosine similarities to the DataFrame
df['cosine_similarity_bert_w1'] = similarities_w1
df['cosine_similarity_bert_w2'] = similarities_w2

In [None]:
# Print the DataFrame with job title, connection, and cosine similarities
print(df[['job_title', 'connection', 'cosine_similarity_bert_w1', 'cosine_similarity_bert_w2']])

                                             job_title connection  \
0    2019 C.T. Bauer College of Business Graduate (...         85   
1    Native English Teacher at EPIK (English Progra...      500+    
2                Aspiring Human Resources Professional         44   
3               People Development Coordinator at Ryan      500+    
4      Advisory Board Member at Celal Bayar University      500+    
..                                                 ...        ...   
99   Aspiring Human Resources Manager | Graduating ...        103   
100              Human Resources Generalist at Loparex      500+    
101   Business Intelligence and Analytics at Travelers         49   
102                     Always set them up for Success      500+    
103   Director Of Administration at Excellence Logging      500+    

     cosine_similarity_bert_w1  cosine_similarity_bert_w2  
0                     0.700951                   0.642038  
1                     0.621738                   0.

In [None]:
# Extract numeric values from 'connection' column using regular expression
df['connection'] = df['connection'].apply(lambda x: float(re.search(r'\d+', x).group(0)))


In [None]:
# Define weights for different components (adjust based on our preferences)
weight_connection = 0.3
weight_cosine_w1 = 0.5
weight_cosine_w2 = 0.2

In [None]:
# Calculate fitness score
df['fitness_score'] = (
    weight_connection * df['connection'] +
    weight_cosine_w1 * df['cosine_similarity_bert_w1'] +
    weight_cosine_w2 * df['cosine_similarity_bert_w2']
)

In [None]:
# Rank candidates based on fitness score
df['rank'] = df['fitness_score'].rank(ascending=False)

In [None]:
# Sort the DataFrame based on fitness score in descending order
df_sorted = df.sort_values(by='fitness_score', ascending=False)

In [None]:
# Print the sorted DataFrame with fitness score and rank
print(df_sorted[['job_title', 'connection', 'cosine_similarity_bert_w1', 'cosine_similarity_bert_w2', 'fitness_score', 'rank']])

                              job_title  connection  \
25                 HR Senior Specialist       500.0   
60                 HR Senior Specialist       500.0   
37                 HR Senior Specialist       500.0   
50                 HR Senior Specialist       500.0   
7                  HR Senior Specialist       500.0   
..                                  ...         ...   
48  Aspiring Human Resources Specialist         1.0   
35  Aspiring Human Resources Specialist         1.0   
59  Aspiring Human Resources Specialist         1.0   
5   Aspiring Human Resources Specialist         1.0   
23  Aspiring Human Resources Specialist         1.0   

    cosine_similarity_bert_w1  cosine_similarity_bert_w2  fitness_score   rank  
25                   0.843815                   0.783556     150.578619    3.0  
60                   0.843815                   0.783556     150.578619    3.0  
37                   0.843815                   0.783556     150.578619    3.0  
50             

In [None]:
# Suppose candidate with index 1 is starred, update its scores

starred_candidate_index = 1
df.at[starred_candidate_index, 'connection'] = 600  # Update connection score
df.at[starred_candidate_index, 'cosine_similarity_bert_w1'] = 0.9  # Update cosine similarity score

In [None]:
# Recalculate fitness score
df.at[starred_candidate_index, 'fitness_score'] = (
    weight_connection * df.at[starred_candidate_index, 'connection'] +
    weight_cosine_w1 * df.at[starred_candidate_index, 'cosine_similarity_bert_w1'] +
    weight_cosine_w2 * df.at[starred_candidate_index, 'cosine_similarity_bert_w2']
)

In [None]:
# Recalculate rank
df['rank'] = df['fitness_score'].rank(ascending=False)

In [None]:
# Re-sort the DataFrame based on fitness score in descending order
df_sorted = df.sort_values(by='fitness_score', ascending=False)


In [None]:
# Print the updated DataFrame with new rankings
print("\nUpdated DataFrame:")
print(df_sorted[['job_title', 'connection', 'fitness_score', 'rank']])


Updated DataFrame:
                                            job_title  connection  \
1   Native English Teacher at EPIK (English Progra...       600.0   
60                               HR Senior Specialist       500.0   
37                               HR Senior Specialist       500.0   
50                               HR Senior Specialist       500.0   
7                                HR Senior Specialist       500.0   
..                                                ...         ...   
48                Aspiring Human Resources Specialist         1.0   
35                Aspiring Human Resources Specialist         1.0   
59                Aspiring Human Resources Specialist         1.0   
5                 Aspiring Human Resources Specialist         1.0   
23                Aspiring Human Resources Specialist         1.0   

    fitness_score   rank  
1      180.567087    1.0  
60     150.578619    4.0  
37     150.578619    4.0  
50     150.578619    4.0  
7      150.57861

##BONUS QUESTIONS

###Ranking gets better with each starring action.

In [None]:
def calculate_fitness_score(df, weight_connection, weight_cosine_w1, weight_cosine_w2):
    # Calculate fitness score
    df['fitness_score'] = (
        weight_connection * df['connection'] +
        weight_cosine_w1 * df['cosine_similarity_bert_w1'] +
        weight_cosine_w2 * df['cosine_similarity_bert_w2']
    )
    return df

In [None]:
def rank_candidates(df):
    # Rank candidates based on fitness score
    df['rank'] = df['fitness_score'].rank(ascending=False)
    return df

In [None]:
def re_rank_starred_candidate(df, starred_candidate_index, updated_weights):
    # Update the connection score and cosine similarity score for the starred candidate
    df.at[starred_candidate_index, 'connection'] = 600  # Update connection score
    df.at[starred_candidate_index, 'cosine_similarity_bert_w1'] = 0.9  # Update cosine similarity score

    # Recalculate fitness score with updated weights
    df = calculate_fitness_score(df, *updated_weights)

    # Recalculate rank
    df = rank_candidates(df)

    return df

In [None]:
# Define weights for different components (adjust based on your preferences)
initial_weights = (0.3, 0.5, 0.2)
updated_weights = (0.3, 0.4, 0.3)  # Adjusted weights after starring action

In [None]:
# Initial ranking
df = calculate_fitness_score(df, *initial_weights)
df = rank_candidates(df)
print("Initial Ranking:")
print(df[['job_title', 'connection', 'fitness_score', 'rank']])

Initial Ranking:
                                             job_title  connection  \
0    2019 C.T. Bauer College of Business Graduate (...        85.0   
1    Native English Teacher at EPIK (English Progra...       600.0   
2                Aspiring Human Resources Professional        44.0   
3               People Development Coordinator at Ryan       500.0   
4      Advisory Board Member at Celal Bayar University       500.0   
..                                                 ...         ...   
99   Aspiring Human Resources Manager | Graduating ...       103.0   
100              Human Resources Generalist at Loparex       500.0   
101   Business Intelligence and Analytics at Travelers        49.0   
102                     Always set them up for Success       500.0   
103   Director Of Administration at Excellence Logging       500.0   

     fitness_score  rank  
0        25.978883  59.0  
1       180.567087   1.0  
2        13.846711  81.5  
3       150.574278  13.5  
4      

In [None]:
# Suppose candidate with index 1 is starred
starred_candidate_index = 1
df = re_rank_starred_candidate(df, starred_candidate_index, updated_weights)
print("\nAfter Starring Action:")
print(df[['job_title', 'connection', 'fitness_score', 'rank']])


After Starring Action:
                                             job_title  connection  \
0    2019 C.T. Bauer College of Business Graduate (...        85.0   
1    Native English Teacher at EPIK (English Progra...       600.0   
2                Aspiring Human Resources Professional        44.0   
3               People Development Coordinator at Ryan       500.0   
4      Advisory Board Member at Celal Bayar University       500.0   
..                                                 ...         ...   
99   Aspiring Human Resources Manager | Graduating ...       103.0   
100              Human Resources Generalist at Loparex       500.0   
101   Business Intelligence and Analytics at Travelers        49.0   
102                     Always set them up for Success       500.0   
103   Director Of Administration at Excellence Logging       500.0   

     fitness_score  rank  
0        25.972992  59.0  
1       180.535631   1.0  
2        13.838425  81.5  
3       150.569931  13.5  


In [None]:
df_sorted = df.sort_values(by='rank', ascending=True)
print(df_sorted[['job_title', 'connection', 'fitness_score', 'rank']])

                                            job_title  connection  \
1   Native English Teacher at EPIK (English Progra...       600.0   
9   Seeking Human Resources HRIS and Generalist Po...       500.0   
39  Seeking Human Resources HRIS and Generalist Po...       500.0   
52  Seeking Human Resources HRIS and Generalist Po...       500.0   
61  Seeking Human Resources HRIS and Generalist Po...       500.0   
..                                                ...         ...   
35                Aspiring Human Resources Specialist         1.0   
48                Aspiring Human Resources Specialist         1.0   
59                Aspiring Human Resources Specialist         1.0   
23                Aspiring Human Resources Specialist         1.0   
5                 Aspiring Human Resources Specialist         1.0   

    fitness_score   rank  
1      180.535631    1.0  
9      150.574404    3.5  
39     150.574404    3.5  
52     150.574404    3.5  
61     150.574404    3.5  
..       

## Filter out candidates which in the first place should not be in this list

In [None]:
# Exclusion criteria: Exclude candidates with 'Excluded' in their job title
exclusion_criteria = lambda x: x['job_title'].str.contains('Excluded', case=False)

In [None]:
def filter_candidates(df, exclusion_criteria):
    # Apply exclusion criteria to filter out candidates
    filtered_df = df[~exclusion_criteria(df)]
    return filtered_df

In [None]:
# Filter out candidates based on exclusion criteria
filtered_df = filter_candidates(df, exclusion_criteria)
print("\nFiltered Ranking (Excluding Candidates):")
print(filtered_df[['job_title', 'connection', 'fitness_score', 'rank']])


Filtered Ranking (Excluding Candidates):
                                             job_title  connection  \
0    2019 C.T. Bauer College of Business Graduate (...        85.0   
1    Native English Teacher at EPIK (English Progra...       600.0   
2                Aspiring Human Resources Professional        44.0   
3               People Development Coordinator at Ryan       500.0   
4      Advisory Board Member at Celal Bayar University       500.0   
..                                                 ...         ...   
99   Aspiring Human Resources Manager | Graduating ...       103.0   
100              Human Resources Generalist at Loparex       500.0   
101   Business Intelligence and Analytics at Travelers        49.0   
102                     Always set them up for Success       500.0   
103   Director Of Administration at Excellence Logging       500.0   

     fitness_score  rank  
0        25.972992  59.0  
1       180.535631   1.0  
2        13.838425  81.5  
3       1

## To determine a cut-off point that would work for other roles without losing high potential candidates

In [None]:
def find_cutoff(df, percentile_cutoff):
    # Find the cutoff point based on the given percentile
    cutoff_score = df['fitness_score'].quantile(percentile_cutoff)
    return cutoff_score

In [None]:
def filter_by_cutoff(df, cutoff_score):
    # Filter candidates based on the cutoff score
    filtered_df = df[df['fitness_score'] >= cutoff_score]
    return filtered_df

In [None]:
# Find the cutoff point based on a certain percentile (adjust as needed)
percentile_cutoff = 0.75  # Example: Keep top 25% of candidates
cutoff_score = find_cutoff(filtered_df, percentile_cutoff)
print(f"\nCutoff Score: {cutoff_score}")


Cutoff Score: 150.51058674752713


In [None]:
# Filter candidates based on the cutoff score
final_candidates = filter_by_cutoff(filtered_df, cutoff_score)
print("\nFinal Candidates:")
print(final_candidates[['job_title', 'connection', 'fitness_score', 'rank']])


Final Candidates:
                                             job_title  connection  \
1    Native English Teacher at EPIK (English Progra...       600.0   
3               People Development Coordinator at Ryan       500.0   
7                                 HR Senior Specialist       500.0   
9    Seeking Human Resources HRIS and Generalist Po...       500.0   
12   Human Resources Coordinator at InterContinenta...       500.0   
17              People Development Coordinator at Ryan       500.0   
21              People Development Coordinator at Ryan       500.0   
25                                HR Senior Specialist       500.0   
26   Aspiring Human Resources Management student se...       500.0   
28   Aspiring Human Resources Management student se...       500.0   
33              People Development Coordinator at Ryan       500.0   
37                                HR Senior Specialist       500.0   
39   Seeking Human Resources HRIS and Generalist Po...       500.0   
4