In [1]:
import pandas as pd
import numpy as np


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
from tqdm import tqdm
from bs4 import BeautifulSoup

# Ignore ipykernel warning
warnings.filterwarnings('ignore', category=DeprecationWarning, module='ipykernel')

import gensim
from gensim import corpora


In [2]:
applications = pd.read_parquet('../../Data/split_4_preprocessed.parquet')#.sample(1000, random_state=42) #Comment out the sample before training.
applications.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'application_concat', 'application_full_tokenized'],
      dtype='object')

In [3]:
applications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22053 entries, 66159 to 88211
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   opportunity_id                    22053 non-null  object
 1   application_id                    22053 non-null  object
 2   opportunity_brief_description     22053 non-null  object
 3   opportunity_description           22053 non-null  object
 4   opportunity_title                 22053 non-null  object
 5   application_pass_first_step       22053 non-null  object
 6   application_step_category         22053 non-null  object
 7   application_job_titles            22053 non-null  object
 8   application_job_responsibilities  22053 non-null  object
 9   application_education             22053 non-null  object
 10  application_reported_skills       22053 non-null  object
 11  application_concat                22053 non-null  object
 12  application_fu

In [4]:
def chunk_text(tokens, chunk_size):
    for i in range(0, len(tokens), chunk_size):
        yield tokens[i:i + chunk_size]

columns_to_chunk = ['opportunity_description', 'application_concat', 'application_job_responsibilities']

# Apply chunking to columns
for column in columns_to_chunk:
    applications[f'{column}_chunked'] = applications[column].apply(lambda x: list(chunk_text(nltk.word_tokenize(x), 30)))

In [5]:
for chunk in applications.sample(5)['opportunity_description_chunked']:
    print(chunk)

[['orkin', 'purpose', 'help', 'protect', 'world', 'live', 'work', 'play', 'outside', 'sale', 'professional', 'committed', 'purpose', 'outside', 'sale', 'professional', 'orkin', 'provided', 'opportunity', 'grow', 'sale', 'career', 'control', 'financial', 'opportunity', 'sale', 'professional', 'earn', 'top', 'wage'], ['recognition', 'opportunity', 'annual', 'award', 'trip', 'offer', 'award', 'winning', 'paid', 'training', 'rewarding', 'career', 'interested', 'chance', 'expand', 'knowledge', 'grow', 'earnings', 'check', 'sale', 'professional', 'position', 'turn', 'amazing', 'career', 'successful', 'sale', 'professional', 'schedule', 'sale'], ['appointment', 'meet', 'potential', 'customer', 'home', 'explain', 'orkin', 'rsquo', 'product', 'service', 'conduct', 'thorough', 'inspection', 'interior', 'exterior', 'area', 'potential', 'customer', 'rsquo', 'property', 'serve', 'problem', 'solver', 'customer', 'utilizing', 'depth', 'training', 'provided', 'decide', 'best'], ['overall', 'pest', 'so

In [6]:
applications.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'application_concat', 'application_full_tokenized',
       'opportunity_description_chunked', 'application_concat_chunked',
       'application_job_responsibilities_chunked'],
      dtype='object')

In [7]:
"""sima feedback:* Avoid redundant calculations: 
The function get_bert_embeddings is repeatedly calculating the embeddings for the full application text.
 Instead, we can calculate it once outside the loop and reuse it for each chunk.
* Avoid unnecessary data conversions: 
In the calculate_similarity function, the cosine_similarity is being calculated for single elements in a loop.
 We can instead calculate it once for all chunks at once, which will be more efficient.
* Use list comprehensions: 
List comprehensions are generally more readable and Pythonic than explicitly creating lists using loops.
* Use vectorized operations: 
Whenever possible, use vectorized operations with libraries like NumPy to speed up computations.
"""
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs[0][0].mean(0).detach().numpy()

# For calculating the most similar chunk of a job_description to the applicants' application
def calculate_similarity_to_chunks(text_list, full_app_text):
    full_app_embedding = get_bert_embeddings(full_app_text)
    chunk_texts = ["".join(chunk) for chunk in text_list]
    chunk_embeddings = np.array([get_bert_embeddings(chunk_text) for chunk_text in chunk_texts])
    similarities = cosine_similarity([full_app_embedding], chunk_embeddings)[0]
    
    df = pd.DataFrame({
        "chunk_number": range(len(text_list)),
        "chunk_content": chunk_texts,
        "full_app_content": [full_app_text] * len(text_list),
        "similarity_score": similarities
    })
    df['similarity_rank'] = df['similarity_score'].rank(ascending=False).astype(int)
    return df.sort_values(by='similarity_score', ascending=False)

"""In this optimized version, we calculate the full_app_embedding only once before the loop in the calculate_similarity function. 
Additionally, we use list comprehensions to generate the chunk_texts list, 
and we leverage NumPy's vectorized operations to calculate all chunk embeddings at once."""

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


"In this optimized version, we calculate the full_app_embedding only once before the loop in the calculate_similarity function. \nAdditionally, we use list comprehensions to generate the chunk_texts list, \nand we leverage NumPy's vectorized operations to calculate all chunk embeddings at once."

In [8]:
# string = """- Generated an additional $1.5M in revenue by identifying and closing new opportunities with existing clients
# - Increased sales by 20% by developing and implementing a new lead qualification tool
# - Improved customer satisfaction by 30% by implementing a new customer feedback loop
# - Reduced customer churn by 15% by implementing a new customer retention program
# - Increased sales by 10% by developing and implementing a new lead qualification tool
# - Improved customer satisfaction by 20% by implementing a new customer feedback loop
# - Reduced customer churn by 10% by implementing a new customer retention program"""

# string2 = """- Increased student engagement by 20% by implementing a new gamification program
# - Improved student test scores by 10% by implementing a new study plan
# - Increased student engagement by 10% by implementing a new gamification program
# - Improved student test scores by 5% by implementing a new study plan"""

# string3 = """- Increased user engagement by 20% by implementing a new gamification program
# - Improved user retention by 10% by implementing a new study plan
# - Increased user engagement by 10% by implementing a new gamification program
# - Improved user retention by 5% by implementing a new study plan"""
# string4 = "I flipped burgers for 2 years."

# strings_to_test = [string, string2, string3, string4]

# job_description = """We are looking for a software engineer to join our growing team.
#                     You will work closely with our development team to develop and implement new features and tools.
#                     You should be comfortable working in a fast-paced environment and taking on new tasks.
#                     You should also have excellent communication skills and be able to work independently or as part of a team.
#                     Responsibilities:
#                     - Develop and implement new features and tools
#                     - Work closely with our development team to ensure that all features are implemented correctly
#                     - Collaborate with other teams to ensure that all features are implemented correctly
#                     - Work closely with our finance team to ensure that all features are implemented correctly"""


# calculate_similarity_to_chunks(strings_to_test, job_description)

In [9]:
def calculate_similarity_using_dataframe(df, opportunity_id="random"):
    """ This function assumes the opportunity_id is the column we want to group by,
    and we are ranking application_job_responsibilities based on the opportunity_description.
    If the opportunity_id is not specified, we select a random opportunity_id from the DataFrame.

    This function assumes that the columns 'opportunity_description'
    and 'application_job_responsibilities'
    contain strings and exist in the DataFrame.

    The function will return a dataframe
    """
    # If 'random' is passed, select a random opportunity_id, otherwise filter by the specified opportunity_id
    if opportunity_id == "random":
        opportunity_id = df['opportunity_id'].sample(1).values[0]
    filtered_df = df[df['opportunity_id'] == opportunity_id]
    
    # Aggregate application_job_responsibilities into a list for the selected opportunity
    applications_list = filtered_df['application_job_responsibilities'].tolist()
    app_id_list = filtered_df['application_id'].tolist()
    
    # Extract the opportunity_description (assuming it's the same for all rows with the same opportunity_id)
    opportunity_description = filtered_df['opportunity_description'].iloc[0]
    opportunity_id = filtered_df['opportunity_id'].iloc[0]

    # Create the DataFrame with the similarity scores
    similarity_df = calculate_similarity_to_chunks(applications_list, opportunity_description)
    similarity_df['application_id'] = app_id_list
    similarity_df['opportunity_id'] = opportunity_id
    similarity_df.drop(columns = ['chunk_number'], inplace=True)
    similarity_df.columns = ['application_job_responsibilities',
                             'opportunity_description', 'similarity_score',
                             'similarity_rank', 'application_id', 'opportunity_id']
    return similarity_df

In [10]:
from tqdm import tqdm

In [11]:
all_similarity_dfs = []
for opp_id in tqdm(applications['opportunity_id'].unique()):
    similarity_df = calculate_similarity_using_dataframe(applications, opp_id)
    all_similarity_dfs.append(similarity_df)
final_similarity_df = pd.concat(all_similarity_dfs)

100%|██████████| 2733/2733 [1:29:44<00:00,  1.97s/it]  


In [12]:
final_similarity_df

Unnamed: 0,application_job_responsibilities,opportunity_description,similarity_score,similarity_rank,application_id,opportunity_id
2,transport patient desired destination assist d...,orkin purpose help protect world live work pla...,0.928475,1,gICe00fHwUKHC5ROv8WM4g==,x3PXnFA1GkCs0Cdz7q83zA==
11,post mod certified shift work experience forkl...,orkin purpose help protect world live work pla...,0.922358,2,+d+reLhP8EuYXLL1zIRNSA==,x3PXnFA1GkCs0Cdz7q83zA==
10,msg 45 hour week provided direct guidance lead...,orkin purpose help protect world live work pla...,0.920729,3,kfSob8AUjEKohQ08KFkytA==,x3PXnFA1GkCs0Cdz7q83zA==
0,30907 interviewed job candidate made staffing ...,orkin purpose help protect world live work pla...,0.919026,4,X+mUH4LbjUuqPrSylYtvBg==,x3PXnFA1GkCs0Cdz7q83zA==
1,served patient navigator hospital setting prim...,orkin purpose help protect world live work pla...,0.900760,5,UbeHp6bYf0++IDiC/qPJ7A==,x3PXnFA1GkCs0Cdz7q83zA==
...,...,...,...,...,...,...
0,correspond marketing team delegate project mee...,14 50 per hour orkin purpose help protect worl...,0.870251,1,o5eo91NUGkyr6AV8qPwm0Q==,+cPYnaJfp0mVjIH1QN1txg==
0,achievement maintaining 90 monthly schedule ad...,orkin purpose help protect world live work pla...,0.957768,1,YFcJAr+rRk2+VCxycZPrYQ==,/6DQU/YzVkeclZLE+XOddA==
1,,orkin purpose help protect world live work pla...,0.151570,2,jbLivGMoWkyqwUK+CgAQlA==,/6DQU/YzVkeclZLE+XOddA==
0,worked summer 2015 2016 2017 2019 contacted pl...,orkin purpose help protect world live work pla...,0.808699,1,A5P2w+E9f0yS/kz9/IBrlg==,6IBfh9iGpUyh75qUpkXX/w==


In [13]:
def merge_feature(dataframe):
    cols_to_merge = ['application_id', 'similarity_score', 'similarity_rank']
    dataframe = dataframe.merge(final_similarity_df[cols_to_merge], on='application_id', how='left')
    return dataframe

In [14]:
applications = merge_feature(applications)

In [15]:
applications

Unnamed: 0,opportunity_id,application_id,opportunity_brief_description,opportunity_description,opportunity_title,application_pass_first_step,application_step_category,application_job_titles,application_job_responsibilities,application_education,application_reported_skills,application_concat,application_full_tokenized,opportunity_description_chunked,application_concat_chunked,application_job_responsibilities_chunked,similarity_score,similarity_rank
0,x3PXnFA1GkCs0Cdz7q83zA==,gICe00fHwUKHC5ROv8WM4g==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,True,1,general manager assistant manager general mana...,30907 interviewed job candidate made staffing ...,High School Diploma++BPA,Sales++Cash++Customer service++Marketing++Rece...,general manager assistant manager general mana...,general manager assistant manager general mana...,"[[orkin, purpose, help, protect, world, live, ...","[[general, manager, assistant, manager, genera...","[[30907, interviewed, job, candidate, made, st...",0.928475,1
1,uEY0wW08R0WpiBkds/p4fg==,aaQVNXk5OEqPL7kZJJW6iw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,package handler,waa great job got home late evwnings pay good ...,,,package handler waa great job got home late ev...,package handler waa great job got home late ev...,"[[orkin, purpose, help, protect, world, live, ...","[[package, handler, waa, great, job, got, home...","[[waa, great, job, got, home, late, evwnings, ...",0.920917,1
2,gbzxt0dALU6x83ACFTSGEA==,mggx+MQSvUG6prEI0WkoAA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,logistics supervisor logistics warehouse super...,supervised operation multi shift food grade dc...,Bachelor of Science++Master of Arts,Metrics++Inventory++Training++Operations++Hr++...,logistics supervisor logistics warehouse super...,logistics supervisor logistics warehouse super...,"[[orkin, purpose, help, protect, world, live, ...","[[logistics, supervisor, logistics, warehouse,...","[[supervised, operation, multi, shift, food, g...",0.936904,1
3,Etqs/xDAX0SXrfmQA6+KBQ==,ewtbense/kyY62nuXSljHw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,service tech,diagonose repair vehicle suspension work aswel...,,,service tech diagonose repair vehicle suspensi...,service tech diagonose repair vehicle suspensi...,"[[orkin, purpose, help, protect, world, live, ...","[[service, tech, diagonose, repair, vehicle, s...","[[diagonose, repair, vehicle, suspension, work...",0.921125,1
4,lOZhR1k1Kk+OVz6n8r9vwA==,OuoS29IERk+mJTjddJNYSQ==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,security supervisor sharp shooter navigator kr...,patrolling protecting people property informat...,G.E.D,Cpr++Cpr certified++Documenting++Radio communi...,security supervisor sharp shooter navigator kr...,security supervisor sharp shooter navigator kr...,"[[orkin, purpose, help, protect, world, live, ...","[[security, supervisor, sharp, shooter, naviga...","[[patrolling, protecting, people, property, in...",0.938756,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22048,5hdHoZVOPUWJqeggPv5T1Q==,Yi4u0MpbW0CyhqdpFwQvmA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,delivery driver,delivered sold furniture,,,delivery driver delivered sold furniture,delivery driver delivered sold furniture,"[[orkin, purpose, help, protect, world, live, ...","[[delivery, driver, delivered, sold, furniture]]","[[delivered, sold, furniture]]",0.151831,14
22049,ULIT1Ap/WEeVR6sinqxo9w==,ENwvF9XYsU2yuoI0paNsfg==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,cook,cooking food keeping hot box stocked cleaning ...,Diploma,Driving,cook cooking food keeping hot box stocked clea...,cook cooking food keeping hot box stocked clea...,"[[orkin, purpose, help, protect, world, live, ...","[[cook, cooking, food, keeping, hot, box, stoc...","[[cooking, food, keeping, hot, box, stocked, c...",0.151831,13
22050,oAZjGHneVkKWKsiRaGh8Ag==,+9UVMbL9kUa6VstH0Bzgnw==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,auditor,,Diploma,Confined space training,auditor diploma confined space training,auditor diploma confined space training,"[[16, 00, per, hour, orkin, purpose, help, pro...","[[auditor, diploma, confined, space, training]]",[],0.629281,9
22051,oAZjGHneVkKWKsiRaGh8Ag==,7RyhIseedUGAVV4lfsKGow==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,machine operator thrower customer service repr...,simple machine operation back truck throwing t...,Diploma,Customer service++Customer service representat...,machine operator thrower customer service repr...,machine operator thrower customer service repr...,"[[16, 00, per, hour, orkin, purpose, help, pro...","[[machine, operator, thrower, customer, servic...","[[simple, machine, operation, back, truck, thr...",0.153763,10


In [16]:
applications.similarity_rank.value_counts()

similarity_rank
1      2740
2      2234
3      1882
4      1597
5      1361
       ... 
126       1
125       1
124       1
123       1
110       1
Name: count, Length: 219, dtype: int64

Above we see that we've been able to find the chunk that is most similar to the full opportunity. This helps us understand which part is the most important to compare to our applicantsm

## Creating a feature in which the similarity between the application and the opportunity is calculated using BERT embeddings
- The similarity is calculated using the cosine similarity between the embeddings of the application and the opportunity
- The applicants are sorted and then scored by their semantic similarity to the opportunity

In [17]:
applications.to_parquet("split_4_with_similarity_rankings.parquet")