In [5]:
import pandas as pd
import numpy as np


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
from tqdm import tqdm
from bs4 import BeautifulSoup

# Ignore ipykernel warning
warnings.filterwarnings('ignore', category=DeprecationWarning, module='ipykernel')

import gensim
from gensim import corpora


In [6]:
applications = pd.read_parquet('../../Data/split_4_preprocessed.parquet').sample(1000, random_state=42) #Comment out the sample before training.
applications.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'application_concat', 'application_full_tokenized'],
      dtype='object')

In [7]:
def chunk_text(tokens, chunk_size):
    for i in range(0, len(tokens), chunk_size):
        yield tokens[i:i + chunk_size]

columns_to_chunk = ['opportunity_description', 'application_concat', 'application_job_responsibilities']

# Apply chunking to columns
for column in columns_to_chunk:
    applications[f'{column}_chunked'] = applications[column].apply(lambda x: list(chunk_text(nltk.word_tokenize(x), 30)))

In [11]:
for chunk in applications['opportunity_description_chunked']:
    print(chunk)

[['earnings', 'opportunity', '50', '000', '55', '000', 'orkin', 'purpose', 'help', 'protect', 'world', 'live', 'work', 'play', 'service', 'technician', 'committed', 'purpose', 'orkin', 'pest', 'management', 'industry', 'leader', 'offer', 'extensive', 'training', 'service', 'technician', 'deliver', 'valuable'], ['service', 'customer', 'every', 'day', 'interested', 'chance', 'expand', 'knowledge', 'grow', 'career', 'well', 'financial', 'opportunity', 'check', 'position', 'turn', 'much', 'successful', 'candidate', 'complete', 'award', 'winning', 'company', 'paid', 'training', 'learn', 'skill', 'required', 'responsible', 'daily'], ['operation', 'pest', 'control', 'service', 'route', 'follow', 'route', 'schedule', 'within', 'assigned', 'territory', 'ensuring', 'timely', 'safe', 'arrival', 'customer', 'operate', 'company', 'vehicle', 'safely', 'ensure', 'cleanliness', 'vehicle', 'equipment', 'communicate', 'customer', 'ensure', 'satisfaction', 'discus', 'additional'], ['service', 'meet', 'cu

# This needs re-adjustments to make sure we can build the feature

# //// TEST CORNER m


In [32]:
applications.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'application_concat', 'application_full_tokenized',
       'opportunity_description_chunked', 'application_concat_chunked',
       'application_job_responsibilities_chunked'],
      dtype='object')

In [35]:
"""sima feedback:* Avoid redundant calculations: 
The function get_bert_embeddings is repeatedly calculating the embeddings for the full application text.
 Instead, we can calculate it once outside the loop and reuse it for each chunk.
* Avoid unnecessary data conversions: 
In the calculate_similarity function, the cosine_similarity is being calculated for single elements in a loop.
 We can instead calculate it once for all chunks at once, which will be more efficient.
* Use list comprehensions: 
List comprehensions are generally more readable and Pythonic than explicitly creating lists using loops.
* Use vectorized operations: 
Whenever possible, use vectorized operations with libraries like NumPy to speed up computations.
"""
import time
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import timeit
from sklearn.metrics.pairwise import cosine_similarity


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs[0][0].mean(0).detach().numpy()

# For calculating the most similar chunk of a job_description to the applicants' application
def calculate_similarity_to_chunks(text_list, full_app_text):
    full_app_embedding = get_bert_embeddings(full_app_text)
    chunk_texts = [" ".join(chunk) for chunk in text_list]
    chunk_embeddings = np.array([get_bert_embeddings(chunk_text) for chunk_text in chunk_texts])
    similarities = cosine_similarity([full_app_embedding], chunk_embeddings)[0]
    
    df = pd.DataFrame({
        "chunk_number": range(len(text_list)),
        "chunk_content": chunk_texts,
        "full_app_content": [full_app_text] * len(text_list),
        "similarity_score": similarities
    })
    return df.sort_values(by='similarity_score', ascending=False)

"""In this optimized version, we calculate the full_app_embedding only once before the loop in the calculate_similarity function. 
Additionally, we use list comprehensions to generate the chunk_texts list, 
and we leverage NumPy's vectorized operations to calculate all chunk embeddings at once."""

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


"In this optimized version, we calculate the full_app_embedding only once before the loop in the calculate_similarity function. \nAdditionally, we use list comprehensions to generate the chunk_texts list, \nand we leverage NumPy's vectorized operations to calculate all chunk embeddings at once."

In [36]:
calculate_similarity_to_chunks(??????)

KeyError: 0

In [30]:
# Use the function
df_similarity = calculate_similarity_to_chunks(applications['opportunity_description_chunked'].iloc[0], applications['application_job_responsibilities'].iloc[0])
df_similarity

Unnamed: 0,chunk_number,chunk_content,full_app_content,similarity_score
10,10,restaurant hotel apartment uniform route driver,,0.35888
5,5,insurance 401 k plan company match employee st...,,0.249359
0,0,earnings opportunity 50 000 55 000 orkin purpo...,,0.244999
6,6,1901 orkin global residential business service...,,0.227378
4,4,hand held device provided speed accuracy serve...,,0.216973
7,7,line business orkin financially stable growing...,,0.216876
1,1,service customer every day interested chance e...,,0.212956
8,8,must meet physical job requirement safely perf...,,0.211231
2,2,operation pest control service route follow ro...,,0.20886
3,3,service meet customer need complete pest manag...,,0.200172


In [25]:
df_similarity

Unnamed: 0,chunk_number,chunk_content,full_app_content,similarity_score
256,256,,,1.000000
1134,1134,,,1.000000
307,307,,,1.000000
699,699,,,1.000000
1262,1262,,,1.000000
...,...,...,...,...
2164,2164,n,,0.543267
2166,2166,n,,0.543267
864,864,n,,0.543267
1497,1497,n,,0.543267


Above we see that we've been able to find the chunk that is most similar to the full opportunity. This helps us understand which part is the most important to compare to our applicantsm

Looking into the data

In [None]:
applications.columns

In [None]:
applications.loc[applications['job_id'] == 'dmlXNFI0MEOlW+qh07E4iQ==']

## Creating a feature in which the similarity between the application and the opportunity is calculated using BERT embeddings
- The similarity is calculated using the cosine similarity between the embeddings of the application and the opportunity
- The applicants are sorted and then scored by their semantic similarity to the opportunity

In [None]:
def extract_index(chunk_content):
    try:
        return int(chunk_content.split('[')[0].replace(" ", ''))
    except ValueError:
        return None # Return None or an appropriate default value for unexpected values

# Add a new column to store the original index
applications['original_index'] = applications.index

# List to store the individual DataFrames
all_job_applicant_similarity_reduced = []

counter = 0
for job in applications['job_id'].unique():
    jobs_applications = applications.loc[applications['job_id'] == job].reset_index(drop=True)
    jobs_applications['full_app'] = jobs_applications['original_index'].astype(str) + jobs_applications['full_app']
    
    # Define list_of_full_apps
    list_of_full_apps = jobs_applications['full_app'].to_list()

    job_applicant_similarity = calculate_similarity_to_chunks(list_of_full_apps, jobs_applications['job_description'][0])
    job_applicant_similarity['job_id'] = job
    job_applicant_similarity['application_original_index'] = job_applicant_similarity['chunk_content'].apply(extract_index)
    job_applicant_similarity['similarity_to_job_posting_rank'] = range(1, job_applicant_similarity.shape[0] + 1)

    # Define job_applicant_similarity_reduced
    job_applicant_similarity_reduced = job_applicant_similarity[['application_original_index', 'similarity_score', 'similarity_to_job_posting_rank']]
    
    # Append to the list
    all_job_applicant_similarity_reduced.append(job_applicant_similarity_reduced)

    counter += 1
    print(f"{counter} done!")

# Concatenate all the individual DataFrames into a single large DataFrame
final_job_applicant_similarity_reduced = pd.concat(all_job_applicant_similarity_reduced, ignore_index=True)

final_job_applicant_similarity_reduced

In [None]:
# Displaying the distribution of similarity scores
final_job_applicant_similarity_reduced.application_original_index.describe()

In [None]:
# Merge using the 'original_index' column and 'application_original_index' column
final_applications = applications.merge(final_job_applicant_similarity_reduced, how='left', left_on='original_index', right_on='application_original_index')

# You can drop the 'application_original_index' column if it's no longer needed
final_applications = final_applications.drop(columns=['application_original_index', 'original_index'])

final_applications

In [None]:
applications.to_pickle("opportunities_and_applications.pkl")