In [1]:
import pandas as pd
import sklearn
import numpy as np
from tqdm import tqdm

import spacy
nlp = spacy.load('en_core_web_lg')
import skillNer
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor

import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from functools import lru_cache
## Using SkillNer to extract skills
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

@lru_cache(maxsize=None)  # Unlimited cache
def extract_skills(text):
    annotations = skill_extractor.annotate(text)
    return annotations

# Your list of descriptions
list_of_descs = ["desc1", "desc2", "desc3"]

# Using a set to store unique skills
unique_skills = set()

for desc in list_of_descs:
    skills = extract_skills(desc)
    unique_skills.update(skills)


def clean_extraction(annotations):
    # Initialize an empty list to store the skills
    cleaned_skills = []
    
    # Extract full matches
    for match in annotations['results']['full_matches']:
        try:
            skill = match['doc_node_value']
            score = match['score']
        except:
            skill = 'No skills detected'
            score = 0
        cleaned_skills.append(skill) # (skill, score) could be done as well
    
    # Extract ngram scored matches
    for match in annotations['results']['ngram_scored']:
        try:
            skill = match['doc_node_value']
            score = match['score']
        except:
            skill = 'No skills detected'
            score = 0
        cleaned_skills.append(skill) # (skill, score) could be done as well
    
    return cleaned_skills


# Combine the two functions with exception handling
@lru_cache(maxsize=None)  # Unlimited cache
def full_scan(text):
    try:
        return(clean_extraction(extract_skills(text)))
    except:
        return([('No skills detected', 0)])

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [3]:
apps = pd.read_pickle('../../Data/split_4_with_NER_skills.pkl')

In [4]:
apps.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'similarity_score', 'similarity_rank', 'opportunity_required_skills'],
      dtype='object')

## Adding the skills to the dataframe

In [None]:

# # Pre-allocate an array
# n_rows = len(apps)
# skills_array = np.empty(n_rows, dtype=object)

# # Loop through the DataFrame
# for i, row in enumerate(tqdm(apps.itertuples())):
#     skills_array[i] = full_scan(row.opportunity_description)

# # Create the new column
# apps['opportunity_required_skills'] = skills_array
# apps.rename(columns = {"opportunity_required_skills_test": "opportunity_required_skills"}, inplace = True)
# apps.to_pickle("../../Data/split_4_with_NER_skills.pkl")

In [8]:
n_rows = len(apps)
# Do the same with brief_description_skills
skills_array = np.empty(n_rows, dtype=object)

# Loop through the DataFrame
for i, row in enumerate(tqdm(apps.itertuples())):
    skills_array[i] = full_scan(row.opportunity_brief_description)

# Create the new column
apps['brief_description_skills'] = skills_array

0it [00:00, ?it/s]

  vec_similarity = token1.similarity(token2)
22053it [02:00, 182.79it/s] 


In [9]:
apps

Unnamed: 0,opportunity_id,application_id,opportunity_brief_description,opportunity_description,opportunity_title,application_pass_first_step,application_step_category,application_job_titles,application_job_responsibilities,application_education,application_reported_skills,similarity_score,similarity_rank,opportunity_required_skills,brief_description_skills
0,x3PXnFA1GkCs0Cdz7q83zA==,gICe00fHwUKHC5ROv8WM4g==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,True,1,general manager assistant manager general mana...,30907 interviewed job candidate made staffing ...,High School Diploma++BPA,Sales++Cash++Customer service++Marketing++Rece...,0.928475,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ..."
1,uEY0wW08R0WpiBkds/p4fg==,aaQVNXk5OEqPL7kZJJW6iw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,package handler,waa great job got home late evwnings pay good ...,,,0.920917,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ..."
2,gbzxt0dALU6x83ACFTSGEA==,mggx+MQSvUG6prEI0WkoAA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,logistics supervisor logistics warehouse super...,supervised operation multi shift food grade dc...,Bachelor of Science++Master of Arts,Metrics++Inventory++Training++Operations++Hr++...,0.936904,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ..."
3,Etqs/xDAX0SXrfmQA6+KBQ==,ewtbense/kyY62nuXSljHw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,service tech,diagonose repair vehicle suspension work aswel...,,,0.921125,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ..."
4,lOZhR1k1Kk+OVz6n8r9vwA==,OuoS29IERk+mJTjddJNYSQ==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,security supervisor sharp shooter navigator kr...,patrolling protecting people property informat...,G.E.D,Cpr++Cpr certified++Documenting++Radio communi...,0.938756,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22048,5hdHoZVOPUWJqeggPv5T1Q==,Yi4u0MpbW0CyhqdpFwQvmA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,delivery driver,delivered sold furniture,,,0.151831,14,"[service set, service set, service set, custom...","[pest management, pest management, management ..."
22049,ULIT1Ap/WEeVR6sinqxo9w==,ENwvF9XYsU2yuoI0paNsfg==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,cook,cooking food keeping hot box stocked cleaning ...,Diploma,Driving,0.151831,13,"[service set, service set, service set, custom...","[pest management, pest management, management ..."
22050,oAZjGHneVkKWKsiRaGh8Ag==,+9UVMbL9kUa6VstH0Bzgnw==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,auditor,,Diploma,Confined space training,0.629281,9,"[service set, service set, service set, custom...","[pest management, pest management, management ..."
22051,oAZjGHneVkKWKsiRaGh8Ag==,7RyhIseedUGAVV4lfsKGow==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,machine operator thrower customer service repr...,simple machine operation back truck throwing t...,Diploma,Customer service++Customer service representat...,0.153763,10,"[service set, service set, service set, custom...","[pest management, pest management, management ..."


In [10]:
apps.to_pickle("../../Data/split_4_with_NER_skills.pkl")

In [11]:
def extract_skills(text):
    annotations = skill_extractor.annotate(text)
    return annotations
def full_scan(text):
    try:
        return(clean_extraction(extract_skills(text)))
    except:
        return([('No skills detected', 0)])

skills_array = np.empty(n_rows, dtype=object)

# Loop through the DataFrame
for i, row in enumerate(tqdm(apps.itertuples())):
    skills_array[i] = full_scan(row.application_job_responsibilities)

# Create the new column
apps['application_job_responsibilities_extracted_skills'] = skills_array

#Takes approx. 1 second per iteration. It took 8 hours for 22053k rows

  vec_similarity = token1.similarity(token2)
22053it [8:03:43,  1.32s/it]


In [12]:
apps.to_pickle("../../Data/split_4_with_NER_skills_for_applications.pkl")

In [13]:
apps['application_reported_skills'] = apps['application_reported_skills'].apply(lambda x: x.lower().split('++'))

#make application skills a column which combines the two lists, application_reported_skills and application_job_responsibilities_skills
apps['application_skills'] = apps['application_reported_skills'] + apps['application_job_responsibilities_extracted_skills']

In [14]:
apps

Unnamed: 0,opportunity_id,application_id,opportunity_brief_description,opportunity_description,opportunity_title,application_pass_first_step,application_step_category,application_job_titles,application_job_responsibilities,application_education,application_reported_skills,similarity_score,similarity_rank,opportunity_required_skills,brief_description_skills,application_job_responsibilities_extracted_skills,application_skills
0,x3PXnFA1GkCs0Cdz7q83zA==,gICe00fHwUKHC5ROv8WM4g==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,True,1,general manager assistant manager general mana...,30907 interviewed job candidate made staffing ...,High School Diploma++BPA,"[sales, cash, customer service, marketing, rec...",0.928475,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[food quality, food preparation, information p...","[sales, cash, customer service, marketing, rec..."
1,uEY0wW08R0WpiBkds/p4fg==,aaQVNXk5OEqPL7kZJJW6iw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,package handler,waa great job got home late evwnings pay good ...,,[],0.920917,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...",[],[]
2,gbzxt0dALU6x83ACFTSGEA==,mggx+MQSvUG6prEI0WkoAA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,logistics supervisor logistics warehouse super...,supervised operation multi shift food grade dc...,Bachelor of Science++Master of Arts,"[metrics, inventory, training, operations, hr,...",0.936904,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[warehouse operation, time study, loss prevent...","[metrics, inventory, training, operations, hr,..."
3,Etqs/xDAX0SXrfmQA6+KBQ==,ewtbense/kyY62nuXSljHw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,service tech,diagonose repair vehicle suspension work aswel...,,[],0.921125,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[suspension, claim]","[, suspension, claim]"
4,lOZhR1k1Kk+OVz6n8r9vwA==,OuoS29IERk+mJTjddJNYSQ==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,security supervisor sharp shooter navigator kr...,patrolling protecting people property informat...,G.E.D,"[cpr, cpr certified, documenting, radio commun...",0.938756,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[counter terrorism, patrolling, suspicious act...","[cpr, cpr certified, documenting, radio commun..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22048,5hdHoZVOPUWJqeggPv5T1Q==,Yi4u0MpbW0CyhqdpFwQvmA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,delivery driver,delivered sold furniture,,[],0.151831,14,"[service set, service set, service set, custom...","[pest management, pest management, management ...",[],[]
22049,ULIT1Ap/WEeVR6sinqxo9w==,ENwvF9XYsU2yuoI0paNsfg==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,cook,cooking food keeping hot box stocked cleaning ...,Diploma,[driving],0.151831,13,"[service set, service set, service set, custom...","[pest management, pest management, management ...","[cooking, food]","[driving, cooking, food]"
22050,oAZjGHneVkKWKsiRaGh8Ag==,+9UVMbL9kUa6VstH0Bzgnw==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,auditor,,Diploma,[confined space training],0.629281,9,"[service set, service set, service set, custom...","[pest management, pest management, management ...",[],[confined space training]
22051,oAZjGHneVkKWKsiRaGh8Ag==,7RyhIseedUGAVV4lfsKGow==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,machine operator thrower customer service repr...,simple machine operation back truck throwing t...,Diploma,"[customer service, customer service representa...",0.153763,10,"[service set, service set, service set, custom...","[pest management, pest management, management ...","[machine operation, account, banking, survey]","[customer service, customer service representa..."


In [15]:
# CALCULATE THE JACCARD SIMILARITY BETWEEN THE TWO LISTS
def jaccard_similarity(list1, list2):
    #calculare jaccard similarity
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    if union == 0:
        return 0
    else:
        return float(intersection) / union

#Here is where we take either similarity or intersection or both to find skill matches
apps['jaccard_similarity'] = apps.apply(lambda x: jaccard_similarity(x['opportunity_required_skills'], x['application_skills']), axis=1)
apps

Unnamed: 0,opportunity_id,application_id,opportunity_brief_description,opportunity_description,opportunity_title,application_pass_first_step,application_step_category,application_job_titles,application_job_responsibilities,application_education,application_reported_skills,similarity_score,similarity_rank,opportunity_required_skills,brief_description_skills,application_job_responsibilities_extracted_skills,application_skills,jaccard_similarity
0,x3PXnFA1GkCs0Cdz7q83zA==,gICe00fHwUKHC5ROv8WM4g==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,True,1,general manager assistant manager general mana...,30907 interviewed job candidate made staffing ...,High School Diploma++BPA,"[sales, cash, customer service, marketing, rec...",0.928475,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[food quality, food preparation, information p...","[sales, cash, customer service, marketing, rec...",0.000000
1,uEY0wW08R0WpiBkds/p4fg==,aaQVNXk5OEqPL7kZJJW6iw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,package handler,waa great job got home late evwnings pay good ...,,[],0.920917,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...",[],[],0.000000
2,gbzxt0dALU6x83ACFTSGEA==,mggx+MQSvUG6prEI0WkoAA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,logistics supervisor logistics warehouse super...,supervised operation multi shift food grade dc...,Bachelor of Science++Master of Arts,"[metrics, inventory, training, operations, hr,...",0.936904,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[warehouse operation, time study, loss prevent...","[metrics, inventory, training, operations, hr,...",0.014493
3,Etqs/xDAX0SXrfmQA6+KBQ==,ewtbense/kyY62nuXSljHw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,service tech,diagonose repair vehicle suspension work aswel...,,[],0.921125,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[suspension, claim]","[, suspension, claim]",0.000000
4,lOZhR1k1Kk+OVz6n8r9vwA==,OuoS29IERk+mJTjddJNYSQ==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,security supervisor sharp shooter navigator kr...,patrolling protecting people property informat...,G.E.D,"[cpr, cpr certified, documenting, radio commun...",0.938756,1,"[pest control, service provider, good driving ...","[pest management, pest management, management ...","[counter terrorism, patrolling, suspicious act...","[cpr, cpr certified, documenting, radio commun...",0.014085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22048,5hdHoZVOPUWJqeggPv5T1Q==,Yi4u0MpbW0CyhqdpFwQvmA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,delivery driver,delivered sold furniture,,[],0.151831,14,"[service set, service set, service set, custom...","[pest management, pest management, management ...",[],[],0.000000
22049,ULIT1Ap/WEeVR6sinqxo9w==,ENwvF9XYsU2yuoI0paNsfg==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,cook,cooking food keeping hot box stocked cleaning ...,Diploma,[driving],0.151831,13,"[service set, service set, service set, custom...","[pest management, pest management, management ...","[cooking, food]","[driving, cooking, food]",0.000000
22050,oAZjGHneVkKWKsiRaGh8Ag==,+9UVMbL9kUa6VstH0Bzgnw==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,auditor,,Diploma,[confined space training],0.629281,9,"[service set, service set, service set, custom...","[pest management, pest management, management ...",[],[confined space training],0.000000
22051,oAZjGHneVkKWKsiRaGh8Ag==,7RyhIseedUGAVV4lfsKGow==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,machine operator thrower customer service repr...,simple machine operation back truck throwing t...,Diploma,"[customer service, customer service representa...",0.153763,10,"[service set, service set, service set, custom...","[pest management, pest management, management ...","[machine operation, account, banking, survey]","[customer service, customer service representa...",0.012987


In [16]:
apps.jaccard_similarity.describe()

count    22053.000000
mean         0.013465
std          0.014880
min          0.000000
25%          0.000000
50%          0.011236
75%          0.022727
max          0.185185
Name: jaccard_similarity, dtype: float64

### Even the highest jaccard similarity is 0.5, which is not very high.

In [20]:
import plotly.express as px
fig = px.histogram(apps, x="jaccard_similarity", title='Jaccard Similarity of Skills between Applications and Opportunities')
fig.show()

In [22]:
# Function to find the intersection of skills between two lists
def find_intersection(skills1, skills2):
    return list(set(skills1).intersection(set(skills2)))

# Create a new column that contains the intersection of skills between the two columns
apps['intersected_skills'] = apps.apply(lambda row: find_intersection(row['opportunity_required_skills'], row['application_skills']), axis=1)
apps['intersected_skills_count'] = apps['intersected_skills'].apply(lambda x: len(x))
apps['intersected_skills_count'].describe()

count    22053.000000
mean         1.452818
std          1.853443
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max         17.000000
Name: intersected_skills_count, dtype: float64

In [37]:
for column in apps.columns:
    if '_skills' in column and '_count' not in column:
        apps[column + '_list_length'] = apps[column].apply(lambda x: len(x))
apps

Unnamed: 0,opportunity_id,application_id,opportunity_brief_description,opportunity_description,opportunity_title,application_pass_first_step,application_step_category,application_job_titles,application_job_responsibilities,application_education,...,application_skills,jaccard_similarity,intersected_skills,intersected_skills_count,application_reported_skills_list_length,opportunity_required_skills_list_length,brief_description_skills_list_length,application_job_responsibilities_extracted_skills_list_length,application_skills_list_length,intersected_skills_list_length
0,x3PXnFA1GkCs0Cdz7q83zA==,gICe00fHwUKHC5ROv8WM4g==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,True,1,general manager assistant manager general mana...,30907 interviewed job candidate made staffing ...,High School Diploma++BPA,...,"[sales, cash, customer service, marketing, rec...",0.000000,[],0,24,61,5,42,66,0
1,uEY0wW08R0WpiBkds/p4fg==,aaQVNXk5OEqPL7kZJJW6iw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,package handler,waa great job got home late evwnings pay good ...,,...,[],0.000000,[],0,1,61,5,0,1,0
2,gbzxt0dALU6x83ACFTSGEA==,mggx+MQSvUG6prEI0WkoAA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,logistics supervisor logistics warehouse super...,supervised operation multi shift food grade dc...,Bachelor of Science++Master of Arts,...,"[metrics, inventory, training, operations, hr,...",0.014493,"[operation, position]",2,33,61,5,46,79,2
3,Etqs/xDAX0SXrfmQA6+KBQ==,ewtbense/kyY62nuXSljHw==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,service tech,diagonose repair vehicle suspension work aswel...,,...,"[, suspension, claim]",0.000000,[],0,1,61,5,2,3,0
4,lOZhR1k1Kk+OVz6n8r9vwA==,OuoS29IERk+mJTjddJNYSQ==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,termite technician,False,0,security supervisor sharp shooter navigator kr...,patrolling protecting people property informat...,G.E.D,...,"[cpr, cpr certified, documenting, radio commun...",0.014085,[operation],1,6,61,5,5,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22048,5hdHoZVOPUWJqeggPv5T1Q==,Yi4u0MpbW0CyhqdpFwQvmA==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,delivery driver,delivered sold furniture,,...,[],0.000000,[],0,1,62,5,0,1,0
22049,ULIT1Ap/WEeVR6sinqxo9w==,ENwvF9XYsU2yuoI0paNsfg==,orkin purpose help protect world live work pla...,orkin purpose help protect world live work pla...,pest control specialist,False,0,cook,cooking food keeping hot box stocked cleaning ...,Diploma,...,"[driving, cooking, food]",0.000000,[],0,1,62,5,2,3,0
22050,oAZjGHneVkKWKsiRaGh8Ag==,+9UVMbL9kUa6VstH0Bzgnw==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,auditor,,Diploma,...,[confined space training],0.000000,[],0,1,62,5,0,1,0
22051,oAZjGHneVkKWKsiRaGh8Ag==,7RyhIseedUGAVV4lfsKGow==,16 00 per hour orkin purpose help protect worl...,16 00 per hour orkin purpose help protect worl...,service technician train,True,0,machine operator thrower customer service repr...,simple machine operation back truck throwing t...,Diploma,...,"[customer service, customer service representa...",0.012987,[customer service],1,12,62,5,4,16,1


In [39]:
#save apps to pickle and parquet
apps.to_pickle("../../Data/split_4_enriched.pkl")
#apps.to_parquet("../../Data/split_4_enriched.parquet")


In [40]:
apps.columns

Index(['opportunity_id', 'application_id', 'opportunity_brief_description',
       'opportunity_description', 'opportunity_title',
       'application_pass_first_step', 'application_step_category',
       'application_job_titles', 'application_job_responsibilities',
       'application_education', 'application_reported_skills',
       'similarity_score', 'similarity_rank', 'opportunity_required_skills',
       'brief_description_skills',
       'application_job_responsibilities_extracted_skills',
       'application_skills', 'jaccard_similarity', 'intersected_skills',
       'intersected_skills_count', 'application_reported_skills_list_length',
       'opportunity_required_skills_list_length',
       'brief_description_skills_list_length',
       'application_job_responsibilities_extracted_skills_list_length',
       'application_skills_list_length', 'intersected_skills_list_length'],
      dtype='object')

In [44]:
# This column takes the list of applicant job titles and checks if the opportunity title is in the list
# If someone has held ____ role before, reasonably they'd have an increased likelihood of being a good fit for the role
apps['applicant_has_held_role_before'] = apps.apply(lambda x: x['opportunity_title'] in x['application_job_titles'], axis=1)
apps.applicant_has_held_role_before.value_counts()
# Ethical consideration: this could be used to discriminate against people who have not held the role before
# Gatekeeping could be a problem here.

applicant_has_held_role_before
False    21698
True       355
Name: count, dtype: int64

In [45]:
apps.drop(columns = ['applicant_has_held_role_before'], inplace = True)

# What we have now -
- Cleaned up data with the results of their applications (We know when people have gotten past the resume stage).
- We have skills extracted from opportunities and the applications made to them (as well as their lengths and intersections)
- We have cosine simialrity using BERT and the corresponding semantic ranking (based on cosine similarity)

# What we should do
- Quick Test. Find intersection of Job Titles (for a given application, has the applicant held that position before?)✅
- <b>Define the parameters. What inputs will our model take in, and what do the JSON outputs look like? (few hours)</b>
- Make a big ol program with our functions to get us where we need to go.
- Streamlit app using the functions. Small, but it's a Minimum Viable Product (1d)

# Once we have a working, deployed model:
- Fine-tune BERT on our data
- The Job Title Normalization is a project on its own, and could lead to additional alignment of skills.
- Does the opportunity ask for a degree? Does the applicant have that degree.