# Notebook to map job title and description with occupation and competency

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import spacy

In [2]:
# Download pretrained enlgish model
try:
    import en_core_web_md
except:
    !python -m spacy download en_core_web_md
    import en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.7 MB/s eta 0:00:011
Installing collected packages: en-core-web-md
    Running setup.py install for en-core-web-md ... [?25ldone
[?25hSuccessfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [3]:
# Read sqlite query results into a pandas DataFrame
home_path = "../../"
con = sqlite3.connect(home_path+"collectors/data.sqlite3")
job_df = pd.read_sql_query("SELECT * from job_post", con)
con.close()

In [4]:
# Verify that result of SQL query is stored in the dataframe
job_df.head()

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist


In [5]:
job_titles = job_df['title'].to_list()

In [6]:
job_titles = [title.lower() for title in job_titles]

In [7]:
nlp = en_core_web_md.load()

In [8]:
# Preprocess the text
def process_text(text):
    doc = nlp(text)
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        if token.lemma_ == '-PRON-':
            continue
        # result.append(token.lemma_)
        result.append(token.text)
    return " ".join(result)
    # return result

In [9]:
# Process every document
for i in range(len(job_titles)):
    job_titles[i] = process_text(job_titles[i])

In [10]:
job_titles[0:5]

['data scientist',
 'business intelligence analyst',
 'human resources data scientist',
 'lead human resource data scientist',
 'machine learning engineer']

In [11]:
onet_competencies = home_path+'datasets/competencies.csv'
onet_df = pd.read_csv(onet_competencies,index_col=0)
onet_df[onet_df['occupation'] == 'Computer and Information Research Scientists'].head(10)

Unnamed: 0,occupation,competency,category,description
0,Computer and Information Research Scientists,Source code management SCM software,Technology Skills,Development environment software
1,Computer and Information Research Scientists,Microsoft Azure,Technology Skills,Development environment software
2,Computer and Information Research Scientists,Visualization,Abilities,The ability to imagine how something will look...
3,Computer and Information Research Scientists,Free-field speakers,Tools Used,Loudspeakers
4,Computer and Information Research Scientists,Data visualization software,Technology Skills,Analytical or scientific software
5,Computer and Information Research Scientists,Judgment and Decision Making,Skills,Considering the relative costs and benefits of...
6,Computer and Information Research Scientists,Linux,Technology Skills,Operating system software
7,Computer and Information Research Scientists,IBM Rational Apex,Technology Skills,Configuration management software
8,Computer and Information Research Scientists,Minitab,Technology Skills,Analytical or scientific software
9,Computer and Information Research Scientists,Evaluate project plans and proposals to assess...,Task Statements,Core


In [12]:
onet_occupations = home_path+'datasets/occupations.csv'
onet_oc_df = pd.read_csv(onet_occupations,index_col=0)
onet_oc_df.head(10)

Unnamed: 0,identifier,name,description,titles
0,15-1111.00,Computer and Information Research Scientists,Conduct research into fundamental computer and...,Artificial Intelligence Specialist (AI Special...
1,19-4061.00,Social Science Research Assistants,"Assist social scientists in laboratory, survey...","Bilingual Research Interviewer,Clinical Resear..."
2,19-2099.01,Remote Sensing Scientists and Technologists,Apply remote sensing principles and methods to...,"All Source Intelligence Analyst,Data Analytics..."
3,19-1029.01,Bioinformatics Scientists,Conduct research using bioinformatics theory a...,"Assistant Scientist,Bioinformatician,Bioinform..."
4,15-1199.04,Geospatial Information Scientists and Technolo...,Research or develop geospatial technologies. M...,"Geographic Information Scientist,Geographic In..."
5,19-3022.00,Survey Researchers,"Plan, develop, or conduct surveys. May analyze...","Data Analyst,Data Collection Specialist,Field ..."
6,15-2041.00,Statisticians,Develop or apply mathematical or statistical t...,"Analytical Statistician,Applied Scientist,Appl..."
7,15-1121.00,Computer Systems Analysts,"Analyze science, engineering, business, and ot...","Applications Analyst,Applications Systems Anal..."
8,15-2021.00,Mathematicians,Conduct research in fundamental mathematics or...,"Agent-Based Modeler,Algebraist,Applied Mathema..."
9,15-1133.00,"Software Developers, Systems Software","Research, design, develop, and test operating ...","Applications Analyst,Automation Engineer,Beta ..."


In [13]:
voc_bigrams = home_path+'datasets/top_bigrams.csv'
voc_bigrams_df = pd.read_csv(voc_bigrams)
top_bigrams = voc_bigrams_df['description'].unique()
top_bigrams

array(['machine learning', 'data science', 'data scientist',
       'computer science', 'data analytics', 'deep learning',
       'experience working', 'big data', 'des données',
       'advanced analytics', 'years experience', 'data sets',
       'data analysis', 'communication skills', 'data scientists',
       'experience data', 'problem solving', 'data driven',
       'business intelligence', 'data visualization', 'data engineer',
       'data pipelines', 'data engineering', 'equal opportunity',
       'software development', 'opportunity employer', 'lehigh hanson',
       'bachelor degree', 'degree computer', 'data analyst',
       'ability work', 'et des', 'business analyst', 'business units',
       'related field', 'fast paced', 'et la', 'skills ability', 'et les',
       'job types'], dtype=object)

In [22]:
title_occupation = job_df.copy()
title_occupation['title_processed'] = job_titles
title_occupation['keyword_processed'] = title_occupation.apply(lambda row: [bigram for bigram in top_bigrams if bigram in row.description], axis=1)
title_occupation['title_processed'] = title_occupation.apply(lambda row: '{} {}'.format(row.title_processed, ' '.join(row.keyword_processed)), axis=1)
title_occupation.head(5)

Unnamed: 0,id,title,company,location,description,source,search_kw,title_processed,keyword_processed
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist,data scientist machine learning data science c...,"[machine learning, data science, computer scie..."
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist,business intelligence analyst data science dat...,"[data science, data analytics, years experienc..."
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist,human resources data scientist machine learnin...,"[machine learning, data science, computer scie..."
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist,lead human resource data scientist machine lea...,"[machine learning, data science, computer scie..."
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist,machine learning engineer machine learning dat...,"[machine learning, data science, deep learning..."


In [24]:
title_occupation = job_df.copy()
title_occupation['title_processed'] = job_titles

for nums in range(0, len(top_bigrams) + 1, 5):
    title_occupation['keyword_processed_'+str(nums)] = title_occupation.apply(lambda row: [bigram for bigram in top_bigrams[:nums] if bigram in row.description], axis=1)
    title_occupation['title_processed_'+str(nums)] = title_occupation.apply(lambda row: '{} {}'.format(row.title_processed, ' '.join(row['keyword_processed_'+str(nums)])), axis=1)
title_occupation.head(5)

Unnamed: 0,id,title,company,location,description,source,search_kw,title_processed,keyword_processed_0,title_processed_0,...,keyword_processed_20,title_processed_20,keyword_processed_25,title_processed_25,keyword_processed_30,title_processed_30,keyword_processed_35,title_processed_35,keyword_processed_40,title_processed_40
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist,data scientist,[],data scientist,...,"[machine learning, data science, computer scie...",data scientist machine learning data science c...,"[machine learning, data science, computer scie...",data scientist machine learning data science c...,"[machine learning, data science, computer scie...",data scientist machine learning data science c...,"[machine learning, data science, computer scie...",data scientist machine learning data science c...,"[machine learning, data science, computer scie...",data scientist machine learning data science c...
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist,business intelligence analyst,[],business intelligence analyst,...,"[data science, data analytics, years experienc...",business intelligence analyst data science dat...,"[data science, data analytics, years experienc...",business intelligence analyst data science dat...,"[data science, data analytics, years experienc...",business intelligence analyst data science dat...,"[data science, data analytics, years experienc...",business intelligence analyst data science dat...,"[data science, data analytics, years experienc...",business intelligence analyst data science dat...
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist,human resources data scientist,[],human resources data scientist,...,"[machine learning, data science, computer scie...",human resources data scientist machine learnin...,"[machine learning, data science, computer scie...",human resources data scientist machine learnin...,"[machine learning, data science, computer scie...",human resources data scientist machine learnin...,"[machine learning, data science, computer scie...",human resources data scientist machine learnin...,"[machine learning, data science, computer scie...",human resources data scientist machine learnin...
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist,lead human resource data scientist,[],lead human resource data scientist,...,"[machine learning, data science, computer scie...",lead human resource data scientist machine lea...,"[machine learning, data science, computer scie...",lead human resource data scientist machine lea...,"[machine learning, data science, computer scie...",lead human resource data scientist machine lea...,"[machine learning, data science, computer scie...",lead human resource data scientist machine lea...,"[machine learning, data science, computer scie...",lead human resource data scientist machine lea...
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist,machine learning engineer,[],machine learning engineer,...,"[machine learning, data science, deep learning...",machine learning engineer machine learning dat...,"[machine learning, data science, deep learning...",machine learning engineer machine learning dat...,"[machine learning, data science, deep learning...",machine learning engineer machine learning dat...,"[machine learning, data science, deep learning...",machine learning engineer machine learning dat...,"[machine learning, data science, deep learning...",machine learning engineer machine learning dat...


In [25]:
# Process every document
occupation_titles =  onet_oc_df['titles'].to_list()
for i in range(len(occupation_titles)):
    occupation_titles[i] = [process_text(title.lower()) for title in occupation_titles[i].split(',')]

In [26]:
onet_oc_df['titles_processed'] = occupation_titles

In [27]:
onet_oc_df.head(10)

Unnamed: 0,identifier,name,description,titles,titles_processed
0,15-1111.00,Computer and Information Research Scientists,Conduct research into fundamental computer and...,Artificial Intelligence Specialist (AI Special...,[artificial intelligence specialist ai special...
1,19-4061.00,Social Science Research Assistants,"Assist social scientists in laboratory, survey...","Bilingual Research Interviewer,Clinical Resear...","[bilingual research interviewer, clinical rese..."
2,19-2099.01,Remote Sensing Scientists and Technologists,Apply remote sensing principles and methods to...,"All Source Intelligence Analyst,Data Analytics...","[source intelligence analyst, data analytics c..."
3,19-1029.01,Bioinformatics Scientists,Conduct research using bioinformatics theory a...,"Assistant Scientist,Bioinformatician,Bioinform...","[assistant scientist, bioinformatician, bioinf..."
4,15-1199.04,Geospatial Information Scientists and Technolo...,Research or develop geospatial technologies. M...,"Geographic Information Scientist,Geographic In...","[geographic information scientist, geographic ..."
5,19-3022.00,Survey Researchers,"Plan, develop, or conduct surveys. May analyze...","Data Analyst,Data Collection Specialist,Field ...","[data analyst, data collection specialist, fie..."
6,15-2041.00,Statisticians,Develop or apply mathematical or statistical t...,"Analytical Statistician,Applied Scientist,Appl...","[analytical statistician, applied scientist, a..."
7,15-1121.00,Computer Systems Analysts,"Analyze science, engineering, business, and ot...","Applications Analyst,Applications Systems Anal...","[applications analyst, applications systems an..."
8,15-2021.00,Mathematicians,Conduct research in fundamental mathematics or...,"Agent-Based Modeler,Algebraist,Applied Mathema...","[agent based modeler, algebraist, applied math..."
9,15-1133.00,"Software Developers, Systems Software","Research, design, develop, and test operating ...","Applications Analyst,Automation Engineer,Beta ...","[applications analyst, automation engineer, be..."


## Two methods to calculate the similarity
### 1. Calculate job title similarity with all the alternate titles for each occupation in ONet
### 2. Calculate job title similarity with each alternate title for each occupation in ONet, and store maximum similarty score valued title

In [35]:
import itertools

# title_names = ['title_processed_'+str(nums) for nums in range(0,41,5)]
# df1 = title_occupation[['id', 'title'] + title_names]

score_df = []

for nums in range(0, 41, 5):
    df1 = title_occupation[['id', 'title', 'title_processed_'+str(nums)]]
    df2 = onet_oc_df[['identifier', 'titles_processed']]

    df_vals = list(itertools.product(df1.values.tolist(),df2.values.tolist()))
    colnames = list(df1.columns) + list(df2.columns)
    score_df.append(pd.DataFrame(list(map(lambda x : sum(x,[]), df_vals)), columns=colnames))
#     score_df.head(10)
score_df[5].head(10)

Unnamed: 0,id,title,title_processed_25,identifier,titles_processed
0,1,Data Scientist,data scientist machine learning data science c...,15-1111.00,[artificial intelligence specialist ai special...
1,1,Data Scientist,data scientist machine learning data science c...,19-4061.00,"[bilingual research interviewer, clinical rese..."
2,1,Data Scientist,data scientist machine learning data science c...,19-2099.01,"[source intelligence analyst, data analytics c..."
3,1,Data Scientist,data scientist machine learning data science c...,19-1029.01,"[assistant scientist, bioinformatician, bioinf..."
4,1,Data Scientist,data scientist machine learning data science c...,15-1199.04,"[geographic information scientist, geographic ..."
5,1,Data Scientist,data scientist machine learning data science c...,19-3022.00,"[data analyst, data collection specialist, fie..."
6,1,Data Scientist,data scientist machine learning data science c...,15-2041.00,"[analytical statistician, applied scientist, a..."
7,1,Data Scientist,data scientist machine learning data science c...,15-1121.00,"[applications analyst, applications systems an..."
8,1,Data Scientist,data scientist machine learning data science c...,15-2021.00,"[agent based modeler, algebraist, applied math..."
9,1,Data Scientist,data scientist machine learning data science c...,15-1133.00,"[applications analyst, automation engineer, be..."


In [37]:
# Method 1 - with all alternate titles together

for nums in range(0, 41, 5):
    df1 = title_occupation[['id', 'title', 'title_processed_'+str(nums)]]
    nlp_title = np.array([nlp(title) for title in df1['title_processed_'+str(nums)]])
    nlp_occupation_title = np.array([nlp(' '.join(title)) for title in df2['titles_processed']])
    scores = np.array([title.similarity(occupation_title) \
                      for title in nlp_title \
                      for occupation_title in nlp_occupation_title])
    score_df[int(nums/5)]['score_all'] = scores

In [38]:
# score_df['score_all'] = scores
score_df[5].head(10)

Unnamed: 0,id,title,title_processed_25,identifier,titles_processed,score_all
0,1,Data Scientist,data scientist machine learning data science c...,15-1111.00,[artificial intelligence specialist ai special...,0.883895
1,1,Data Scientist,data scientist machine learning data science c...,19-4061.00,"[bilingual research interviewer, clinical rese...",0.711012
2,1,Data Scientist,data scientist machine learning data science c...,19-2099.01,"[source intelligence analyst, data analytics c...",0.81273
3,1,Data Scientist,data scientist machine learning data science c...,19-1029.01,"[assistant scientist, bioinformatician, bioinf...",0.743133
4,1,Data Scientist,data scientist machine learning data science c...,15-1199.04,"[geographic information scientist, geographic ...",0.838989
5,1,Data Scientist,data scientist machine learning data science c...,19-3022.00,"[data analyst, data collection specialist, fie...",0.749849
6,1,Data Scientist,data scientist machine learning data science c...,15-2041.00,"[analytical statistician, applied scientist, a...",0.820429
7,1,Data Scientist,data scientist machine learning data science c...,15-1121.00,"[applications analyst, applications systems an...",0.818415
8,1,Data Scientist,data scientist machine learning data science c...,15-2021.00,"[agent based modeler, algebraist, applied math...",0.789097
9,1,Data Scientist,data scientist machine learning data science c...,15-1133.00,"[applications analyst, automation engineer, be...",0.766679


In [40]:
for i in range(len(score_df)):
    score_df[i] = score_df[i].sort_values('score_all', ascending=False).drop_duplicates(['id'])
    score_df[i] = score_df[i].sort_values('id')

score_df[5].head(10)

Unnamed: 0,id,title,title_processed_25,identifier,titles_processed,score_all
0,1,Data Scientist,data scientist machine learning data science c...,15-1111.00,[artificial intelligence specialist ai special...,0.883895
33,2,Business Intelligence Analyst,business intelligence analyst data science dat...,15-1199.08,"[analytical data miner, business analyst, busi...",0.867841
38,3,Human Resources Data Scientist,human resources data scientist machine learnin...,15-1111.00,[artificial intelligence specialist ai special...,0.850638
57,4,Lead - Human Resource Data Scientist,lead human resource data scientist machine lea...,15-1111.00,[artificial intelligence specialist ai special...,0.869649
76,5,Machine Learning Engineer,machine learning engineer machine learning dat...,15-1111.00,[artificial intelligence specialist ai special...,0.847708
95,6,Data Scientist,data scientist machine learning data science y...,15-1111.00,[artificial intelligence specialist ai special...,0.86595
114,7,Associate Data Scientist,associate data scientist machine learning data...,15-1111.00,[artificial intelligence specialist ai special...,0.872069
133,8,Data Scientist I,data scientist machine learning,15-1111.00,[artificial intelligence specialist ai special...,0.884682
152,9,"Data Scientist, AI@Unity",data scientist ai@unity machine learning data ...,15-1111.00,[artificial intelligence specialist ai special...,0.887298
171,10,Data Scientist,data scientist machine learning experience wor...,15-1111.00,[artificial intelligence specialist ai special...,0.847733


In [41]:
get_occupation = []

for nums in range(0, 41, 5):
    df1 = score_df[int(nums/5)].drop(['titles_processed', 'title_processed_'+str(nums)], axis=1)
    df2 = onet_oc_df[['identifier', 'name']].drop_duplicates()

    get_occupation.append(df1.merge(df2, on='identifier', how='left'))
get_occupation[5].head(5)

Unnamed: 0,id,title,identifier,score_all,name
0,1,Data Scientist,15-1111.00,0.883895,Computer and Information Research Scientists
1,2,Business Intelligence Analyst,15-1199.08,0.867841,Business Intelligence Analysts
2,3,Human Resources Data Scientist,15-1111.00,0.850638,Computer and Information Research Scientists
3,4,Lead - Human Resource Data Scientist,15-1111.00,0.869649,Computer and Information Research Scientists
4,5,Machine Learning Engineer,15-1111.00,0.847708,Computer and Information Research Scientists


In [23]:
get_occupation[~get_occupation['title'].str.contains('data scientist', case=False, regex=False)].head(50)

Unnamed: 0,id,title,identifier,score_all,name
1,2,Business Intelligence Analyst,15-1199.08,0.866399,Business Intelligence Analysts
4,5,Machine Learning Engineer,15-1111.00,0.847708,Computer and Information Research Scientists
12,13,Jr. Data Science Developer,15-1111.00,0.888068,Computer and Information Research Scientists
17,18,"Manager of Data Science, RACE21 - Vancouver",15-1111.00,0.902421,Computer and Information Research Scientists
19,20,"Lead Educator, Data Science",15-1111.00,0.824635,Computer and Information Research Scientists
20,21,Jr. Data Science Developer,15-1111.00,0.888068,Computer and Information Research Scientists
23,24,Applied Scientist,15-1111.00,0.844633,Computer and Information Research Scientists
25,26,Mine Planning Engineer,15-1199.02,0.800736,Computer Systems Engineers/Architects
26,27,Human Resources Administrator,15-1141.00,0.77896,Database Administrators
27,28,"Associate Director, Engineering Admissions",19-4061.00,0.840551,Social Science Research Assistants


In [59]:
# Save file
for nums in range(0, 41, 5):
    get_occupation[int(nums/5)].to_csv(home_path+'results/ngrams/title_occupation_' + str(nums) + '.csv', index=False)