# Notebook to map job titles and descriptions to occupations and competencies

In [2]:
import os
import sqlite3

import numpy as np
import pandas as pd
import spacy

In [3]:
# Download pretrained enlgish model
try:
    import en_core_web_sm
except:
    !python -m spacy download en_core_web_sm
    import en_core_web_sm

In [4]:
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("collectors/data.sqlite3")
try:
    job_df = pd.read_sql_query("SELECT * from job_post", con)
finally:
    # Release the database handle even when the query raises
    con.close()

In [5]:
# Preview the first rows to verify that the SQL query result is stored in the DataFrame
job_df.head()

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist


In [5]:
# Collect the raw job titles into a plain Python list
job_titles = job_df['title'].tolist()

In [6]:
# Normalise every job title to lower case
job_titles = [raw_title.lower() for raw_title in job_titles]

In [7]:
# Build the spaCy pipeline from the en_core_web_sm package; the global `nlp`
# is used by process_text and by the similarity cell further down
nlp = en_core_web_sm.load()

In [8]:
# Preprocess the text
def process_text(text):
    """Strip stop words, punctuation and pronoun placeholders from ``text``.

    The text is tokenised with the module-level spaCy pipeline ``nlp``. The
    surviving tokens are returned in their original surface form (they are
    not lemmatised), joined by single spaces.

    Args:
        text: String to clean.

    Returns:
        The kept tokens joined by spaces; an empty string when every token
        is filtered out.
    """
    stop_words = nlp.Defaults.stop_words
    kept = []
    for token in nlp(text):
        # Compare the lower-cased form so capitalised stop words ("The",
        # "And") are dropped even when the caller did not lower-case first.
        if token.lower_ in stop_words:
            continue
        if token.is_punct:
            continue
        # NOTE(review): '-PRON-' looks like the placeholder lemma spaCy 2.x
        # gives pronouns; newer spaCy versions may never emit it - confirm.
        if token.lemma_ == '-PRON-':
            continue
        kept.append(token.text)
    return " ".join(kept)

In [9]:
# Clean every job title with process_text
job_titles = [process_text(title) for title in job_titles]

In [10]:
# Peek at the first five processed titles
job_titles[:5]

['data scientist',
 'business intelligence analyst',
 'human resources data scientist',
 'lead human resource data scientist',
 'machine learning engineer']

In [48]:
# Load the ONet competencies table and preview the rows of one occupation
onet_competencies = 'datasets/competencies.csv'
onet_df = pd.read_csv(onet_competencies, index_col=0)
onet_df.loc[
    onet_df['occupation'].eq('Computer and Information Research Scientists')
].head(10)

Unnamed: 0,occupation,competency,category,description
0,Computer and Information Research Scientists,Source code management SCM software,Technology Skills,Development environment software
1,Computer and Information Research Scientists,Microsoft Azure,Technology Skills,Development environment software
2,Computer and Information Research Scientists,Visualization,Abilities,The ability to imagine how something will look...
3,Computer and Information Research Scientists,Free-field speakers,Tools Used,Loudspeakers
4,Computer and Information Research Scientists,Data visualization software,Technology Skills,Analytical or scientific software
5,Computer and Information Research Scientists,Judgment and Decision Making,Skills,Considering the relative costs and benefits of...
6,Computer and Information Research Scientists,Linux,Technology Skills,Operating system software
7,Computer and Information Research Scientists,IBM Rational Apex,Technology Skills,Configuration management software
8,Computer and Information Research Scientists,Minitab,Technology Skills,Analytical or scientific software
9,Computer and Information Research Scientists,Evaluate project plans and proposals to assess...,Task Statements,Core


In [49]:
# Load the ONet occupations table (first CSV column becomes the index) and preview it
onet_occupations = 'datasets/occupations.csv'
onet_oc_df = pd.read_csv(onet_occupations, index_col=0)
onet_oc_df.head(n=10)

Unnamed: 0,identifier,name,description,titles
0,15-1111.00,Computer and Information Research Scientists,Conduct research into fundamental computer and...,Artificial Intelligence Specialist (AI Special...
1,19-4061.00,Social Science Research Assistants,"Assist social scientists in laboratory, survey...","Bilingual Research Interviewer,Clinical Resear..."
2,19-2099.01,Remote Sensing Scientists and Technologists,Apply remote sensing principles and methods to...,"All Source Intelligence Analyst,Data Analytics..."
3,19-1029.01,Bioinformatics Scientists,Conduct research using bioinformatics theory a...,"Assistant Scientist,Bioinformatician,Bioinform..."
4,15-1199.04,Geospatial Information Scientists and Technolo...,Research or develop geospatial technologies. M...,"Geographic Information Scientist,Geographic In..."
5,19-3022.00,Survey Researchers,"Plan, develop, or conduct surveys. May analyze...","Data Analyst,Data Collection Specialist,Field ..."
6,15-2041.00,Statisticians,Develop or apply mathematical or statistical t...,"Analytical Statistician,Applied Scientist,Appl..."
7,15-1121.00,Computer Systems Analysts,"Analyze science, engineering, business, and ot...","Applications Analyst,Applications Systems Anal..."
8,15-2021.00,Mathematicians,Conduct research in fundamental mathematics or...,"Agent-Based Modeler,Algebraist,Applied Mathema..."
9,15-1133.00,"Software Developers, Systems Software","Research, design, develop, and test operating ...","Applications Analyst,Automation Engineer,Beta ..."


In [84]:
# Attach the cleaned titles to a copy of the job posts (job_df itself is left untouched)
title_occupation = job_df.assign(title_processed=job_titles)
title_occupation.head(10)

Unnamed: 0,id,title,company,location,description,source,search_kw,title_processed
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist,business intelligence analyst
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist,human resources data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist,lead human resource data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist,machine learning engineer
5,6,Data Scientist,BrainStation,"Vancouver, BC",BrainStation is a global leader in digital ski...,indeed.com,data scientist,data scientist
6,7,Associate Data Scientist,TrainTurf Academic Solutions Limited,"Vancouver, BC",We are looking for a Data Scientist to support...,indeed.com,data scientist,associate data scientist
7,8,Data Scientist I,"AMZN CAN Fulfillment Svcs, ULC","Vancouver, BC","Master or PhD in Computer Science, Machine Lea...",indeed.com,data scientist,data scientist
8,9,"Data Scientist, AI@Unity",Unity Technologies,"Vancouver, BC",Data is the foundation of our business in AI @...,indeed.com,data scientist,data scientist ai@unity
9,10,Data Scientist,Providence Health Care,"Vancouver, BC","Reporting to the Technical Manager, Digital Pr...",indeed.com,data scientist,data scientist


In [88]:
# Split each occupation's comma-separated alternate titles and clean every one
occupation_titles = [
    [process_text(alt_title.lower()) for alt_title in alt_titles.split(',')]
    for alt_titles in onet_oc_df['titles'].to_list()
]

In [89]:
# Store the cleaned alternate titles as a new column (one list of titles per occupation)
onet_oc_df['titles_processed'] = occupation_titles

In [90]:
# Preview the occupations table, now including the titles_processed column
onet_oc_df.head(10)

Unnamed: 0,identifier,name,description,titles,titles_processed
0,15-1111.00,Computer and Information Research Scientists,Conduct research into fundamental computer and...,Artificial Intelligence Specialist (AI Special...,[artificial intelligence specialist ai special...
1,19-4061.00,Social Science Research Assistants,"Assist social scientists in laboratory, survey...","Bilingual Research Interviewer,Clinical Resear...","[bilingual research interviewer, clinical rese..."
2,19-2099.01,Remote Sensing Scientists and Technologists,Apply remote sensing principles and methods to...,"All Source Intelligence Analyst,Data Analytics...","[source intelligence analyst, data analytics c..."
3,19-1029.01,Bioinformatics Scientists,Conduct research using bioinformatics theory a...,"Assistant Scientist,Bioinformatician,Bioinform...","[assistant scientist, bioinformatician, bioinf..."
4,15-1199.04,Geospatial Information Scientists and Technolo...,Research or develop geospatial technologies. M...,"Geographic Information Scientist,Geographic In...","[geographic information scientist, geographic ..."
5,19-3022.00,Survey Researchers,"Plan, develop, or conduct surveys. May analyze...","Data Analyst,Data Collection Specialist,Field ...","[data analyst, data collection specialist, fie..."
6,15-2041.00,Statisticians,Develop or apply mathematical or statistical t...,"Analytical Statistician,Applied Scientist,Appl...","[analytical statistician, applied scientist, a..."
7,15-1121.00,Computer Systems Analysts,"Analyze science, engineering, business, and ot...","Applications Analyst,Applications Systems Anal...","[applications analyst, applications systems an..."
8,15-2021.00,Mathematicians,Conduct research in fundamental mathematics or...,"Agent-Based Modeler,Algebraist,Applied Mathema...","[agent based modeler, algebraist, applied math..."
9,15-1133.00,"Software Developers, Systems Software","Research, design, develop, and test operating ...","Applications Analyst,Automation Engineer,Beta ...","[applications analyst, automation engineer, be..."


## Two methods to calculate the similarity
### 1. Calculate job title similarity against all the alternate titles of each ONet occupation, joined together
### 2. Calculate job title similarity against each alternate title of each ONet occupation individually, and keep the title with the maximum similarity score

In [272]:
import itertools

# Pair every job post with every ONet occupation (full cross product)
df1 = title_occupation[['id', 'title_processed']]
df2 = onet_oc_df[['identifier', 'titles_processed']]

df_vals = list(itertools.product(df1.values.tolist(), df2.values.tolist()))
colnames = [*df1.columns, *df2.columns]
# Each pair is (job_row, occupation_row); concatenate the two into one flat record
score_df = pd.DataFrame(
    [job_row + occupation_row for job_row, occupation_row in df_vals],
    columns=colnames,
)
score_df.head(10)

Unnamed: 0,id,title_processed,identifier,titles_processed
0,1,data scientist,15-1111.00,[artificial intelligence specialist ai special...
1,1,data scientist,19-4061.00,"[bilingual research interviewer, clinical rese..."
2,1,data scientist,19-2099.01,"[source intelligence analyst, data analytics c..."
3,1,data scientist,19-1029.01,"[assistant scientist, bioinformatician, bioinf..."
4,1,data scientist,15-1199.04,"[geographic information scientist, geographic ..."
5,1,data scientist,19-3022.00,"[data analyst, data collection specialist, fie..."
6,1,data scientist,15-2041.00,"[analytical statistician, applied scientist, a..."
7,1,data scientist,15-1121.00,"[applications analyst, applications systems an..."
8,1,data scientist,15-2021.00,"[agent based modeler, algebraist, applied math..."
9,1,data scientist,15-1133.00,"[applications analyst, automation engineer, be..."


In [219]:
# Method 1 - with all alternate titles together

# Keep the parsed Docs in plain lists: np.array() treats each Doc as a
# sequence of tokens, so it may unpack equal-length Docs into tokens or
# fail on ragged lengths instead of storing the Doc objects themselves.
nlp_title = [nlp(title) for title in df1['title_processed']]
# All alternate titles of an occupation are joined into a single document
nlp_occupation_title = [nlp(' '.join(titles)) for titles in df2['titles_processed']]

# NOTE(review): en_core_web_sm presumably ships without static word vectors,
# which would make Doc.similarity less reliable - confirm, and consider a
# model that includes vectors.
# Iteration is title-major / occupation-minor, the same order in which the
# cross product for score_df is built from df1 and df2.
scores = np.array([
    title.similarity(occupation_title)
    for title in nlp_title
    for occupation_title in nlp_occupation_title
])

In [274]:
# Add the Method 1 similarity as a new column of the job x occupation table
score_df = score_df.assign(score_all=scores)
score_df.head(10)

Unnamed: 0,id,title_processed,identifier,titles_processed,score_all
0,1,data scientist,15-1111.00,[artificial intelligence specialist ai special...,0.749518
1,1,data scientist,19-4061.00,"[bilingual research interviewer, clinical rese...",0.615939
2,1,data scientist,19-2099.01,"[source intelligence analyst, data analytics c...",0.735846
3,1,data scientist,19-1029.01,"[assistant scientist, bioinformatician, bioinf...",0.712033
4,1,data scientist,15-1199.04,"[geographic information scientist, geographic ...",0.70809
5,1,data scientist,19-3022.00,"[data analyst, data collection specialist, fie...",0.683782
6,1,data scientist,15-2041.00,"[analytical statistician, applied scientist, a...",0.729251
7,1,data scientist,15-1121.00,"[applications analyst, applications systems an...",0.714373
8,1,data scientist,15-2021.00,"[agent based modeler, algebraist, applied math...",0.73329
9,1,data scientist,15-1133.00,"[applications analyst, automation engineer, be...",0.677256


In [275]:
# Keep only the best-scoring occupation for each job post, ordered by job id
score_df = (
    score_df
    .sort_values('score_all', ascending=False)
    .drop_duplicates(['id'])  # the first row left per id carries its highest score
    .sort_values('id')
)
score_df.head(10)

Unnamed: 0,id,title_processed,identifier,titles_processed,score_all
0,1,data scientist,15-1111.00,[artificial intelligence specialist ai special...,0.749518
35,2,business intelligence analyst,15-1199.08,"[analytical data miner, business analyst, busi...",0.87363
57,3,human resources data scientist,15-2041.02,"[clinical applications director, clinical bios...",0.810885
66,4,lead human resource data scientist,15-2041.00,"[analytical statistician, applied scientist, a...",0.828754
82,5,machine learning engineer,19-2099.01,"[source intelligence analyst, data analytics c...",0.811024
100,6,data scientist,15-1111.00,[artificial intelligence specialist ai special...,0.749518
126,7,associate data scientist,15-2041.00,"[analytical statistician, applied scientist, a...",0.84121
140,8,data scientist,15-1111.00,[artificial intelligence specialist ai special...,0.749518
162,9,data scientist ai@unity,19-2099.01,"[source intelligence analyst, data analytics c...",0.723357
180,10,data scientist,15-1111.00,[artificial intelligence specialist ai special...,0.749518


In [282]:
# Look up the occupation name for each job post's best match.
# Distinct names are used instead of rebinding df1/df2, so the inputs of the
# earlier cross-product and similarity cells are not clobbered when those
# cells are re-run.
best_matches = score_df.drop(['titles_processed'], axis=1)
occupation_names = onet_oc_df[['identifier', 'name']].drop_duplicates()

# Left join keeps every job post even if its identifier has no name entry
get_occupation = best_matches.merge(occupation_names, on='identifier', how='left')
get_occupation.head(5)

Unnamed: 0,id,title_processed,identifier,score_all,name
0,1,data scientist,15-1111.00,0.749518,Computer and Information Research Scientists
1,2,business intelligence analyst,15-1199.08,0.87363,Business Intelligence Analysts
2,3,human resources data scientist,15-2041.02,0.810885,Clinical Data Managers
3,4,lead human resource data scientist,15-2041.00,0.828754,Statisticians
4,5,machine learning engineer,19-2099.01,0.811024,Remote Sensing Scientists and Technologists


In [286]:
# Save file
# to_csv does not create missing directories, so make sure the target exists
os.makedirs('results', exist_ok=True)
get_occupation.to_csv('results/title_occupation.csv', index=False)