In [26]:
import numpy as np
import pandas as pd
import sqlite3
import spacy

In [27]:
# Download pretrained enlgish model
try:
    import en_core_web_sm
except:
    !python -m spacy download en_core_web_sm
    import en_core_web_sm

In [28]:
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("collectors/data.sqlite3")
try:
    job_df = pd.read_sql_query("SELECT * from job_post", con)
finally:
    # Release the database handle even when the query raises
    con.close()

In [29]:
# Verify that result of SQL query is stored in the dataframe
job_df.head()

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist


In [30]:
# Load the ONet competency table; the first CSV column becomes the row index
onet_competencies = 'datasets/competencies.csv'
onet_df = pd.read_csv(onet_competencies, index_col=0)

# Preview the first rows belonging to a single occupation
onet_df.loc[
    onet_df['occupation'].eq('Computer and Information Research Scientists')
].head(5)

Unnamed: 0,occupation,competency,category,description
0,Computer and Information Research Scientists,Source code management SCM software,Technology Skills,Development environment software
1,Computer and Information Research Scientists,Microsoft Azure,Technology Skills,Development environment software
2,Computer and Information Research Scientists,Visualization,Abilities,The ability to imagine how something will look...
3,Computer and Information Research Scientists,Free-field speakers,Tools Used,Loudspeakers
4,Computer and Information Research Scientists,Data visualization software,Technology Skills,Analytical or scientific software


In [31]:
def process_text(text):
    """Remove stop words, punctuation and pronoun lemmas from ``text``.

    The text is tokenised with the notebook-level ``nlp`` pipeline.  A token
    is discarded when its text is a member of ``nlp.Defaults.stop_words``,
    when it is flagged as punctuation, or when its lemma is the ``'-PRON-'``
    placeholder.  Kept tokens contribute their surface text (not the lemma).

    Args:
        text: The string to clean.

    Returns:
        The kept token texts joined by single spaces.
    """
    stop_words = nlp.Defaults.stop_words
    kept = [
        token.text
        for token in nlp(text)
        if not (
            token.text in stop_words
            or token.is_punct
            or token.lemma_ == '-PRON-'
        )
    ]
    return " ".join(kept)

In [32]:
# Instantiate the spaCy pipeline from the installed en_core_web_sm package
nlp = en_core_web_sm.load()
# Register a bare newline as a stop word so process_text() drops "\n" tokens
nlp.Defaults.stop_words.add("\n")

In [33]:
# Pull the competency names and their descriptions out of the ONet frame
# as NumPy arrays (same row order as onet_df)
onet_comp = np.array(onet_df['competency'])
onet_desc = np.array(onet_df['description'])

In [34]:
# Lower-case every entry and clean it with process_text().
# NOTE(review): .lower() assumes every entry is a string (no missing values) - confirm
onet_comp = np.array([process_text(comp.lower()) for comp in onet_comp])
onet_desc = np.array([process_text(desc.lower()) for desc in onet_desc])

In [35]:
# Restrict the ONet table to the competency categories of interest
category_list = ['Task Statements', 'Work Activities']

print("ONET length before filter: ", onet_df.shape[0])
filtered_onet_df = onet_df[onet_df['category'].isin(category_list)]
print("ONET length after filter:  ", filtered_onet_df.shape[0])

# Preview the rows that survived the filter
filtered_onet_df.head(5)

ONET length before filter:  4938
ONET length after filter:   817


Unnamed: 0,occupation,competency,category,description
9,Computer and Information Research Scientists,Evaluate project plans and proposals to assess...,Task Statements,Core
10,Computer and Information Research Scientists,Processing Information,Work Activities,"Compiling, coding, categorizing, calculating, ..."
15,Computer and Information Research Scientists,Estimating the Quantifiable Characteristics of...,Work Activities,"Estimating sizes, distances, and quantities; o..."
26,Computer and Information Research Scientists,Documenting/Recording Information,Work Activities,"Entering, transcribing, recording, storing, or..."
38,Computer and Information Research Scientists,"Identifying Objects, Actions, and Events",Work Activities,"Identifying information by categorizing, estim..."


In [36]:
# Load the saved title/occupation results and attach the job descriptions
occupations = pd.read_csv('results/title_occupation.csv')
# Column assignment aligns on index labels.
# NOTE(review): presumes the CSV rows line up one-to-one with job_df - confirm
occupations['description'] = job_df['description']
occupations.head(5)

Unnamed: 0,id,title,identifier,score_all,name,description
0,1,Data Scientist,15-1111.00,0.883895,Computer and Information Research Scientists,Do you want a meaningful role in a company tha...
1,2,Business Intelligence Analyst,15-1199.08,0.866399,Business Intelligence Analysts,Brand: Glentel Corporate\nLocation: Burnaby Of...
2,3,Human Resources Data Scientist,15-1111.00,0.737225,Computer and Information Research Scientists,2 x newly created Data Scientist opportunities...
3,4,Lead - Human Resource Data Scientist,15-1111.00,0.738833,Computer and Information Research Scientists,Newly created data science lead embedded withi...
4,5,Machine Learning Engineer,15-1111.00,0.847708,Computer and Information Research Scientists,"Who We are\nFounded in 2016, Skycope Technolog..."


In [37]:
# Job description texts as a NumPy array, in the row order of `occupations`
job_desc = occupations['description'].values

In [38]:
# Lower-case every description.
# NOTE(review): .lower() assumes every description is a string (no missing values) - confirm
job_desc = np.array([desc.lower() for desc in job_desc])

In [39]:
# Number of sentences spaCy finds in each description; every description is
# run through the full `nlp` pipeline here, once per job
job_lengths = np.array([len(list(nlp(str(desc)).sents)) for desc in job_desc])

In [40]:
job_lengths[:15]

array([ 28,  32, 228, 236,  30,  20,  22,  21,  25,  46,  32,  48,  35,
        38,  23])

In [41]:
# Keep only the jobs whose description has fewer sentences than the threshold.
# Despite its name, MAX_SENTENCE_LENGTH bounds the per-job sentence *count*
# stored in job_lengths; job_ids holds the positions of the jobs that pass.
MAX_SENTENCE_LENGTH = 40
job_ids = np.flatnonzero(job_lengths < MAX_SENTENCE_LENGTH)

In [42]:
job_ids[:10]

array([ 0,  1,  4,  5,  6,  7,  8, 10, 12, 13])

In [43]:
# Number of filtered jobs to process in this run
FIRST_N = 50

# Descriptions of the first FIRST_N jobs that passed the sentence-count filter
small_job_desc = job_desc[job_ids][:FIRST_N]

# Split every description into sentences and clean each sentence with
# process_text(); any newline characters left in the result are removed.
_sentence_arrays = [
    np.array([process_text(line.text).replace("\n", '')
              for line in nlp(str(desc)).sents])
    for desc in small_job_desc
]

# Jobs have different numbers of sentences, so the per-job arrays are ragged.
# NumPy >= 1.24 refuses to build an ndarray from ragged sequences implicitly,
# so fill a 1-D object array explicitly: one array of sentences per job.
small_job_desc = np.empty(len(_sentence_arrays), dtype=object)
for _k, _arr in enumerate(_sentence_arrays):
    small_job_desc[_k] = _arr

In [44]:
small_job_desc[0][:5]

array(['want meaningful role company making difference world',
       'want involved important environmental resource areas today',
       'want learn involved developing deploying machine learning predictive analytics solutions colleagues years research development experience',
       'join energetic growing team help revolutionize industry',
       'company founded 2003 aquatic informatics provides software solutions address critical water data management analytics compliance challenges rapidly growing water industry'],
      dtype='<U368')

In [45]:
# ***************************************************************
# BEWARE: EXTREMELY TIME CONSUMING
# TRY FOR SMALL NUMBER OF JOBS
# IMPRACTICAL FOR A LARGE SET OF JOBS SIMULTANEOUSLY
# PRACTICAL IMPLICATION: PERFORM THIS WHENEVER A NEW JOB IS ADDED
# ***************************************************************
#
# For every sentence of every selected job, find the ONet competency of the
# job's occupation that is most similar to the sentence.  A competency's
# similarity is the larger of (sentence vs. competency name) and
# (sentence vs. competency description).
#
# Results:
#   scores[i][j]   - best similarity found for sentence j of job i
#   comp_ids[i][j] - id of the competency that achieved that score
#
# NOTE(review): competency ids are index labels of filtered_onet_df that are
# also used as positions into onet_comp / onet_desc; this presumes onet_df
# has a 0..n-1 index - confirm.
# NOTE(review): en_core_web_sm ships without static word vectors, so
# Doc.similarity may be of limited quality - consider a md/lg model.

scores = []
comp_ids = []

# Parsed (name_doc, description_doc) per competency id.  Parsing once and
# reusing the docs avoids re-running the pipeline for every sentence.
_onet_docs = {}

for i, desc in enumerate(small_job_desc):
    # small_job_desc[i] was built from row job_ids[i] of the job table, so the
    # occupation has to be read from that row - not from row i.
    title = occupations['name'].iloc[job_ids[i]]
    idss = filtered_onet_df.index[filtered_onet_df['occupation'] == title].tolist()
    sent_score = []
    sent_comp_id = []
    print("COMPETENCIES {}, SENTENCES {}".format(len(idss), len(desc)))
    print("------------------------------")

    if not idss:
        # No competency is listed for this occupation: np.max() on an empty
        # list would raise, so record empty results for the job and move on.
        scores.append(sent_score)
        comp_ids.append(sent_comp_id)
        continue

    for ids in idss:
        if ids not in _onet_docs:
            _onet_docs[ids] = (nlp(str(onet_comp[ids])),
                               nlp(str(onet_desc[ids])))

    for j, sentence in enumerate(desc):
        sentence = nlp(str(sentence))
        temp_score = []
        for ids in idss:
            comp_doc, desc_doc = _onet_docs[ids]
            sim1 = sentence.similarity(comp_doc)
            sim2 = sentence.similarity(desc_doc)
            temp_score.append(max(sim1, sim2))

        # Keep only the best-matching competency for this sentence
        max_score = np.max(temp_score)
        max_comp_id = idss[np.argmax(temp_score)]
        sent_score.append(max_score)
        sent_comp_id.append(max_comp_id)
        print("JOB {}, SENTENCE {} DONE".format(i, j))

    scores.append(sent_score)
    comp_ids.append(sent_comp_id)

ETENCIES 37, SENTENCES 25
------------------------------
JOB 21, SENTENCE 0 DONE
JOB 21, SENTENCE 1 DONE
JOB 21, SENTENCE 2 DONE
JOB 21, SENTENCE 3 DONE
JOB 21, SENTENCE 4 DONE
JOB 21, SENTENCE 5 DONE
JOB 21, SENTENCE 6 DONE
JOB 21, SENTENCE 7 DONE
JOB 21, SENTENCE 8 DONE
JOB 21, SENTENCE 9 DONE
JOB 21, SENTENCE 10 DONE
JOB 21, SENTENCE 11 DONE
JOB 21, SENTENCE 12 DONE
JOB 21, SENTENCE 13 DONE
JOB 21, SENTENCE 14 DONE
JOB 21, SENTENCE 15 DONE
JOB 21, SENTENCE 16 DONE
JOB 21, SENTENCE 17 DONE
JOB 21, SENTENCE 18 DONE
JOB 21, SENTENCE 19 DONE
JOB 21, SENTENCE 20 DONE
JOB 21, SENTENCE 21 DONE
JOB 21, SENTENCE 22 DONE
JOB 21, SENTENCE 23 DONE
JOB 21, SENTENCE 24 DONE
COMPETENCIES 37, SENTENCES 12
------------------------------
JOB 22, SENTENCE 0 DONE
JOB 22, SENTENCE 1 DONE
JOB 22, SENTENCE 2 DONE
JOB 22, SENTENCE 3 DONE
JOB 22, SENTENCE 4 DONE
JOB 22, SENTENCE 5 DONE
JOB 22, SENTENCE 6 DONE
JOB 22, SENTENCE 7 DONE
JOB 22, SENTENCE 8 DONE
JOB 22, SENTENCE 9 DONE
JOB 22, SENTENCE 10 DONE
JO

In [46]:
# For each job, rank the competencies matched to its sentences by score
# (highest first), keep each competency once, and describe at most the top 20
# as "category | competency | description" strings.
competency = []
for job_scores, job_comp_ids in zip(scores, comp_ids):
    ranked = sorted(zip(job_scores, job_comp_ids), reverse=True)

    # Walk the ranking and keep the first (best-scoring) hit per competency
    seen = set()
    best_ids = []
    for _, comp_id in ranked:
        if comp_id in seen:
            continue
        seen.add(comp_id)
        best_ids.append(comp_id)
    best_ids = best_ids[:20]

    competency.append([
        onet_df['category'].iloc[comp_id] + " | " +
        onet_df['competency'].iloc[comp_id] + " | " +
        onet_df['description'].iloc[comp_id]
        for comp_id in best_ids
    ])

In [47]:
# Rows of `occupations` for the jobs that were scored.  Take an explicit copy
# so that adding a column to result_df modifies an independent frame rather
# than a slice of `occupations` (avoids pandas' SettingWithCopyWarning).
result_df = occupations.iloc[job_ids[:FIRST_N]].copy()

In [48]:
# Attach the per-job competency lists as a new column; `competency` must hold
# exactly one list per row of result_df
result_df['onet_competency'] = competency

In [49]:
result_df.head()

Unnamed: 0,id,title,identifier,score_all,name,description,onet_competency
0,1,Data Scientist,15-1111.00,0.883895,Computer and Information Research Scientists,Do you want a meaningful role in a company tha...,[Work Activities | Communicating with Persons ...
1,2,Business Intelligence Analyst,15-1199.08,0.866399,Business Intelligence Analysts,Brand: Glentel Corporate\nLocation: Burnaby Of...,"[Work Activities | Monitor Processes, Material..."
4,5,Machine Learning Engineer,15-1111.00,0.847708,Computer and Information Research Scientists,"Who We are\nFounded in 2016, Skycope Technolog...",[Work Activities | Communicating with Persons ...
5,6,Data Scientist,15-1111.00,0.846623,Computer and Information Research Scientists,BrainStation is a global leader in digital ski...,[Work Activities | Interacting With Computers ...
6,7,Associate Data Scientist,15-1111.00,0.872069,Computer and Information Research Scientists,We are looking for a Data Scientist to support...,[Work Activities | Interacting With Computers ...


In [50]:
# Save the per-job competency results as CSV, without the DataFrame index.
# NOTE(review): the file name is spelled 'responsibilites' (sic) - confirm what
# downstream readers expect before renaming it.
result_df.to_csv('results/responsibilites.csv', index=False)