In [None]:
### DON'T DO "RESTART AND RUN ALL CELLS" ON THIS NOTEBOOK ###
### THERE'S A CELL THAT TAKES VERY LONG TO RUN AND IT ONLY NEEDED TO BE RUN ONCE ###

In [642]:
import pandas as pd
import numpy as np
import requests
import string
import collections
import random
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textaugment import Wordnet

In [534]:
# get dataframes from CSV files

df_occupations = pd.read_csv('../data/ESCO/occupations_en.csv')
df_occupations_aug = pd.read_csv('../data/ESCO/occupations_augmented.csv')


In [678]:
### run this cell & extract skills, alternative labels, and job description from the ESCO API ###
### this cell uses the original occupations dataframe (not augmented) ###

# select a role
role = 'pig breeder'

# get uri for selected role
role_extract = df_occupations[df_occupations['preferredLabel'] == role]
role_uri = role_extract['conceptUri']

# get role information from the ESCO API
url = "https://ec.europa.eu/esco/api/resource/skill"
params = {
    'uri': role_uri,
    'language': 'en'
         }
api_skills = requests.get(url=url, params=params).json()

# this line gets you the whole API url; easier to read on your browser
requests.get(url=url, params=params).url

# gets a list of all skill descriptions and extracts the skill title only
api_skills_dict = api_skills.get('_links').get('hasEssentialSkill')
skills_list = [skill.get('title') for skill in api_skills_dict]

# gets list of alternative labels from the API
alt_labels_list = api_skills.get('alternativeLabel').get('en')

# gets job description from the API
job_description = api_skills.get('description').get('en').get('literal')

print('***** SKILL LIST:', skills_list)
print('***** ALTERNATIVE LABELS:', alt_labels_list)
print('***** JOB DESCRIPTION:', job_description)

***** SKILL LIST: ['manage animal biosecurity', 'animal nutrition', 'monitor the welfare of animals', 'create animal records', 'health and safety regulations', 'manage pig breeding', 'control animal movement', 'feed livestock', 'provide nutrition to animals', 'provide first aid to animals', 'select livestock', 'monitor livestock', 'manage the health and welfare of livestock', 'assist animal birth', 'assist in transportation of animals', 'carry out specialised procedures for pigs', 'livestock reproduction', 'dispose of dead animals', 'livestock feeding', 'maintain professional records', 'operate farm equipment', 'administer treatment to animals', 'care for juvenile animals', 'manage livestock', 'maintain animal accommodation', 'livestock species', 'manage animal hygiene', 'signs of animal illness', 'administer specific drugs to facilitate breeding', 'animal welfare legislation']
***** ALTERNATIVE LABELS: ['pig breeders', 'pig specialist', 'pig rearer']
***** JOB DESCRIPTION: Pig breeder

In [539]:
# get training and testin sets, as well as the entire dataframe

X_all = df_occupations_aug['description'][0:35824]
X_train = X_all[0:20000]
X_test = X_all[20000:35824]

In [28]:
# Preprocessing function for job descriptions

def preprocessing(sentence):
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    # split into tokens again after Lemmatizing
    sentence = word_tokenize(sentence)
    
    return sentence

In [542]:
# applying preprocessing as in Gensim tutorial

def read_corpus(corpus, tokens_only=False):
    for i, line in enumerate(corpus):
        tokens = preprocessing(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield TaggedDocument(tokens, [i])

all_corpus = list(read_corpus(X_all))
# train_corpus = list(read_corpus(X_train))
# test_corpus = list(read_corpus(X_test, tokens_only=True))

In [665]:
# initialize 'job2vec' model

job2vec_model = Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
    window=10,
    alpha=0.01)

job2vec_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x1482fa4f0>

In [666]:
# build a vocabulary

job2vec_model.build_vocab(all_corpus)

In [667]:
# train model!
job2vec_model.train(
    all_corpus,
    total_examples=job2vec_model.corpus_count,
    epochs=job2vec_model.epochs,
)

In [675]:
# describe your job and get your prediction!

describe_your_job = \
"Coordinated design and setting up of technical solutions for temporary water and wastewater installations"



In [676]:
# preprocess your job description
new_description = preprocessing(describe_your_job)

# create inferred vector from your preprocessed job description
new_inferred_vector = job2vec_model.infer_vector(new_description)

# get similar job descriptions from model
similar_to_new = job2vec_model.dv.most_similar([new_inferred_vector], topn=len(job2vec_model.dv))

In [677]:
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(new_description)))
print('LISTING 3 MOST SIMILAR JOB ROLES & DESCRIPTIONS \n')
for label, index in [('* MOST SIMILAR', 0), ('* 2ND MOST SIMILAR', 1), ('* 3RD MOST SIMILAR', 2)]:
    print(label + ': ' + df_occupations_aug.loc[similar_to_new[index][0]]['preferredLabel'])
    print(df_occupations_aug.loc[similar_to_new[index][0]]['description'])
    print(f'Similarity score: {round(similar_to_new[index][1]*100,1)} % \n')


Test Document (695): «coordinated design setting technical solutions temporary water wastewater installations»

LISTING 3 MOST SIMILAR JOB ROLES & DESCRIPTIONS 

* MOST SIMILAR: chemical tester
chemical quizzer are responsible for the rapid on-the-spot psychoanalysis of steel test man incoming from the alloy production store for the role of timely chastening of the chemical composing of the liquidness metal.
Similarity score: 63.9 % 

* 2ND MOST SIMILAR: veterinary scientist
Veterinary scientist develop and do research in animal models, compare basic biology across animals, and translate research findings to different species, including humans.
Similarity score: 61.4 % 

* 3RD MOST SIMILAR: veterinary researcher
veterinary scientist develop and do research in animal models, compare basic biology across animals, and translate research finding to different species, including humans.
Similarity score: 60.9 % 



In [660]:
# now we will:
# 1. take the jobs we got from the previous model run,
# 2. pass them through the ESCO API,
# 3. extract the skills they're associated with, 
# 4. append those skills to the input job description ("describe_your_job")
# 5. preprocess and run the model again!

url = "https://ec.europa.eu/esco/api/resource/skill"

job_description_w_skills = describe_your_job

for index in range(3):
    
    # get URI of each of the suggested jobs
    suggested_job_uri = df_occupations_aug.loc[similar_to_new[index][0]]['conceptUri']
    
    # pass them through ESCO API
    params = {'uri': suggested_job_uri, 'language': 'en'}
    api_skills = requests.get(url=url, params=params).json()
    time.sleep(1) # keeping the API happy :)
    
    # gets a list of all skill descriptions and extracts the skill title only
    api_skills_dict = api_skills.get('_links').get('hasEssentialSkill')
    skills_list = [skill.get('title') for skill in api_skills_dict]

    # appends skills to the job description you gave us :)
    job_description_w_skills = job_description_w_skills + ' ' + ' '.join(skills_list)
    
# preprocess & infer vector again and get new similar jobs
new_description_w_skills = preprocessing(job_description_w_skills)
new_inferred_vector_w_skills = job2vec_model.infer_vector(new_description_w_skills)
similar_to_new_w_skills = job2vec_model.dv.most_similar([new_inferred_vector_w_skills], topn=len(job2vec_model.dv))

# run model again
print('LISTING 3 MOST SIMILAR JOB ROLES & DESCRIPTIONS -- NOW WITH ESCO SKILLS \n')
for label, index in [('* MOST SIMILAR', 0), ('* 2ND MOST SIMILAR', 1), ('* 3RD MOST SIMILAR', 2)]:
    print(label + ': ' + df_occupations_aug.loc[similar_to_new_w_skills[index][0]]['preferredLabel'])
    print(df_occupations_aug.loc[similar_to_new_w_skills[index][0]]['description'])
    print(f'Similarity score: {round(similar_to_new_w_skills[index][1]*100,1)} % \n')

LISTING 3 MOST SIMILAR JOB ROLES & DESCRIPTIONS -- NOW WITH ESCO SKILLS 

* MOST SIMILAR: mining hydrologist
mining geotechnical engineers in mining perform engineering, hydrological and geological test and psychoanalysis to improve the refuge and efficiency of mineral operations. they oversee the assemblage of sampling and the taking of measurement using geotechnical investigation method and techniques. they model the mechanical behavior of the rock passel and contribute to the design of the mine geometry.
Similarity score: 55.2 % 

* 2ND MOST SIMILAR: engine design technologist
engine architect carry out engineering responsibility in designing mechanical equipment such as auto and all case of engines. they also supervise their facility and maintenance.
Similarity score: 49.1 % 

* 3RD MOST SIMILAR: nuclear engineers
nuclear engineer plan and invention engineering equipment and summons in nuclear plant and sites. they engage in engineering activities pertaining to nuclear index plants