In [1]:
### DON'T DO "RESTART AND RUN ALL CELLS" ON THIS NOTEBOOK ###
### THERE'S A CELL THAT TAKES VERY LONG TO RUN AND IT ONLY NEEDED TO BE RUN ONCE ###

In [2]:
import pandas as pd
import numpy as np
import requests
import string
import collections
import random
import time
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textaugment import Wordnet

In [3]:
# get dataframes from CSV files

df_occupations = pd.read_csv('../data/ESCO/occupations_en.csv')
df_occupations_aug = pd.read_csv('../data/ESCO/occupations_augmented.csv')
df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_skills.csv')

  df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_skills.csv')


In [4]:
df_occ_n_skills = df_occ_n_skills.filter(items=['preferredLabel', 'description', 'skills'])
df_occ_n_skills = df_occ_n_skills.reindex(columns=['preferredLabel','description', 'skills'])
df_occ_n_skills.rename(columns={'preferredLabel': 'job_title'}, inplace=True)
df_occ_n_skills['description_input'] = 0
df_occ_n_skills['skills_input'] = 0
df_occ_n_skills.head()

Unnamed: 0,job_title,description,skills,description_input,skills_input
0,technical director,Technical directors realise the artistic visio...,"adapt to artists' creative demands, organise r...",0,0
1,metal drawing machine operator,Metal drawing machine operators set up and ope...,"cold drawing processes, monitor moving workpie...",0,0
2,precision device inspector,Precision device inspectors make sure precisio...,"precision measuring instruments, monitor machi...",0,0
3,air traffic safety technician,Air traffic safety technicians provide technic...,"air transport law, aircraft flight control sys...",0,0
4,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,"develop revenue generation strategies, ensure ...",0,0


In [5]:
df_occ_n_skills.iloc[0,1]

'Technical directors realise the artistic visions of the creators within technical constraints. They coordinate the operations of various production units, such as scene, wardrobe, sound and lighting, and make-up. They adapt the prototype and study the feasibility, implementation, operation and technical monitoring of the artistic project. They are also responsible for the stage equipment and technical equipment.'

In [6]:
# create description_input and skills_input, which are strings on which the model will be fit

for row, index in tqdm(df_occ_n_skills.iterrows()):
    underscored_job_title = index['job_title'].replace(" ", "_")
    this_rows_description_input = underscored_job_title + ' ' + index['description']
    this_rows_skills_input = underscored_job_title + ' ' + index['skills']
    df_occ_n_skills.iloc[row,-2] = this_rows_description_input
    df_occ_n_skills.iloc[row,-1] = this_rows_skills_input

35824it [00:14, 2533.32it/s]


In [7]:
df_occ_n_skills.head()

Unnamed: 0,job_title,description,skills,description_input,skills_input
0,technical director,Technical directors realise the artistic visio...,"adapt to artists' creative demands, organise r...",technical_director Technical directors realise...,technical_director adapt to artists' creative ...
1,metal drawing machine operator,Metal drawing machine operators set up and ope...,"cold drawing processes, monitor moving workpie...",metal_drawing_machine_operator Metal drawing m...,metal_drawing_machine_operator cold drawing pr...
2,precision device inspector,Precision device inspectors make sure precisio...,"precision measuring instruments, monitor machi...",precision_device_inspector Precision device in...,precision_device_inspector precision measuring...
3,air traffic safety technician,Air traffic safety technicians provide technic...,"air transport law, aircraft flight control sys...",air_traffic_safety_technician Air traffic safe...,air_traffic_safety_technician air transport la...
4,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,"develop revenue generation strategies, ensure ...",hospitality_revenue_manager Hospitality revenu...,hospitality_revenue_manager develop revenue ge...


In [8]:
# get entire dataframe as dataset according to Qiewi's suggestion
# concatenate the job_title:skills list to the end of df_occ_n_skills

X_all = pd.concat([df_occ_n_skills['description_input'], df_occ_n_skills['skills_input']]).reset_index(drop=True)

In [9]:
### run this cell & extract skills, alternative labels, and job description from the ESCO API ###
### this cell uses the original occupations dataframe (not augmented) ###

# select a role
role = 'pig breeder'

# get uri for selected role
role_extract = df_occupations[df_occupations['preferredLabel'] == role]
role_uri = role_extract['conceptUri']

# get role information from the ESCO API
url = "https://ec.europa.eu/esco/api/resource/skill"
params = {
    'uri': role_uri,
    'language': 'en'
         }
api_skills = requests.get(url=url, params=params).json()

# this line gets you the whole API url; easier to read on your browser
requests.get(url=url, params=params).url

# gets a list of all skill descriptions and extracts the skill title only
api_skills_dict = api_skills.get('_links').get('hasEssentialSkill')
skills_list = [skill.get('title') for skill in api_skills_dict]

# gets list of alternative labels from the API
alt_labels_list = api_skills.get('alternativeLabel').get('en')

# gets job description from the API
job_description = api_skills.get('description').get('en').get('literal')

print('***** SKILL LIST:', skills_list)
print('***** ALTERNATIVE LABELS:', alt_labels_list)
print('***** JOB DESCRIPTION:', job_description)

***** SKILL LIST: ['maintain professional records', 'feed livestock', 'operate farm equipment', 'livestock feeding', 'provide first aid to animals', 'animal welfare legislation', 'administer treatment to animals', 'health and safety regulations', 'signs of animal illness', 'provide nutrition to animals', 'livestock species', 'manage the health and welfare of livestock', 'animal nutrition', 'manage livestock', 'dispose of dead animals', 'assist in transportation of animals', 'manage pig breeding', 'monitor the welfare of animals', 'carry out specialised procedures for pigs', 'maintain animal accommodation', 'livestock reproduction', 'administer specific drugs to facilitate breeding', 'create animal records', 'select livestock', 'care for juvenile animals', 'control animal movement', 'manage animal hygiene', 'manage animal biosecurity', 'monitor livestock', 'assist animal birth']
***** ALTERNATIVE LABELS: ['pig breeders', 'pig specialist', 'pig rearer']
***** JOB DESCRIPTION: Pig breeder

In [10]:
# Preprocessing function for job descriptions

def preprocessing(sentence):
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    # split into tokens again after Lemmatizing
    sentence = word_tokenize(sentence)
    
    return sentence

In [11]:
# applying preprocessing as in Gensim tutorial

def read_corpus(corpus, tokens_only=False):
    for i, line in enumerate(corpus):
        tokens = preprocessing(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield TaggedDocument(tokens, [i])

all_corpus = list(read_corpus(X_all))

In [40]:
# initialize 'job2vec' model

job2vec_model = Doc2Vec(
  
    vector_size=70,
    min_count=5,
    epochs=75,
    window=20,
    start_alpha = 0.01
    alpha=0.025,
    dm_concat=1,
    dbow_words = 1,
    shrink_windows=True)

job2vec_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7fa9a83ca070>

In [41]:
# build vocabulary

job2vec_model.build_vocab(all_corpus)

In [42]:
# train model!

job2vec_model.train(
    all_corpus,
    total_examples=job2vec_model.corpus_count,
    epochs=job2vec_model.epochs
    )

In [43]:
# describe your job and get your prediction!

describe_your_job = \
"realise the artistic visions of the creators within technical constraints. They coordinate the operations of various production units, such as scene, wardrobe, sound and lighting, and make-up. They adapt the prototype and study the feasibility, implementation, operation and technical monitoring of the artistic project. They are also responsible for the stage equipment and technical equipment."


In [44]:
# preprocess your job description
new_description = preprocessing(describe_your_job)

# create inferred vector from your preprocessed job description
new_inferred_vector = job2vec_model.infer_vector(new_description)

# get similar job descriptions from model
similar_to_new = job2vec_model.dv.most_similar([new_inferred_vector])

In [49]:
print(f"Test Document: {' '.join(new_description)} \n")
print('LISTING 3 MOST SIMILAR JOB ROLES & DESCRIPTIONS \n')

for label, index in [('* MOST SIMILAR', 0), ('* 2ND MOST SIMILAR', 1), ('* 3RD MOST SIMILAR', 2)]:

    
    if similar_to_new[index][0] < len(df_occ_n_skills):
        new_index = similar_to_new[index][0]
    else:
        new_index = similar_to_new[index][0] - len(df_occ_n_skills)

    print(label + ': ' + df_occ_n_skills.loc[new_index]['job_title'])
    print(df_occ_n_skills.loc[new_index]['description'])
    print(f'Similarity score: {round(similar_to_new[index][1]*100,1)} % \n')


Test Document: realise artistic visions creators within technical constraints coordinate operations various production units scene wardrobe sound lighting makeup adapt prototype study feasibility implementation operation technical monitoring artistic project also responsible stage equipment technical equipment 

LISTING 3 MOST SIMILAR JOB ROLES & DESCRIPTIONS 

* MOST SIMILAR: contact lens specialist
optician help to improve and correct an individual's vision. they fit spectacle lense and frames, contact lenses, and other gimmick according to the specification of the individual. their range of recitation varies according to national regulating and they might operate according to prescription provided by a specialised doctor in ophthalmology or an optometrist in the countries where requested.
Similarity score: 47.6 % 

* 2ND MOST SIMILAR: stenographer
court newsman type in bible processor or any other package each one of the word mentioned in the courtroom. they transcript the hearing t

In [None]:
similar_to_new