In [64]:
import pandas as pd
import numpy as np
import requests
import string
import collections
import random
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [2]:
df_occupations = pd.read_csv('../data/ESCO/occupations_en.csv')

In [3]:
# search for all jobs that have 'manager' in their title  --- IGNORE this cell for now
url_title = "https://ec.europa.eu/esco/api/search"

params = {
    'language': 'en',
    'type': 'occupation',
    'text': 'pig'
         }

api_title = requests.get(url=url_title, params=params).json()

In [4]:
# select a role
role = 'cattle breeder'

In [5]:
### run this cell & extract skills, alternative labels, and job description from the ESCO API ###

# get uri for selected role
role_extract = df_occupations[df_occupations['preferredLabel'] == role]
role_uri = role_extract['conceptUri']

# get role information from the ESCO API
url = "https://ec.europa.eu/esco/api/resource/skill"
params = {
    'uri': role_uri,
    'language': 'en',
         }
api_skills = requests.get(url=url, params=params).json()

# this line gets you the whole API url; easier to read on your browser
requests.get(url=url, params=params).url

# gets a list of all skill descriptions and extracts the skill title only
api_skills_dict = api_skills.get('_links').get('hasEssentialSkill')
skills_list = [skill.get('title') for skill in api_skills_dict]

# gets list of alternative labels from the API
alt_labels_list = api_skills.get('alternativeLabel').get('en')

# gets job description from the API
job_description = api_skills.get('description').get('en').get('literal')

print('***** SKILL LIST:', skills_list)
print('***** ALTERNATIVE LABELS:', alt_labels_list)
print('***** JOB DESCRIPTION:', job_description)

***** SKILL LIST: ['monitor the welfare of animals', 'administer specific drugs to facilitate breeding', 'care for juvenile animals', 'operate farm equipment', 'provide first aid to animals', 'animal nutrition', 'signs of animal illness', 'feed livestock', 'manage animal hygiene', 'maintain professional records', 'perform milk control', 'maintain animal accommodation', 'select livestock', 'manage livestock', 'create animal records', 'assist in transportation of animals', 'manage the health and welfare of livestock', 'dispose of dead animals', 'control animal movement', 'monitor livestock', 'manage cattle breeding', 'livestock reproduction', 'provide nutrition to animals', 'milk animals', 'health and safety regulations', 'administer treatment to animals', 'manage animal biosecurity', 'animal welfare legislation', 'assist animal birth', 'livestock species']
***** ALTERNATIVE LABELS: ['cattle specialist', 'cattle breeders', 'cattle rearer']
***** JOB DESCRIPTION: Cattle breeders oversee t

In [35]:
# get an training set with 2500 out of the whole 3006 rows 

X_train = df_occupations['description'][0:2500]
X_test = df_occupations['description'][2500:3006]

In [28]:
# Preprocessing function for job descriptions

def preprocessing(sentence):
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    # split into tokens again after Lemmatizing
    sentence = word_tokenize(sentence)
    
    return sentence

In [33]:
# applying preprocessing as in Gensim tutorial

def read_corpus(corpus, tokens_only=False):
    for i, line in enumerate(corpus):
        tokens = preprocessing(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(X_train))
test_corpus = list(read_corpus(X_test, tokens_only=True))

In [36]:
# initialize 'job2vec' model

job2vec_model = Doc2Vec(vector_size=50, min_count=2, epochs=40)

job2vec_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x1361dd7f0>

In [38]:
# build a vocabulary
job2vec_model.build_vocab(train_corpus)

In [39]:
job2vec_model.train(train_corpus, total_examples=job2vec_model.corpus_count, epochs=job2vec_model.epochs)

In [51]:
# testing_processed_descriptions = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_test_proc)]
inferred_vector = job2vec_model.infer_vector(test_corpus[0])

In [52]:
inferred_vector

array([-0.20913865,  0.14463197,  0.10529788,  0.27894777, -0.16791576,
        0.01759879, -0.50824565,  0.23740382,  0.00536443, -0.1635939 ,
       -0.23653965, -0.473574  , -0.3318227 ,  0.25927296,  0.03755832,
       -0.24730153, -0.35329896, -0.27647915,  0.33433697,  0.3072264 ,
        0.09328657,  0.29518372,  0.04345774,  0.1269041 , -0.25285158,
       -0.15895481, -0.21856923, -0.38978544, -0.24860829, -0.100328  ,
        0.19492848, -0.43576676,  0.20400758,  0.19763418,  0.19214346,
        0.07220768, -0.0890172 ,  0.02142998,  0.2988561 ,  0.02141867,
        0.33612537, -0.10444368, -0.01854241,  0.3629613 , -0.39273143,
        0.04501669, -0.00986005, -0.46097463,  0.20070037, -0.0220908 ],
      dtype=float32)

In [62]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = job2vec_model.infer_vector(train_corpus[doc_id].words)
    sims = job2vec_model.dv.most_similar([inferred_vector], topn=len(job2vec_model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [63]:
counter = collections.Counter(ranks)
print(counter)

Counter({0: 2322, 1: 47, 2: 15, 3: 7, 4: 7, 13: 4, 45: 4, 9: 3, 11: 3, 20: 3, 12: 3, 37: 3, 22: 3, 7: 3, 72: 3, 5: 3, 10: 3, 26: 2, 6: 2, 113: 2, 55: 2, 59: 2, 23: 2, 25: 2, 31: 2, 15: 2, 94: 2, 213: 2, 41: 1, 14: 1, 147: 1, 18: 1, 19: 1, 40: 1, 79: 1, 109: 1, 64: 1, 62: 1, 93: 1, 67: 1, 33: 1, 121: 1, 246: 1, 416: 1, 87: 1, 50: 1, 29: 1, 68: 1, 42: 1, 153: 1, 32: 1, 56: 1, 17: 1, 139: 1, 30: 1, 39: 1, 8: 1, 81: 1, 66: 1, 36: 1, 60: 1, 122: 1, 35: 1, 82: 1, 24: 1, 125: 1, 166: 1, 124: 1, 21: 1, 53: 1})


In [77]:
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (1836): «language school teachers educate nonagespecific students language native language specialised school bound level education focus less academic aspect language teaching opposed language teachers secondary higher education instead theory practice helpful students reallife situations since choose instruction either business immigration leisure reasons organise classes using variety lesson materials work interactively group assess evaluate individual progress assignments examinations putting emphasis active language skills writing speaking»

Similar Document (1106, 0.8223347067832947): «sign language school teachers educate nonagespecific students sign language teach sign language students without special educational needs deafness organise classes using variety lesson materials work interactively group assess evaluate individual progress assignments examinations»



In [76]:
train_corpus

[TaggedDocument(words=['technical', 'directors', 'realise', 'artistic', 'visions', 'creators', 'within', 'technical', 'constraints', 'coordinate', 'operations', 'various', 'production', 'units', 'scene', 'wardrobe', 'sound', 'lighting', 'makeup', 'adapt', 'prototype', 'study', 'feasibility', 'implementation', 'operation', 'technical', 'monitoring', 'artistic', 'project', 'also', 'responsible', 'stage', 'equipment', 'technical', 'equipment'], tags=[0]),
 TaggedDocument(words=['metal', 'drawing', 'machine', 'operators', 'set', 'operate', 'drawing', 'machines', 'ferrous', 'nonferrous', 'metal', 'products', 'designed', 'provide', 'wires', 'bars', 'pipes', 'hollow', 'profiles', 'tubes', 'specific', 'form', 'reducing', 'crosssection', 'pulling', 'working', 'materials', 'series', 'drawing', 'dies'], tags=[1]),
 TaggedDocument(words=['precision', 'device', 'inspectors', 'make', 'sure', 'precision', 'devices', 'micrometers', 'gauges', 'operate', 'according', 'design', 'specifications', 'may', '

In [93]:
text = 'Responsible for team leadership, planning, works coordination, invoicing, procurement, contract management, tendering and cost control'
text_p = preprocessing(text)

In [94]:
text_p

['responsible',
 'team',
 'leadership',
 'planning',
 'works',
 'coordination',
 'invoicing',
 'procurement',
 'contract',
 'management',
 'tendering',
 'cost',
 'control']

In [95]:
inferred_vector_text = job2vec_model.infer_vector(text_p)

In [98]:
# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(text_p)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (1836): «responsible team leadership planning works coordination invoicing procurement contract management tendering cost control»

Similar Document (1106, 0.8223347067832947): «sign language school teachers educate nonagespecific students sign language teach sign language students without special educational needs deafness organise classes using variety lesson materials work interactively group assess evaluate individual progress assignments examinations»



In [80]:
inferred_vector_text

array([ 1.62220672e-01,  5.99719822e-01, -5.84440529e-02,  1.67158499e-01,
        8.65466669e-02,  2.75833644e-02, -3.82897668e-02,  9.73663181e-02,
       -2.69168109e-01,  2.75540084e-01, -6.23327531e-02,  4.96084601e-01,
        1.11821994e-01,  4.59516495e-02, -3.18699121e-01,  1.75700068e-01,
        3.80104303e-01, -1.23025686e-01,  5.30903399e-01,  3.03188972e-02,
       -7.78928995e-02,  3.26654077e-01, -7.74524331e-01,  1.26306236e-01,
        1.16378672e-01,  1.60084158e-01, -7.46155858e-01,  5.72540104e-01,
        8.56606424e-01,  6.62409067e-02, -1.83239117e-01, -4.02189821e-01,
        7.47376084e-02, -6.74358130e-01,  5.07880747e-02, -2.81599820e-01,
        7.33419433e-02,  5.21608353e-01, -6.23869121e-01,  2.53139526e-01,
        5.36958512e-04,  3.60647589e-01,  1.01286128e-01, -3.73330086e-01,
        8.62140507e-02, -3.40351820e-01,  1.04380578e-01, -1.99851647e-01,
       -4.51864839e-01, -3.30648273e-02], dtype=float32)

In [81]:
similar_p = job2vec_model.dv.most_similar([inferred_vector_text], topn=len(job2vec_model.dv))

In [99]:
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(text_p)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % job2vec_model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (1836): «responsible team leadership planning works coordination invoicing procurement contract management tendering cost control»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (2499, 0.8865613341331482): «prepress operators create prepress proof sample finished product expected look like monitor printing quality ensuring graphics colors content meet required quality technical standards»

MEDIAN (1970, 0.27825871109962463): «paperhangers specialised hanging wallpaper apply adhesives paper wall case reinforced wallpaper fix paper straight well aligned avoiding inclusion air bubbles»

LEAST (1726, -0.3624192178249359): «medical practice managers manage daytoday operations medical practice oversee staff business side practice»



In [86]:
df_occupations.iloc[1529,:]

conceptType                                                       Occupation
conceptUri                 http://data.europa.eu/esco/occupation/7cb71c5f...
iscoGroup                                                               9122
preferredLabel                                               vehicle cleaner
altLabels                  automotive valeter\nmotor vehicle valeter\nren...
hiddenLabels                                                             NaN
status                                                              released
modifiedDate                                        2021-10-06T09:07:27.622Z
regulatedProfessionNote    http://data.europa.eu/esco/regulated-professio...
scopeNote                                                                NaN
definition                                                               NaN
inScheme                   http://data.europa.eu/esco/concept-scheme/occu...
description                Vehicle cleaners clean and polish surfaces of ...