In [40]:
import pandas as pd
import numpy as np
import requests
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [13]:
df_occupations = pd.read_csv('../data/ESCO/occupations_en.csv')

In [11]:
# search for all jobs that have 'manager' in their title  --- IGNORE this cell for now
url_title = "https://ec.europa.eu/esco/api/search"

params = {
    'language': 'en',
    'type': 'occupation',
    'text': 'pig'
         }

api_title = requests.get(url=url_title, params=params).json()

In [12]:
# select a role
role = 'cattle breeder'

In [14]:
### run this cell & extract skills, alternative labels, and job description from the ESCO API ###

# get uri for selected role
role_extract = df_occupations[df_occupations['preferredLabel'] == role]
role_uri = role_extract['conceptUri']

# get role information from the ESCO API
url = "https://ec.europa.eu/esco/api/resource/skill"
params = {
    'uri': role_uri,
    'language': 'en',
         }
api_skills = requests.get(url=url, params=params).json()

# this line gets you the whole API url; easier to read on your browser
requests.get(url=url, params=params).url

# gets a list of all skill descriptions and extracts the skill title only
api_skills_dict = api_skills.get('_links').get('hasEssentialSkill')
skills_list = [skill.get('title') for skill in api_skills_dict]

# gets list of alternative labels from the API
alt_labels_list = api_skills.get('alternativeLabel').get('en')

# gets job description from the API
job_description = api_skills.get('description').get('en').get('literal')

print('***** SKILL LIST:', skills_list)
print('***** ALTERNATIVE LABELS:', alt_labels_list)
print('***** JOB DESCRIPTION:', job_description)

***** SKILL LIST: ['maintain animal accommodation', 'create animal records', 'livestock species', 'manage animal hygiene', 'signs of animal illness', 'care for juvenile animals', 'provide nutrition to animals', 'manage cattle breeding', 'monitor livestock', 'administer treatment to animals', 'livestock reproduction', 'perform milk control', 'manage livestock', 'dispose of dead animals', 'select livestock', 'feed livestock', 'manage animal biosecurity', 'assist animal birth', 'maintain professional records', 'operate farm equipment', 'monitor the welfare of animals', 'assist in transportation of animals', 'milk animals', 'animal welfare legislation', 'manage the health and welfare of livestock', 'control animal movement', 'administer specific drugs to facilitate breeding', 'animal nutrition', 'provide first aid to animals', 'health and safety regulations']
***** ALTERNATIVE LABELS: ['cattle specialist', 'cattle breeders', 'cattle rearer']
***** JOB DESCRIPTION: Cattle breeders oversee t

In [15]:
# Preprocessing of job descriptions

def preprocessing(sentence):
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    return sentence

In [16]:
print(preprocessing(job_description))

cattle breeders oversee production daytoday care cattle maintain health welfare cattle


In [17]:
df_occupations.head()

Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,technical and operations director\nhead of tec...,,released,2016-07-05T13:58:41Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,Technical directors realise the artistic visio...,2654.1.7
1,Occupation,http://data.europa.eu/esco/occupation/000e93a3...,8121,metal drawing machine operator,metal drawing machine operator\nmetal drawing ...,,released,2016-07-05T17:09:43Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Metal drawing machine operators set up and ope...,8121.4
2,Occupation,http://data.europa.eu/esco/occupation/0019b951...,7543,precision device inspector,inspector of precision instruments\nprecision ...,,released,2016-07-06T09:21:20Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,Precision device inspectors make sure precisio...,7543.10.3
3,Occupation,http://data.europa.eu/esco/occupation/0022f466...,3155,air traffic safety technician,air traffic safety electronics hardware specia...,,released,2017-01-17T11:40:37Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Air traffic safety technicians provide technic...,3155.1
4,Occupation,http://data.europa.eu/esco/occupation/002da35b...,2431,hospitality revenue manager,hospitality revenues manager\nyield manager\nh...,,released,2017-01-17T13:33:42Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Hospitality revenue managers maximise revenue ...,2431.9


In [53]:
# get an training set with 2500 out of the whole 3006 rows and preprocess every description in it

X_train = df_occupations['description'][0:2500]
X_test = df_occupations['description'][2500:3006]
X_train_proc = [preprocessing(description) for description in X_train]
X_test_proc = [preprocessing(description) for description in X_test]

In [83]:
# initialize 'job2vec' model

training_processed_descriptions = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train_proc)]
job2vec_model = Doc2Vec(training_processed_descriptions, vector_size=50, min_count=2, epochs=40)

job2vec_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x12e6837f0>

In [84]:
# build a vocabulary
job_vocab = job2vec_model.build_vocab(training_processed_descriptions)

In [87]:
print(job_vocab)

None


In [95]:
# testing_processed_descriptions = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_test_proc)]
inferred_vector = job2vec_model.infer_vector(X_test_proc)

In [96]:
inferred_vector

array([ 1.3377309e-04, -1.6938985e-03,  5.7178629e-03, -2.7272559e-03,
        9.5359323e-04, -5.4032924e-03,  4.1357218e-03,  5.2609490e-03,
       -6.5364421e-04,  4.8915972e-03,  2.7635919e-03,  3.0959880e-03,
        5.3244974e-03,  1.3804602e-03,  8.3545269e-03, -3.7680744e-04,
        4.5233085e-03, -8.9936340e-03, -2.0811516e-03,  3.2573044e-03,
       -3.0347181e-03, -1.5728968e-03,  5.3128316e-03,  5.2512479e-03,
        5.2782441e-03,  5.4466045e-03, -8.7963464e-04,  7.3031960e-03,
       -5.6774314e-03, -2.4575549e-03,  8.5622827e-03,  3.1659734e-03,
       -9.7021721e-03,  8.4357886e-03,  4.3836534e-03, -5.7347380e-03,
        8.5078552e-03,  7.4873162e-03, -5.3934343e-03,  5.5024622e-04,
        2.2116899e-05,  1.8674410e-03,  8.0494955e-03, -2.5348843e-04,
       -9.8389536e-03, -8.6924015e-04, -3.0226065e-03, -3.1232948e-03,
       -8.7056132e-03, -6.8409299e-03], dtype=float32)