In [23]:
from tqdm import tqdm

In [1]:
import pandas as pd
import os
import re
import requests
from  gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import string

In [2]:
!ls ../data/ESCO/occupations_en.csv

../data/ESCO/occupations_en.csv


In [3]:
!pwd

/home/ftraverso/code/francescotraverso/job_predictor/notebooks


In [4]:
df = pd.read_csv('../data/ESCO/occupations_en.csv')

In [5]:
df_only_JT = df.drop(columns=['conceptType', 'conceptUri', 'iscoGroup', 'status', 'modifiedDate', 'regulatedProfessionNote',
       'scopeNote', 'definition', 'inScheme','hiddenLabels'])

In [6]:
df_only_JT

Unnamed: 0,preferredLabel,altLabels,description,code
0,technical director,technical and operations director\nhead of tec...,Technical directors realise the artistic visio...,2654.1.7
1,metal drawing machine operator,metal drawing machine operator\nmetal drawing ...,Metal drawing machine operators set up and ope...,8121.4
2,precision device inspector,inspector of precision instruments\nprecision ...,Precision device inspectors make sure precisio...,7543.10.3
3,air traffic safety technician,air traffic safety electronics hardware specia...,Air traffic safety technicians provide technic...,3155.1
4,hospitality revenue manager,hospitality revenues manager\nyield manager\nh...,Hospitality revenue managers maximise revenue ...,2431.9
...,...,...,...,...
3003,demographer,demography research analyst\ndemography studie...,Demographers study a variety of parameters rel...,2120.2
3004,sorter labourer,sorter laborer\ngrader\nyard labourer\nrecycle...,Sorter labourers sort recyclable materials and...,9612.2
3005,armoured car guard,armoured truck escort\ntruck escort\narmored c...,Armoured car guards ensure the safe transporta...,5414.1.2
3006,civil service administrative officer,government administrative officer\ncivil servi...,Civil service administrative officers perform ...,2422.1


In [17]:
df_only_JT['altLabels'][0]

'technical and operations director\nhead of technical\ndirector of technical arts\nhead of technical department\ntechnical supervisor\ntechnical director\ntechnical manager'

In [7]:
# Preprocessing of job descriptions

def preprocessing(sentence):
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    return sentence

In [8]:
# search for all jobs that have 'manager' in their title  --- IGNORE this cell for now
url_title = "https://ec.europa.eu/esco/api/search"

params = {
    'language': 'en',
    'type': 'occupation',
    'text': 'pig'
         }

api_title = requests.get(url=url_title, params=params).json()

In [9]:
# select a role
role = 'cattle breeder'

In [11]:
# get an training set with 2500 out of the whole 3006 rows and preprocess every description in it

X_train = df['description'][0:2500]
X_test = df['description'][2500:3006]
X_train_proc = [preprocessing(description) for description in X_train]
X_test_proc = [preprocessing(description) for description in X_test]

In [12]:
# initialize 'job2vec' model

training_processed_descriptions = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train_proc)]
job2vec_model = Doc2Vec(training_processed_descriptions, vector_size=50, min_count=2, epochs=40)

job2vec_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7f38ac13ab50>

In [13]:
# build a vocabulary
job_vocab = job2vec_model.build_vocab(training_processed_descriptions)

In [14]:
job_vocab

In [15]:
# testing_processed_descriptions = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_test_proc)]
inferred_vector = job2vec_model.infer_vector(X_test_proc)
inferred_vector

array([ 0.00489755,  0.00529026,  0.00414103,  0.00521051,  0.00705354,
        0.00542409,  0.00978128,  0.00136568, -0.00460378, -0.00343036,
        0.00070894, -0.00676497, -0.00137454, -0.00655193, -0.00416098,
       -0.00266913, -0.00232818,  0.00821303, -0.00053216, -0.00391883,
       -0.00969381,  0.00620907,  0.00463866, -0.0084107 , -0.00287951,
        0.00128499,  0.00239346,  0.00860116,  0.00700837, -0.00172578,
        0.00438815, -0.00854222,  0.00567825,  0.00517601,  0.00763484,
       -0.00865479,  0.00338612, -0.00787145,  0.00117944, -0.0067898 ,
       -0.00141786,  0.00909927,  0.00722282,  0.00011656,  0.00924594,
        0.00139886, -0.00575996,  0.00686854,  0.00573933, -0.0044042 ],
      dtype=float32)

In [18]:
# select a role
role = 'cattle breeder'

In [22]:
### run this cell & extract skills, alternative labels, and job description from the ESCO API ###

# get uri for selected role
role_extract = df[df_occupations['preferredLabel'] == role]
role_uri = role_extract['conceptUri']

# get role information from the ESCO API
url = "https://ec.europa.eu/esco/api/resource/skill"
params = {
    'uri': role_uri,
    'language': 'en',
         }
api_skills = requests.get(url=url, params=params).json()

# this line gets you the whole API url; easier to read on your browser
requests.get(url=url, params=params).url

# gets a list of all skill descriptions and extracts the skill title only
api_skills_dict = api_skills.get('_links').get('hasEssentialSkill')
skills_list = [skill.get('title') for skill in api_skills_dict]

# gets list of alternative labels from the API
alt_labels_list = api_skills.get('alternativeLabel').get('en')

# gets job description from the API
job_description = api_skills.get('description').get('en').get('literal')

print('***** SKILL LIST:', skills_list)
print('***** ALTERNATIVE LABELS:', alt_labels_list)
print('***** JOB DESCRIPTION:', job_description)
#***** SKILL LIST: ['monitor the welfare of animals', 'administer specific drugs to facilitate breeding', 'care for juvenile animals', 'operate farm equipment', 'provide first aid to animals', 'animal nutrition', 'signs of animal illness', 'feed livestock', 'manage animal hygiene', 'maintain professional records', 'perform milk control', 'maintain animal accommodation', 'select livestock', 'manage livestock', 'create animal records', 'assist in transportation of animals', 'manage the health and welfare of livestock', 'dispose of dead animals', 'control animal movement', 'monitor livestock', 'manage cattle breeding', 'livestock reproduction', 'provide nutrition to animals', 'milk animals', 'health and safety regulations', 'administer treatment to animals', 'manage animal biosecurity', 'animal welfare legislation', 'assist animal birth', 'livestock species']
#***** ALTERNATIVE LABELS: ['cattle specialist', 'cattle breeders', 'cattle rearer']
#***** JOB DESCRIPTION: Cattle breeders oversee the production and day-to-day care of cattle. They maintain the health and welfare of cattle.
# get an training set with 2500 out of the whole 3006 rows 

# X_all = df_occupations['description'][0:4875]
# X_train = X_all[0:2500]
# X_test = X_all[2500:3006]

***** SKILL LIST: ['manage the health and welfare of livestock', 'manage cattle breeding', 'feed livestock', 'dispose of dead animals', 'manage animal biosecurity', 'milk animals', 'provide nutrition to animals', 'care for juvenile animals', 'manage animal hygiene', 'administer specific drugs to facilitate breeding', 'maintain animal accommodation', 'health and safety regulations', 'manage livestock', 'signs of animal illness', 'livestock species', 'operate farm equipment', 'maintain professional records', 'animal nutrition', 'administer treatment to animals', 'monitor livestock', 'select livestock', 'provide first aid to animals', 'animal welfare legislation', 'livestock reproduction', 'perform milk control', 'assist animal birth', 'monitor the welfare of animals', 'assist in transportation of animals', 'create animal records', 'control animal movement']
***** ALTERNATIVE LABELS: ['cattle specialist', 'cattle breeders', 'cattle rearer']
***** JOB DESCRIPTION: Cattle breeders oversee t