In [154]:
import pandas as pd
import numpy as np
import requests
import string
from gensim.models.doc2vec import Doc2Vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [45]:
df_occupations = pd.read_csv('data/ESCO/occupations_en.csv')

In [86]:
# search for all jobs that have 'manager' in their title  --- IGNORE this cell for now
url_title = "https://ec.europa.eu/esco/api/search"

params = {
    'language': 'en',
    'type': 'occupation',
    'text': 'pig'
         }

api_title = requests.get(url=url, params=params).json()

In [131]:
# select a role
role = 'cattle breeder'

In [142]:
### run this cell & extract skills, alternative labels, and job description from the ESCO API ###

# get uri for selected role
role_extract = df_occupations[df_occupations['preferredLabel'] == role]
role_uri = role_extract['conceptUri']

# get role information from the ESCO API
url = "https://ec.europa.eu/esco/api/resource/skill"
params = {
    'uri': role_uri,
    'language': 'en',
         }
api_skills = requests.get(url=url, params=params).json()

# this line gets you the whole API url; easier to read on your browser
requests.get(url=url, params=params).url

# gets a list of all skill descriptions and extracts the skill title only
api_skills_dict = api_skills.get('_links').get('hasEssentialSkill')
skills_list = [skill.get('title') for skill in api_skills_dict]

# gets list of alternative labels from the API
alt_labels_list = api_skills.get('alternativeLabel').get('en')

# gets job description from the API
job_description = api_skills.get('description').get('en').get('literal')

print('***** SKILL LIST:', skills_list)
print('***** ALTERNATIVE LABELS:', alt_labels_list)
print('***** JOB DESCRIPTION:', job_description)

***** SKILL LIST: ['assist animal birth', 'health and safety regulations', 'livestock reproduction', 'manage the health and welfare of livestock', 'maintain professional records', 'manage cattle breeding', 'provide first aid to animals', 'manage animal hygiene', 'create animal records', 'manage animal biosecurity', 'perform milk control', 'manage livestock', 'dispose of dead animals', 'milk animals', 'animal welfare legislation', 'administer treatment to animals', 'feed livestock', 'care for juvenile animals', 'monitor livestock', 'animal nutrition', 'provide nutrition to animals', 'monitor the welfare of animals', 'maintain animal accommodation', 'livestock species', 'signs of animal illness', 'assist in transportation of animals', 'control animal movement', 'select livestock', 'administer specific drugs to facilitate breeding', 'operate farm equipment']
***** ALTERNATIVE LABELS: ['cattle specialist', 'cattle breeders', 'cattle rearer']
***** JOB DESCRIPTION: Cattle breeders oversee t

In [155]:
# Preprocessing of job descriptions

def preprocessing(sentence):
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    return sentence

In [156]:
print(preprocessing(job_description))

cattle breeders oversee production daytoday care cattle maintain health welfare cattle


In [None]:
# Function to embed a sentence (job description)

def embed_sentence(doc2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        