In [11]:
import pandas as pd
import numpy as np
import string
import random
import pickle
import csv
from tqdm import tqdm
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
############################
### describe your job!!! ###
############################

describe_your_job = """
Management Consulting in FREESCG - Food Consultant - Management consultancy specialized in food & beverage - Consulting services for restaurants, bars and similar - Project developed with specialization to each client through a diagnosis and an action plan put into practice, with analysis of the obtained results - Areas of
"""


##################################
### get 'n' job suggeestion!!! ###
##################################

n_jobs = 10


In [4]:
# get dataframes from CSV files

df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_skills.csv')

with open('../data/ESCO/data_science_skills_ESCO.csv', newline='') as f:
    reader = csv.reader(f)
    csv_data = list(reader)
    ds_keywords = [item for sublist in csv_data for item in sublist]

  df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_skills.csv')


In [5]:
# filter unneeded columns of of the dataframe and add needed ones

df_occ_n_skills = df_occ_n_skills.filter(items=['preferredLabel', 'description', 'skills'])
df_occ_n_skills = df_occ_n_skills.reindex(columns=['preferredLabel','description', 'skills'])
df_occ_n_skills.rename(columns={'preferredLabel': 'job_title'}, inplace=True)
df_occ_n_skills['description_input'] = 0
df_occ_n_skills['skills_input'] = 0

In [6]:
# create description_input and skills_input, which are strings on which the model will be fit

for row, index in tqdm(df_occ_n_skills.iterrows()):
    underscored_job_title = index['job_title'].replace(" ", "_")
    this_rows_description_input = underscored_job_title + ' ' + index['description']
    this_rows_skills_input = underscored_job_title + ' ' + index['skills']
    df_occ_n_skills.iloc[row,-2] = this_rows_description_input
    df_occ_n_skills.iloc[row,-1] = this_rows_skills_input

35824it [00:06, 5447.66it/s]


In [7]:
# inspect dataframe

df_occ_n_skills.head()

Unnamed: 0,job_title,description,skills,description_input,skills_input
0,technical director,Technical directors realise the artistic visio...,"adapt to artists' creative demands, organise r...",technical_director Technical directors realise...,technical_director adapt to artists' creative ...
1,metal drawing machine operator,Metal drawing machine operators set up and ope...,"cold drawing processes, monitor moving workpie...",metal_drawing_machine_operator Metal drawing m...,metal_drawing_machine_operator cold drawing pr...
2,precision device inspector,Precision device inspectors make sure precisio...,"precision measuring instruments, monitor machi...",precision_device_inspector Precision device in...,precision_device_inspector precision measuring...
3,air traffic safety technician,Air traffic safety technicians provide technic...,"air transport law, aircraft flight control sys...",air_traffic_safety_technician Air traffic safe...,air_traffic_safety_technician air transport la...
4,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,"develop revenue generation strategies, ensure ...",hospitality_revenue_manager Hospitality revenu...,hospitality_revenue_manager develop revenue ge...


In [11]:
df_occ_n_skills.iloc[0,4]

"technical_director adapt to artists' creative demands, organise rehearsals, theatre techniques, promote health and safety, coordinate technical teams in artistic productions, coordinate with creative departments, negotiate health and safety issues with third parties, write risk assessment on performing arts production"

In [7]:
# get entire dataframe as dataset according to Qiewi's suggestion
# concatenate the job_title:skills list to the end of df_occ_n_skills

X_all = pd.concat([df_occ_n_skills['description_input'], df_occ_n_skills['skills_input']]).reset_index(drop=True)

In [8]:
# applying preprocessing to corpus as in Gensim tutorial, also applying Phraser

def read_corpus(corpus):
   
    # instantiate Phraser outside of the loop
    sentence_stream = [entry.split(" ") for entry in corpus]
    bigrams = Phrases(
        sentence_stream,
        min_count=5,
        threshold=5,
        connector_words=ENGLISH_CONNECTOR_WORDS
        )
    
    for i, line in enumerate(corpus):    
    
        # remove punctuation
        for punctuation in string.punctuation:        
            sentence = line.replace(punctuation, '')

        # remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(sentence)
        stopword_free_tokens = [token for token in tokens if token not in stop_words]
        sentence = ' '.join(stopword_free_tokens)

        # lemmatize
        sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
        sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
        
        # get bigrams
        sent = sentence.split()

        # yield tagged final corpus
        yield TaggedDocument(bigrams[sent], [i])

all_corpus = list(read_corpus(X_all))

In [25]:
# Preprocessing function for job descriptions

def preprocess_input(sentence, ds_insert=False, ds_insert_ratio=0.2):
    
    # insert data science keywords if ds_insert==True
    
    if ds_insert == True:
        
        sentence_splitted = sentence.split()
            
        if len(sentence_splitted) * ds_insert_ratio >= len(ds_keywords):
            print('Warning: Chosen ratio is using up all available Data Science keywords!')
        
        insertion_amount = int(len(sentence_splitted) * ds_insert_ratio)
        insertion_counter = 0

        for insertion in range(insertion_amount):
            if len(sentence_splitted) * ds_insert_ratio <= len(ds_keywords):
                sentence_splitted.append(ds_keywords[insertion_counter])
                insertion_counter += 1
            
        sentence = ' '.join(sentence_splitted)    
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    # split into tokens again after Lemmatizing --- this was replaced by Phraser 
    # sentence = word_tokenize(sentence)
    
    return sentence

In [33]:
def truncate_description(job_description, no_words):
    return ' '.join(job_description.split()[:no_words:1])

In [10]:
# apply preprocessing

new_description = preprocess_input(describe_your_job)

In [11]:
# load saved BERT model and its embedding from disk

filename = '../model/bert_model.sav'
bert_model = pickle.load(open(filename, 'rb'))

filename = '../model/all_corpus_embed.sav'
all_corpus_embed = pickle.load(open(filename, 'rb'))

In [12]:
# run model

# run input through the model
new_description_embed = bert_model.encode(new_description)

# calculate and order cosine similarity
similarity_rank = cosine_similarity([new_description_embed], all_corpus_embed)
similarity_rank_index = np.argsort(similarity_rank[0])[::-1]

In [13]:
print(f'TEST DOCUMENT: {new_description} \n')
print('LISTING 10 MOST SIMILAR JOB ROLES & DESCRIPTIONS \n')

for i in range(n_jobs):
    
    if similarity_rank_index[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index[i]
    else:
        new_index = similarity_rank_index[i] - len(df_occ_n_skills)

    print(f'RANK #{i+1}: ' + df_occ_n_skills.loc[new_index]['job_title'])
    print(df_occ_n_skills.loc[new_index]['description'])
    print(f'Similarity score: {round(similarity_rank[0][similarity_rank_index[i]]*100,1)} %')
    print(f'Index in dataframe: {new_index} \n')



TEST DOCUMENT: management consulting freescg food consultant management consultancy specialized food beverage consulting services restaurants bars similar project developed specialization client diagnosis action plan put practice analysis obtained results areas expertise financial logistics human resources marketing administrative backoffice production operation analysis management purchasing logistics madpizza evaluation restructuring monitoring existing logistics department establish twoway communication procedures emphasis motivation teamwork consumption forecast supplier assessment prospecting monitoring settle divergences create quick solutions creation new work tools supply chain assessment identification improvement points supervision stock orders cost center order reduce inventory costs order validation point sale ensure permanent inventory maintenance controlling stock levels stock replenishment food beverage controller food beverage performance monitoring cost control invento

In [38]:
### result comparison between no DS keywords and DS keywords ###

# insert the ratio you want for the insertion of data science keywords (value between 0 and 1)
ds_ratio = 0.5

# how many words do you want to truncate to?
describe_your_job = truncate_description(describe_your_job, 50)

# preprocess
new_description_no_ds = preprocess_input(describe_your_job)
new_description_w_ds = preprocess_input(describe_your_job, ds_insert=True, ds_insert_ratio=ds_ratio)

print("Original text: " + new_description_no_ds)
print("")
print("Text w/ DS KW: " + new_description_w_ds)
print("")


# run input through the model
new_description_no_ds_embed = bert_model.encode(new_description_no_ds)
new_description_w_ds_embed = bert_model.encode(new_description_w_ds)

# calculate and order cosine similarity
similarity_rank_no_ds = cosine_similarity([new_description_no_ds_embed], all_corpus_embed)
similarity_rank_index_no_ds = np.argsort(similarity_rank_no_ds[0])[::-1]
similarity_rank_w_ds = cosine_similarity([new_description_w_ds_embed], all_corpus_embed)
similarity_rank_index_w_ds = np.argsort(similarity_rank_w_ds[0])[::-1]

# show results
print('~~~RESULT COMPARISON~~~ \n')

for i in range(n_jobs):
    
    if similarity_rank_index_no_ds[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index_no_ds[i]
    else:
        new_index = similarity_rank_index_no_ds[i] - len(df_occ_n_skills)

    print(f'RANK #{i+1}:')
    print('ORIGINAL: ' + df_occ_n_skills.loc[new_index]['job_title'])

    if similarity_rank_index_w_ds[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index_w_ds[i]
    else:
        new_index = similarity_rank_index_w_ds[i] - len(df_occ_n_skills)
    
    print('W/ DS KW: ' + df_occ_n_skills.loc[new_index]['job_title'])
    print('')

Original text: management consulting freescg food consultant management consultancy specialized food beverage consulting services restaurants bars similar project developed specialization client diagnosis action plan put practice analysis obtained results areas

Text w/ DS KW: management consulting freescg food consultant management consultancy specialized food beverage consulting services restaurants bars similar project developed specialization client diagnosis action plan put practice analysis obtained results areas machine learning algorithms deep learning algorithms statistics data science data analysis business intelligence use databases probability statistics data visualization data processing collect ict data data models data mining analyse big data normalize data query languages statistical analysis techniques online analytical processing handle data samples visual presentation techniques report analysis results design database scheme information extraction findable interopera

In [12]:
    # insert data science keywords if ds_insert==True
    
    if ds_insert == True:
        
        sentence_splitted = sentence.split()
        
        insertion_amount = len(sentence_splitted)

        for insertion in range(insertion_amount):
            sentence_splitted.append(ds_keywords[insertion])
            
        sentence = ' '.join(sentence_splitted)   

NameError: name 'ds_insert' is not defined