In [10]:
import pandas as pd
import numpy as np
import string
import random
import pickle
from tqdm import tqdm
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import csv

In [45]:
############################
### describe your job!!! ###
############################

describe_your_job = """
Property Value Analyst, 2017 – present
Companies: Fenacam (creditoagricola.pt), Coimbra District Court, private clients and 9 other appraisal companies
- Appraisal of flats, houses, stores, warehouses, hotels and lands for investment funds, city courts, private clients and
bank mortgage loans for almost every major bank in Portugal.
- Property data was gathered daily and treated statistically to produce appraisal reports, fueling the desire to pursue a
path in Data Science.
- Also issued Energy Performance Certificates of flats, houses and shops.
- Created the imostudio brand (imostudio.pt) for online business promotion.
Cluster and Operations Manager, 2016 – 2018
MTD Pure Water (mtd.net)
- Coordinated design and setting up of technical solutions for temporary water and wastewater installations for the
2016 Olympic Games in Rio de Janeiro and for 3 exhibition centers in Paris.
- Responsible for team leadership, planning, equipment requests and liaison with venue managers.
- For its delivery in the 2016 Olympics, MTD was awarded Best Operations Team at the Sports Business Awards 2017.
Site and Supervision Manager in Civil Engineering, 2006 – 2016
Companies: Pengest (pengest.pt), Lena Group (grupolena.pt), Projectual (projectual.pt) and Serrialu (serrialu.com)
- Managed and supervised construction projects from 0,5 M€ to 31 M€ in Portugal, Algeria, Angola and France.
- Responsible for team leadership, planning, works coordination, invoicing, procurement, contract management,
tendering and cost control.
- Throughout the projects, data was collected and treated to make decisions and to report financial performance.
"""


##################################
### get 'n' job suggeestion!!! ###
##################################

n_jobs = 5


In [4]:
# get dataframes from CSV files

df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_OPTIONAL_SKILLS.csv')

  df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_OPTIONAL_SKILLS.csv')


In [49]:
# get target skills

with open('../data/skill_packages/ESCO/data_science_skills_esco.csv', newline='') as f:
    reader = csv.reader(f)
    csv_data = list(reader)
    ds_skills = [item for sublist in csv_data for item in sublist]

In [5]:
# filter unneeded columns of of the dataframe and add needed ones

df_occ_n_skills = df_occ_n_skills.filter(items=['preferredLabel', 'description', 'skills', 'opt_skills'])
df_occ_n_skills = df_occ_n_skills.reindex(columns=['preferredLabel','description', 'skills', 'opt_skills'])
df_occ_n_skills.rename(columns={'preferredLabel': 'job_title'}, inplace=True)
df_occ_n_skills['description_input'] = 0
df_occ_n_skills['skills_input'] = 0

In [6]:
# create description_input and skills_input, which are strings on which the model will be fit

for row, index in tqdm(df_occ_n_skills.iterrows()):
    underscored_job_title = index['job_title'].replace(" ", "_")
    this_rows_description_input = underscored_job_title + ' ' + index['description']
    if type(index['opt_skills']) != str:
        index['opt_skills'] = ''
    this_rows_skills_input = underscored_job_title + ' ' + index['skills'] + ', ' + index['opt_skills']
    df_occ_n_skills.iloc[row,-2] = this_rows_description_input
    df_occ_n_skills.iloc[row,-1] = this_rows_skills_input

35824it [00:06, 5287.83it/s]


In [7]:
# inspect dataframe

df_occ_n_skills.head()

Unnamed: 0,job_title,description,skills,opt_skills,description_input,skills_input
0,technical director,Technical directors realise the artistic visio...,"adapt to artists' creative demands, promote he...","ESCO Occupations, ESCO member occupations",technical_director Technical directors realise...,technical_director adapt to artists' creative ...
1,metal drawing machine operator,Metal drawing machine operators set up and ope...,"set up the controller of a machine, monitor mo...","safely handle metal wire under tension, consul...",metal_drawing_machine_operator Metal drawing m...,metal_drawing_machine_operator set up the cont...
2,precision device inspector,Precision device inspectors make sure precisio...,"troubleshoot, communicate test results to othe...","microprocessors, circuit diagrams, mechanical ...",precision_device_inspector Precision device in...,"precision_device_inspector troubleshoot, commu..."
3,air traffic safety technician,Air traffic safety technicians provide technic...,"aircraft flight control systems, electronics, ...","electrical engineering, perform aircraft maint...",air_traffic_safety_technician Air traffic safe...,air_traffic_safety_technician aircraft flight ...
4,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,"think analytically, monitor financial accounts...","quote prices, maintain customer service, coach...",hospitality_revenue_manager Hospitality revenu...,hospitality_revenue_manager think analytically...


In [8]:
# get entire dataframe as dataset according to Qiewi's suggestion
# concatenate the job_title:skills list to the end of df_occ_n_skills

X_all = pd.concat([df_occ_n_skills['description_input'], df_occ_n_skills['skills_input']]).reset_index(drop=True)

In [9]:
# applying preprocessing to corpus as in Gensim tutorial, also applying Phraser

def read_corpus(corpus):
   
    # instantiate Phraser outside of the loop
    sentence_stream = [entry.split(" ") for entry in corpus]
    bigrams = Phrases(
        sentence_stream,
        min_count=5,
        threshold=5,
        connector_words=ENGLISH_CONNECTOR_WORDS
        )
    
    for i, line in enumerate(corpus):    
    
        # remove punctuation
        for punctuation in string.punctuation:        
            sentence = line.replace(punctuation, '')

        # remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(sentence)
        stopword_free_tokens = [token for token in tokens if token not in stop_words]
        sentence = ' '.join(stopword_free_tokens)

        # lemmatize
        sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
        sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
        
        # get bigrams
        sent = sentence.split()

        # yield tagged final corpus
        yield TaggedDocument(bigrams[sent], [i])

all_corpus = list(read_corpus(X_all))

In [56]:
# Preprocessing function for job descriptions

def truncate_description(job_description, no_words=100):
    return ' '.join(job_description.split()[:no_words:1])

def preprocess_input(sentence, ds_insert=False, ds_insert_ratio=0.2):
    
    if ds_insert == True:
        
        sentence_splitted = sentence.split()
        
        insertion_amount = int(len(ds_skills) * ds_insert_ratio)

        for insertion in range(insertion_amount):
            sentence_splitted.append(ds_skills[-1*insertion])
            
        sentence = ' '.join(sentence_splitted)   
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    # split into tokens again after Lemmatizing --- this was replaced by Phraser 
    # sentence = word_tokenize(sentence)
    
    # insert data science keywords if ds_insert==True
    
    
    return sentence


In [14]:
# load saved BERT model and its embedding from disk

filename = '../model/bert_model.sav'
bert_model = pickle.load(open(filename, 'rb'))

filename = '../model/all_corpus_embed.sav'
all_corpus_embed = pickle.load(open(filename, 'rb'))

In [52]:
# run model

# run input through the model
new_description_embed = bert_model.encode(new_description)

# calculate and order cosine similarity
similarity_rank = cosine_similarity([new_description_embed], all_corpus_embed)
similarity_rank_index = np.argsort(similarity_rank[0])[::-1]

In [23]:
print(f'TEST DOCUMENT: {new_description} \n')
print('LISTING 10 MOST SIMILAR JOB ROLES & DESCRIPTIONS \n')

for i in range(n_jobs):
    
    if similarity_rank_index[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index[i]
    else:
        new_index = similarity_rank_index[i] - len(df_occ_n_skills)

    print(f'RANK #{i+1}: ' + df_occ_n_skills.loc[new_index]['job_title'])
    print(df_occ_n_skills.loc[new_index]['description'])
    print(f'Similarity score: {round(similarity_rank[0][similarity_rank_index[i]]*100,1)} %')
    print(f'Index in dataframe: {new_index} \n')



TEST DOCUMENT: property value analyst – present companies fenacam creditoagricolapt coimbra district court private clients appraisal companies appraisal flats houses stores warehouses hotels lands investment funds city courts private clients bank mortgage loans almost every major bank portugal property data gathered daily treated statistically produce appraisal reports fueling desire pursue path data science 

LISTING 10 MOST SIMILAR JOB ROLES & DESCRIPTIONS 

RANK #1: private banking relationship manager
kinship banking manager retain and expand existing and prospective customer relationships. they use cross-selling technique to advise and sell various banking and financial intersection and services to customers. they also manage the total relationship with client and are responsible for optimising byplay solution and client satisfaction.
Similarity score: 71.0 %
Index in dataframe: 31545 

RANK #2: corporate investment banker
corporate investment bankers offer strategic advice on fin

In [57]:
### result comparison between no DS keywords and DS keywords ###

# insert the ratio you want for the insertion of data science keywords (value between 0 and 1)
ds_ratio = 0.2

# preprocess
describe_your_job = truncate_description(describe_your_job, no_words=70)
new_description_w_ds = preprocess_input(describe_your_job, ds_insert=True, ds_insert_ratio=ds_ratio)

print(new_description_no_ds)
print(new_description_w_ds)


# run input through the model
new_description_no_ds_embed = bert_model.encode(new_description_no_ds)
new_description_w_ds_embed = bert_model.encode(new_description_w_ds)

# calculate and order cosine similarity
similarity_rank_no_ds = cosine_similarity([new_description_no_ds_embed], all_corpus_embed)
similarity_rank_index_no_ds = np.argsort(similarity_rank_no_ds[0])[::-1]
similarity_rank_w_ds = cosine_similarity([new_description_w_ds_embed], all_corpus_embed)
similarity_rank_index_w_ds = np.argsort(similarity_rank_w_ds[0])[::-1]

# show results
print('~~~RESULT COMPARISON~~~ \n')

for i in range(n_jobs):        
    
    if similarity_rank_index_no_ds[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index_no_ds[i]
    else:
        new_index = similarity_rank_index_no_ds[i] - len(df_occ_n_skills)
        
    original_job_title = df_occ_n_skills.loc[new_index]['job_title']
    
    if 'senior' in original_job_title:
        original_job_title = original_job_title.replace('senior ', '')

    print(f'RANK #{i+1}:')
    print('ORIGINAL: ' + original_job_title)
    print(f'Similarity score: {round(similarity_rank_no_ds[0][similarity_rank_index_no_ds[i]]*100,1)} %')

    if similarity_rank_index_w_ds[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index_w_ds[i]
    else:
        new_index = similarity_rank_index_w_ds[i] - len(df_occ_n_skills)
        
    kw_job_title = df_occ_n_skills.loc[new_index]['job_title']
    
    if 'senior' in kw_job_title:
        kw_job_title = kw_job_title.replace('senior ', '')
    
    print('W/ DS KW: ' + kw_job_title)
    print(f'Similarity score: {round(similarity_rank_w_ds[0][similarity_rank_index_w_ds[i]]*100,1)} %')
    print('')

property value analyst – present companies fenacam creditoagricolapt coimbra district court private clients appraisal companies appraisal flats houses stores warehouses hotels lands investment funds city courts private clients bank mortgage loans almost every major bank portugal property data gathered daily treated statistically produce appraisal reports fueling desire pursue path data science
property value analyst – present companies fenacam creditoagricolapt coimbra district court private clients appraisal companies appraisal flats houses stores warehouses hotels lands investment funds city courts private clients bank mortgage loans almost every major bank portugal property data gathered daily treated statistically produce appraisal reports fueling desire pursue path data science review scientific data machine learning algorithms deep learning algorithms statistics data science data analysis business intelligence use databases probability statistics data visualization data processin

In [41]:
aaa = "here there be pigs"

In [42]:
aaa

'here there be pigs'

In [43]:
if 'pigs' in aaa:
    
    aaa = aaa.replace('pigs', '')

In [44]:
aaa

'here there be '