In [1]:
import pandas as pd
import numpy as np
import string
import random
import pickle
from tqdm import tqdm
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from pyresparser import ResumeParser
import re
import csv

In [2]:
def truncate_description(job_description, no_words=100):
    return ' '.join(job_description.split()[:no_words:1])

In [3]:
###DECLARE CV PATH###
path = '/home/ftraverso/code/francescotraverso/job_predictor/data/cv_directory/CV_Example.pdf'

###OPEN RESUME###
resume = ResumeParser(path).get_extracted_data()

###EXTRACT JOB EXPERIENCE###
JD =  """
Property Value Analyst, 2017 – present
Companies: Fenacam (creditoagricola.pt), Coimbra District Court, private clients and 9 other appraisal companies
- Appraisal of flats, houses, stores, warehouses, hotels and lands for investment funds, city courts, private clients and 
bank mortgage loans for almost every major bank in Portugal.
- Property data was gathered daily and treated statistically to produce appraisal reports, fueling the desire to pursue a 
path in Data Science.
- Also issued Energy Performance Certificates of flats, houses and shops.
- Created the imostudio brand (imostudio.pt) for online business promotion.
Cluster and Operations Manager, 2016 – 2018
MTD Pure Water (mtd.net)
- Coordinated design and setting up of technical solutions for temporary water and wastewater installations for the 
2016 Olympic Games in Rio de Janeiro and for 3 exhibition centers in Paris.
- Responsible for team leadership, planning, equipment requests and liaison with venue managers.
- For its delivery in the 2016 Olympics, MTD was awarded Best Operations Team at the Sports Business Awards 2017.
Site and Supervision Manager in Civil Engineering, 2006 – 2016
Companies: Pengest (pengest.pt), Lena Group (grupolena.pt), Projectual (projectual.pt) and Serrialu (serrialu.com)
- Managed and supervised construction projects from 0,5 M€ to 31 M€ in Portugal, Algeria, Angola and France.
- Responsible for team leadership, planning, works coordination, invoicing, procurement, contract management,
tendering and cost control.
- Throughout the projects, data was collected and treated to make decisions and to report financial performance.
"""
#" ".join(resume['experience'])
JD = truncate_description(JD, no_words=70)
JD



'Property Value Analyst, 2017 – present Companies: Fenacam (creditoagricola.pt), Coimbra District Court, private clients and 9 other appraisal companies - Appraisal of flats, houses, stores, warehouses, hotels and lands for investment funds, city courts, private clients and bank mortgage loans for almost every major bank in Portugal. - Property data was gathered daily and treated statistically to produce appraisal reports, fueling the desire to pursue a path in Data Science.'

In [4]:
with open('../data/skill_packages/ESCO/marketing_skills_esco.csv', newline='') as f:
    reader = csv.reader(f)
    csv_data = list(reader)
    ds_keywords = [item for sublist in csv_data for item in sublist]

ds_skills = " ".join(ds_keywords)
#ds_skills

In [5]:
############################
### describe your job!!! ###
############################

describe_your_job = JD + ds_skills

##################################
### get 'n' job suggeestion!!! ###
##################################

n_jobs = 10

describe_your_job

'Property Value Analyst, 2017 – present Companies: Fenacam (creditoagricola.pt), Coimbra District Court, private clients and 9 other appraisal companies - Appraisal of flats, houses, stores, warehouses, hotels and lands for investment funds, city courts, private clients and bank mortgage loans for almost every major bank in Portugal. - Property data was gathered daily and treated statistically to produce appraisal reports, fueling the desire to pursue a path in Data Science.identify customer requirements identify potential markets for companies respond to enquiries identify market niches carry out strategic research analyse external factors of companies use different communication channels plan marketing strategy integrate marketing strategies with the global strategy analyse internal factors of companies use theoretical marketing models maintain relationship with customers conduct research interview define technical requirements market analysis marketing principles marketing mix adver

In [6]:
# get dataframes from CSV files

df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_skills.csv')

  df_occ_n_skills = pd.read_csv('../data/ESCO/occupations_augmented_with_skills.csv')


In [7]:
# filter unneeded columns of of the dataframe and add needed ones

df_occ_n_skills = df_occ_n_skills.filter(items=['preferredLabel', 'description', 'skills'])
df_occ_n_skills = df_occ_n_skills.reindex(columns=['preferredLabel','description', 'skills'])
df_occ_n_skills.rename(columns={'preferredLabel': 'job_title'}, inplace=True)
df_occ_n_skills['description_input'] = 0
df_occ_n_skills['skills_input'] = 0

In [8]:
# create description_input and skills_input, which are strings on which the model will be fit

for row, index in tqdm(df_occ_n_skills.iterrows()):
    underscored_job_title = index['job_title'].replace(" ", "_")
    this_rows_description_input = underscored_job_title + ' ' + index['description']
    this_rows_skills_input = underscored_job_title + ' ' + index['skills']
    df_occ_n_skills.iloc[row,-2] = this_rows_description_input
    df_occ_n_skills.iloc[row,-1] = this_rows_skills_input

35824it [00:04, 7498.52it/s]


In [9]:
# inspect dataframe

df_occ_n_skills.head()

Unnamed: 0,job_title,description,skills,description_input,skills_input
0,technical director,Technical directors realise the artistic visio...,"adapt to artists' creative demands, organise r...",technical_director Technical directors realise...,technical_director adapt to artists' creative ...
1,metal drawing machine operator,Metal drawing machine operators set up and ope...,"cold drawing processes, monitor moving workpie...",metal_drawing_machine_operator Metal drawing m...,metal_drawing_machine_operator cold drawing pr...
2,precision device inspector,Precision device inspectors make sure precisio...,"precision measuring instruments, monitor machi...",precision_device_inspector Precision device in...,precision_device_inspector precision measuring...
3,air traffic safety technician,Air traffic safety technicians provide technic...,"air transport law, aircraft flight control sys...",air_traffic_safety_technician Air traffic safe...,air_traffic_safety_technician air transport la...
4,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,"develop revenue generation strategies, ensure ...",hospitality_revenue_manager Hospitality revenu...,hospitality_revenue_manager develop revenue ge...


In [10]:
# get entire dataframe as dataset according to Qiewi's suggestion
# concatenate the job_title:skills list to the end of df_occ_n_skills

X_all = pd.concat([df_occ_n_skills['description_input'], df_occ_n_skills['skills_input']]).reset_index(drop=True)

In [11]:
# applying preprocessing to corpus as in Gensim tutorial, also applying Phraser

def read_corpus(corpus):
   
    # instantiate Phraser outside of the loop
    sentence_stream = [entry.split(" ") for entry in corpus]
    bigrams = Phrases(
        sentence_stream,
        min_count=5,
        threshold=5,
        connector_words=ENGLISH_CONNECTOR_WORDS
        )
    
    for i, line in enumerate(corpus):    
    
        # remove punctuation
        for punctuation in string.punctuation:        
            sentence = line.replace(punctuation, '')

        # remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(sentence)
        stopword_free_tokens = [token for token in tokens if token not in stop_words]
        sentence = ' '.join(stopword_free_tokens)

        # lemmatize
        sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
        sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
        
        # get bigrams
        sent = sentence.split()

        # yield tagged final corpus
        yield TaggedDocument(bigrams[sent], [i])

all_corpus = list(read_corpus(X_all))

In [12]:
ds_skills

'identify customer requirements identify potential markets for companies respond to enquiries identify market niches carry out strategic research analyse external factors of companies use different communication channels plan marketing strategy integrate marketing strategies with the global strategy analyse internal factors of companies use theoretical marketing models maintain relationship with customers conduct research interview define technical requirements market analysis marketing principles marketing mix advertising techniques social media management web strategy assessment channel marketing online ads campaign techniques brand marketing techniques e-commerce systems customer service social media marketing techniques perform business research support managers draft corporate emails manage the handling of promotional materials assist in developing marketing campaigns prepare presentation material marketing management marketing department processes collaborate in the development o

In [13]:
# Preprocessing function for job descriptions

def preprocess_input(sentence, ds_insert=False, ds_insert_ratio=0.2):
    
    if ds_insert == True:
        
        sentence_splitted = sentence.split()
        
        insertion_amount = len(sentence_splitted)

        for insertion in range(insertion_amount):
            sentence_splitted.append(ds_skills[insertion])
            
        sentence = ' '.join(sentence_splitted)   
    
    # remove punctuation
    for punctuation in string.punctuation:        
        sentence = sentence.replace(punctuation, '')
    
    # set lowercase
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    stopword_free_tokens = [token for token in tokens if token not in stop_words]
    sentence = ' '.join(stopword_free_tokens)

    # lemmatize
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='n')
    sentence = WordNetLemmatizer().lemmatize(sentence, pos='v')
    
    # split into tokens again after Lemmatizing --- this was replaced by Phraser 
    # sentence = word_tokenize(sentence)
    
    # insert data science keywords if ds_insert==True
    
    
    return sentence

In [14]:
# ds_keywords

In [15]:
new_description = preprocess_input(describe_your_job,ds_insert=True)
# new_description

In [16]:
!pwd

/home/ftraverso/code/francescotraverso/job_predictor/notebooks


In [17]:
!ls

 all_corpus_embed_22.sav		'francesco -  scratch notebook.ipynb'
 bert_model_22.sav			 function_join.ipynb
 CSV_enhancer.ipynb			 Local_API_extractor_fran.ipynb
 CV_FeatureExtractor_AllCVs.ipynb	 Local_API_extractor.ipynb
'CV_FeatureExtractor_Cv Example.ipynb'	 model_DS_flare.ipynb
 doc2vec_exploration_fran.ipynb		 model_tuning_fran_3.ipynb
 ESCO_exploration.ipynb			 testing.ipynb
 ESCO_exploration_latest_fran.ipynb	 working_model_fran.ipynb
 ESCO_exploration_latest.ipynb		 working_model.ipynb
 feature_extractor_fran.ipynb


In [19]:
# load saved BERT model and its embedding from disk

filename = '../model/bert_model.sav'
bert_model = pickle.load(open(filename, 'rb'))

filename = '.../model/all_corpus_embed.sav'
all_corpus_embed = pickle.load(open(filename, 'rb'))

ModuleNotFoundError: No module named 'torch._C._nn'; 'torch._C' is not a package

In [None]:
# run model

# run input through the model
new_description_embed = bert_model.encode(new_description)

# calculate and order cosine similarity
similarity_rank = cosine_similarity([new_description_embed], all_corpus_embed)
similarity_rank_index = np.argsort(similarity_rank[0])[::-1]



In [None]:
### result comparison between no DS keywords and DS keywords ###

# insert the ratio you want for the insertion of data science keywords (value between 0 and 1)
ds_ratio = 0.1

# preprocess
new_description_no_ds = preprocess_input(describe_your_job)
new_description_w_ds = preprocess_input(describe_your_job, ds_insert=True, ds_insert_ratio=ds_ratio)

print(new_description_no_ds)
print(new_description_w_ds)


# run input through the model
new_description_no_ds_embed = bert_model.encode(new_description_no_ds)
new_description_w_ds_embed = bert_model.encode(new_description_w_ds)

# calculate and order cosine similarity
similarity_rank_no_ds = cosine_similarity([new_description_no_ds_embed], all_corpus_embed)
similarity_rank_index_no_ds = np.argsort(similarity_rank_no_ds[0])[::-1]
similarity_rank_w_ds = cosine_similarity([new_description_w_ds_embed], all_corpus_embed)
similarity_rank_index_w_ds = np.argsort(similarity_rank_w_ds[0])[::-1]

# show results
print('~~~RESULT COMPARISON~~~ \n')

for i in range(n_jobs):
    
    if similarity_rank_index_no_ds[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index_no_ds[i]
    else:
        new_index = similarity_rank_index_no_ds[i] - len(df_occ_n_skills)

    print(f'RANK #{i+1}:')
    print('ORIGINAL: ' + df_occ_n_skills.loc[new_index]['job_title'])

    if similarity_rank_index_w_ds[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index_w_ds[i]
    else:
        new_index = similarity_rank_index_w_ds[i] - len(df_occ_n_skills)
    
    print('W/ DS KW: ' + df_occ_n_skills.loc[new_index]['job_title'])
    print('')

In [None]:
print(f'TEST DOCUMENT: {new_description} \n')
print('LISTING 10 MOST SIMILAR JOB ROLES & DESCRIPTIONS \n')

for i in range(n_jobs):
    
    if similarity_rank_index[i] <= len(df_occ_n_skills):
        new_index = similarity_rank_index[i]
    else:
        new_index = similarity_rank_index[i] - len(df_occ_n_skills)

    print(f'RANK #{i+1}: ' + df_occ_n_skills.loc[new_index]['job_title'])
    print(df_occ_n_skills.loc[new_index]['description'])
    print(f'Similarity score: {round(similarity_rank[0][similarity_rank_index[i]]*100,1)} %')
    print(f'Index in dataframe: {new_index} \n')

