# Seeking Human Resources
We want to predict how well candidates are fit for certains roles based on their available information:
* EDA
* Modelling
    * Rank canditates based on some roles key words
    * Re-rank candidates when we change the roles key words

## EDA

### LOAD DATA

In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import spacy
import nltk
import re
import string
nlp = spacy.load("en_core_web_lg")
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize

In [150]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/hp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [151]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/hp/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [152]:
df=pd.read_csv("potential-talents.csv")
#df=df.set_index('id')
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [153]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB


In [154]:
#check data duplicates
print(df.duplicated().sum())

0


In [155]:
#remove duplicates
df.drop_duplicates(inplace=True)
df.shape

(104, 5)

In [156]:
#dropping fit col
df.drop('fit', axis=1,inplace=True)
df.shape

(104, 4)

About the data:
* data consist of 3 columns: job-title-location-connection
* job-title gives the available information of a paticular job

In [157]:
df.job_title.value_counts()

2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional                 7
Aspiring Human Resources Professional                                                                                    7
Student at Humber College and Aspiring Human Resources Generalist                                                        7
People Development Coordinator at Ryan                                                                                   6
Native English Teacher at EPIK (English Program in Korea)                                                                5
Aspiring Human Resources Specialist                                                                                      5
HR Senior Specialist                                                                                                     5
Student at Chapman University                                                                                            4
SVP, CHRO, Marke

Text preprocessing

In [158]:
#import spacy and load the language model downloaded

import spacy
nlp = spacy.load("en_core_web_lg")

In [159]:
#use this utility function to preprocess the text
#1. Remove the stop words
#2. Convert to base form using lemmatisation with lower case

def preprocess(text):
    all_reviews=list()
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    # Lowercase and lemmatise , pos='v' means verb
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(),pos='v') for token in tokens]
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    x=' '.join(keywords)
    x=' '.join([w for w in x.split() if len(w)>1]) 
    x= re.sub('\w*\d\w*','', x)#remove digits and/or words contain digit
    x= re.sub("türkiye","turkey",x)#this cleans the location column
    x= re.sub("kanada","canada",x)#this cleans the location column
    x= re.sub("amerika birleşik devletleri","united states america",x)#this cleans the location column
    all_reviews.append(x)
    return x

In [160]:
# def preprocess(text):
#     doc = nlp(text)
#     filtered_tokens = []
#     for token in doc:
#         if token.is_stop or token.is_punct or token.is_digit:
#             continue
#         filtered_tokens.append(token.lemma_.lower())
#     return ' '.join(filtered_tokens)

In [161]:
#create a new column "preprocessed_text" which store the clean form of given text [use apply and lambda function]

df['preprocessed_job_title'] = df['job_title'].apply(lambda text: preprocess(text))
df['preprocessed_location'] = df['location'].apply(lambda text: preprocess(text))
df.head()

Unnamed: 0,id,job_title,location,connection,preprocessed_job_title,preprocessed_location
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,bauer college business graduate magna cum lau...,houston texas
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,native english teacher epik english program korea,canada
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspire human resources professional,raleigh durham north carolina area
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,people development coordinator ryan,denton texas
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,advisory board member celal bayar university,i̇zmir turkey


In [162]:
#replace abreviations and uncorrect words by there real texts
df.replace({'preprocessed_job_title' : { 'chro' : 'chro chief human resources officer', 'svp' : 'svp senior vice president'
        ,'gphr' : 'gphr global professional in human resources','hris' : 'hris human resources management system'
        , 'csr' : 'csr corporate social responsibility', 'sphr' : 'sphr strategic and policy-making certification'
        , 'hr' : 'hr human resources', '|':''}}, regex=True, inplace=True)

df.replace({'preprocessed_location' : { 'kanada' : 'canada', 'türkiye':'turkey', 'amerika birleşik devletleri' : 'united states america'}}, regex=True, inplace=True)

In [163]:
df.head()

Unnamed: 0,id,job_title,location,connection,preprocessed_job_title,preprocessed_location
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,bauer college business graduate magna cum lau...,houston texas
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,native english teacher epik english program korea,canada
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,aspire human resources professional,raleigh durham north carolina area
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,people development coordinator ryan,denton texas
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,advisory board member celal bayar university,i̇zmir turkey


In [164]:
#Second utility function to preprocess words out of vocabulary
def preprocess1(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_digit or token.is_oov:
            continue
        filtered_tokens.append(token.lemma_.lower())
    return ' '.join(filtered_tokens)
df['preprocessed_job_title'] = df['preprocessed_job_title'].apply(lambda text: preprocess1(text))
df['preprocessed_location'] = df['preprocessed_location'].apply(lambda text: preprocess1(text))

df1 =df[['connection','preprocessed_job_title','preprocessed_location']]
# df1 =df[['preprocessed_job_title','preprocessed_location']]
df1.head()

Unnamed: 0,connection,preprocessed_job_title,preprocessed_location
0,85,bauer college business graduate magna cum laud...,houston texas
1,500+,native english teacher english program korea,canada
2,44,aspire human resource professional,raleigh durham north carolina area
3,500+,people development coordinator ryan,denton texas
4,500+,advisory board member university,turkey


In [165]:
df['preprocessed_job_title']=df1.fillna('').apply(pd.unique,1).apply(' '.join).str.rstrip(' ')
df.head()
df.drop(['job_title','location','connection',], axis=1,inplace=True)

In [166]:
df.head()

Unnamed: 0,id,preprocessed_job_title,preprocessed_location
0,1,85 bauer college business graduate magna cum l...,houston texas
1,2,500+ native english teacher english program k...,canada
2,3,44 aspire human resource professional raleigh ...,raleigh durham north carolina area
3,4,500+ people development coordinator ryan dent...,denton texas
4,5,500+ advisory board member university turkey,turkey


In [167]:
words_counts = Counter()
for i in df.preprocessed_job_title:
    for word in i.split(" "):
        words_counts[word] += 1

print('There are', len(words_counts) ,'words in the job title column')
words_counts.most_common()

There are 257 words in the job title column


[('human', 97),
 ('resource', 89),
 ('area', 49),
 ('500+', 44),
 ('', 44),
 ('aspire', 35),
 ('professional', 25),
 ('texas', 24),
 ('houston', 19),
 ('great', 18),
 ('student', 16),
 ('seek', 15),
 ('college', 14),
 ('generalist', 14),
 ('canada', 12),
 ('university', 12),
 ('specialist', 12),
 ('business', 11),
 ('management', 11),
 ('english', 10),
 ('north', 10),
 ('coordinator', 10),
 ('hr', 10),
 ('senior', 10),
 ('california', 10),
 ('atlanta', 10),
 ('carolina', 9),
 ('new', 9),
 ('york', 9),
 ('graduate', 8),
 ('raleigh', 8),
 ('durham', 8),
 ('officer', 8),
 ('85', 7),
 ('bauer', 7),
 ('magna', 7),
 ('cum', 7),
 ('laude', 7),
 ('city', 7),
 ('61', 7),
 ('position', 7),
 ('manager', 7),
 ('44', 6),
 ('people', 6),
 ('development', 6),
 ('ryan', 6),
 ('denton', 6),
 ('san', 6),
 ('system', 6),
 ('native', 5),
 ('teacher', 5),
 ('program', 5),
 ('korea', 5),
 ('1', 5),
 ('francisco', 5),
 ('bay', 5),
 ('philadelphia', 5),
 ('policy', 5),
 ('advisory', 4),
 ('board', 4),
 ('memb

In [168]:
# from math import log
# Y=words_counts.most_common()
# Y1=[(e1,log(e2)) for e1, e2 in Y ]
# print(zip(*Y1))
# plt.figure(figsize=(20,8))
# plt.scatter(*zip(*Y))
# plt.xticks(rotation=90)

The 5 most common word are **human**, **resources**,  **aspire**,  **professional** and **seek**

## Modeling

### Applying ranking with tfidf and bert transformation

1. Applying tfidf ranking

In [169]:
df1=df.copy()

In [170]:
def rank_candidates_tfidf(keyword,df,feature_name):
    #store cleaned rows into a list
    corpus=df[feature_name].tolist()
    #apply preprocessing on keyword
    keyword=preprocess(keyword)
    #create an instance of tfidfVectorizer
    tfvectoriser = TfidfVectorizer()
    #Append the cleaned keyword to th corpus list 
    corpus.append(keyword)
    #fit the Tfidf vectorizer on the cleaned data
    key_tf=tfvectoriser.fit_transform(corpus)
    #create a dataframe that contains tfidf values of each token for each row in th data
    key_df=pd.DataFrame(key_tf.toarray(), columns=tfvectoriser.get_feature_names())
    shape=key_df.shape
    #calculate the cosine similarity between the keyword and the data's rows
    cosine_sim = cosine_similarity(key_tf.toarray()[:shape[0]-1], key_tf.toarray()[shape[0]-1].reshape(1, -1))
    #Add cosine resluts into a new column fit
    df['fit_tfidf']=cosine_sim
    #store the ranked candidates based on the fit score
    final_df=df.sort_values('fit_tfidf',ascending=False)
    #remove the keyword added to reuse the function again on other different keywords
    corpus.pop()
    return final_df

 * Test the rank_candidate function

In [171]:
# pip intall -u sentence-tranformers

In [172]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import warnings
warnings.filterwarnings("ignore")
from sentence_transformers import SentenceTransformer

In [173]:
keyword='aspire human resources'
df2=rank_candidates_tfidf(keyword,df,"preprocessed_job_title")

In [174]:
#number of ranked candidates based on the keyword
print('There are',df2[df2['fit_tfidf']!=0].shape[0],'ranked candidates for the job',keyword)

There are 71 ranked candidates for the job aspire human resources


In [175]:
#Explore the new column 
df1.head(71),df1.tail(71)

(    id                             preprocessed_job_title  \
 0    1  85 bauer college business graduate magna cum l...   
 1    2  500+  native english teacher english program k...   
 2    3  44 aspire human resource professional raleigh ...   
 3    4  500+  people development coordinator ryan dent...   
 4    5      500+  advisory board member university turkey   
 ..  ..                                                ...   
 66  67  500+  human resource staff recruit professiona...   
 67  68  500+  human resource specialist great new york...   
 68  69  500+  director human resource north america gr...   
 69  70  82 retire army national guard recruiter office...   
 70  71  500+  human resource generalist inc raleigh du...   
 
                  preprocessed_location  
 0                        houston texas  
 1                               canada  
 2   raleigh durham north carolina area  
 3                         denton texas  
 4                               turkey  
 .

2. Applying bert

In [176]:
#Load the pretrained model 
bert = SentenceTransformer('all-MiniLM-L6-v2')

In [179]:
def rank_candidates_bert(keyword,df,feature_name):
    #load bert model 
    bert = SentenceTransformer('all-MiniLM-L6-v2')
    #store cleaned rows into a list
    corpus=df[feature_name].tolist()
    #apply preprocessing on keyword
    keyword=preprocess(keyword)
    #Append the cleaned keyword to th corpus list 
    corpus.append(keyword)
    #encode our sentences
    sentence_embeddings = bert.encode(corpus)
    print("the shape of our encoded sentences is",sentence_embeddings.shape)
    shape=sentence_embeddings.shape
    #calculate the cosine similarity between the keyword and the data's rows
    cosine_sim = cosine_similarity(sentence_embeddings[:shape[0]-1], sentence_embeddings[shape[0]-1].reshape(1, -1))
    #Add cosine resluts into a new column fit
    df['fit_bert']=cosine_sim
    #store the ranked candidates based on the fit score
    final_df=df.sort_values('fit_bert',ascending=False)
    #remove the keyword added to reuse the function again on other different keywords
    corpus.pop()
    return final_df

In [180]:
keyword='aspire human resources'
df2=rank_candidates_bert(keyword,df2,"preprocessed_job_title")

#number of ranked candidates based on the keyword
print('There are',df2[df2['fit_bert']!=0].shape[0],'ranked candidates for the job',keyword)

the shape of our encoded sentences is (105, 384)
There are 104 ranked candidates for the job aspire human resources


In [181]:
#Explore the new column 
df2.head(20)

Unnamed: 0,id,preprocessed_job_title,preprocessed_location,fit_tfidf,fit_bert
35,36,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471
59,60,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471
5,6,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471
48,49,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471
23,24,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471
20,21,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711
16,17,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711
32,33,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711
57,58,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711
45,46,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711


### Applying re-ranking Based on Bert Transformation

In [185]:
df2["starred"]=df2["fit_bert"]
starred_id = [int(item) for item in input("Please, inter ids of candidates you want to star?: ").split()]

Please, inter ids of candidates you want to star?: 5


In [186]:
for id in starred_id:
    df2['rerank_bert']=np.where((df2.starred >= df2['starred'][id]),1,df2.starred)

In [187]:

df2.head(20)

Unnamed: 0,id,preprocessed_job_title,preprocessed_location,fit_tfidf,fit_bert,starred,rerank_bert
35,36,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471,0.741471,1.0
59,60,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471,0.741471,1.0
5,6,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471,0.741471,1.0
48,49,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471,0.741471,0.741471
23,24,1 aspire human resource specialist great new y...,great new york city area,0.136327,0.741471,0.741471,0.741471
20,21,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711,0.656711,0.656711
16,17,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711,0.656711,0.656711
32,33,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711,0.656711,0.656711
57,58,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711,0.656711,0.656711
45,46,44 aspire human resource professional raleigh ...,raleigh durham north carolina area,0.125472,0.656711,0.656711,0.656711
