In [79]:
import pandas as pd
import re
from stemming.porter2 import stem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [80]:
df = pd.read_csv("potential-talents.csv")
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


Turn the connections variable into a continuous numerical type bounded at 500

In [81]:
def map_conns(connection):
    connection = connection.strip()
    if connection[-1] == "+":
        connection = connection[:-1]
    connection = int(connection)
    return connection
df.connection = df.connection.apply(map_conns)

# Preprocessing

Split text into tokens on any non-alphanumeric character and apply stemming. Then expand out any common abbreaviations e.g. HR -> Human Resources.

In [82]:
expander = {
    "HR": "Human Resources",
    "SVP": "Senior Vice President",
    "CHRO": "Chief Human Resources Officer",
    "CSR": "Corporate Social Responsibility",
    "MES": "Manufacturing Execution System"
}

In [83]:
sw = set()
with open("sw.txt", "r") as s:
    for word in s:
        sw.add(word[:-1])
def tokenize(text):
    output = []
    for word in re.split('[^a-z\d]', text.lower()):
        if word and word not in sw:
            output.append(stem(word))
    return output

expander = {" ".join(tokenize(x)): " ".join(tokenize(y)) for (x,y) in expander.items()}

def pp(text):
    t = tokenize(text)
    for i in range(len(t)):
        t[i] = expander.get(t[i], t[i])
    return " ".join(t)

In [84]:
pp("college graduate in hr")

'colleg graduat human resourc'

In [85]:
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500,


# Resources

- https://en.wikipedia.org/wiki/Learning_to_rank

In [86]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["job_title"].apply(pp))

In [87]:
vectorizer_loc = TfidfVectorizer()
tfidf_matrix_loc = vectorizer_loc.fit_transform(df["location"].apply(pp))

In [88]:
query = [pp("Aspiring human resources")]
query_tfidf = vectorizer.transform(query)
cosine_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()
df.fit = cosine_similarities

In [89]:
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.272019
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500,0.0
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.742704
3,4,People Development Coordinator at Ryan,"Denton, Texas",500,0.0
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500,0.0


In [90]:
df.sort_values("fit", ascending=False)

Unnamed: 0,id,job_title,location,connection,fit
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.742704
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.742704
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.742704
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.742704
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.742704
...,...,...,...,...,...
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500,0.000000
17,18,People Development Coordinator at Ryan,"Denton, Texas",500,0.000000
79,80,Junior MES Engineer| Information Systems,"Myrtle Beach, South Carolina Area",52,0.000000
15,16,Native English Teacher at EPIK (English Progra...,Kanada,500,0.000000


In [91]:
import random
starred = random.randint(0, len(df))
df.iloc[starred]

id                                                           76
job_title     Aspiring Human Resources Professional | Passio...
location                                     New York, New York
connection                                                  212
fit                                                    0.207921
Name: 75, dtype: object

In [92]:
_, title, location, connection, fit = df.iloc[starred]

In [93]:
query = [pp(title)]
query_tfidf = vectorizer.transform(query)
title_score = linear_kernel(query_tfidf, tfidf_matrix).flatten()
title_score

array([0.10253363, 0.        , 0.27995104, 0.        , 0.        ,
       0.14079422, 0.08698298, 0.05671747, 0.08698298, 0.03927949,
       0.        , 0.01593981, 0.03483164, 0.10253363, 0.10253363,
       0.        , 0.27995104, 0.        , 0.10253363, 0.        ,
       0.27995104, 0.        , 0.        , 0.14079422, 0.08698298,
       0.05671747, 0.0814978 , 0.05202175, 0.0814978 , 0.05202175,
       0.10253363, 0.        , 0.27995104, 0.        , 0.        ,
       0.14079422, 0.08698298, 0.05671747, 0.08698298, 0.03927949,
       0.        , 0.01593981, 0.03483164, 0.10253363, 0.        ,
       0.27995104, 0.        , 0.        , 0.14079422, 0.08698298,
       0.05671747, 0.08698298, 0.03927949, 0.        , 0.01593981,
       0.03483164, 0.10253363, 0.27995104, 0.        , 0.14079422,
       0.05671747, 0.03927949, 0.        , 0.01593981, 0.03483164,
       0.12801924, 0.10524485, 0.04504835, 0.02613619, 0.02134454,
       0.03533021, 0.06757189, 0.11604259, 0.23545425, 0.05599

In [94]:
query = [pp(location)]
query_tfidf = vectorizer_loc.transform(query)
location_score = linear_kernel(query_tfidf, tfidf_matrix_loc).flatten()
location_score

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.71268297, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.71268297, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.71268297, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.71268297, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.71268297,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.71268297, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [95]:
scores = {
    "title_score": 0.7,
    "location_score": 0.2,
    "connection_score": 0.1
}
df.fit = title_score*scores["title_score"] + location_score*scores["location_score"] + df.connection*scores["connection_score"]

In [96]:
df.sort_values("fit", ascending=False)

Unnamed: 0,id,job_title,location,connection,fit
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500,50.174070
66,67,"Human Resources, Staffing and Recruiting Profe...","Jackson, Mississippi Area",500,50.073671
28,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500,50.057048
26,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500,50.057048
37,38,HR Senior Specialist,San Francisco Bay Area,500,50.039702
...,...,...,...,...,...
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.341093
62,63,Student at Chapman University,"Lake Forest, California",2,0.200000
53,54,Student at Chapman University,"Lake Forest, California",2,0.200000
10,11,Student at Chapman University,"Lake Forest, California",2,0.200000


# Summary

This project concerned an NLP similarity searching task. For the initial stage, after preprocessing both the query and the job title text by splitting on non-alphanumeric characters, stemming and expanding abbreviations, we create vector embeddings of the text using TfIdf. Other vector space models that were considered are GloVe and Word2Vec. We calculate the cosine similarity between the query vector and the vectors of every job title in the data and sort on this to obtain the most relevant candidates. For the second component, we select a candidate at random whom we pretend is the one selected as the best fit. To rerank the candidates, we run the same similarity search, with the query being the title of the chosen candidate. We also run the same TfIdf similarity search with the locations to try and obtain candidates whose locations match as much as possible. Finally we also consider the number of connections each candidate has as the larger the number of connections, the greater the candidate's network and influence, something which is especially useful for an HR role. These 3 scores are combined into 1 using a linear combination weighted 0.7, 0.2, 0.1 respectively. These weights are free to change as per project / domain requirements. We can filter out candidates who should not be on this list by taking out those candidates with a score of 0 in the initial stage.