In [1]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

lemmatizer = WordNetLemmatizer()
w2v_model = KeyedVectors.load_word2vec_format('archive/GoogleNews-vectors-negative300.bin', binary=True)

In [2]:
resume_df = pd.read_excel('resume_dataset.xlsx')
jd_df = pd.read_excel('JD_dataset.xlsx')

In [3]:
resume_df.head()

Unnamed: 0,pdf_name,category,skills,education
0,10554236,ACCOUNTANT,Accounting; General Accounting; Accounts Payab...,Northern Maine Community College 1994 Associat...
1,10674770,ACCOUNTANT,"accounting, accounts payable, Accounts Receiva...","Bachelor of Science : Accounting , May 2010 Un..."
2,11163645,ACCOUNTANT,"accounts payables, accounts receivables, Accou...",Computer Applications Specialist Certificate P...
3,11759079,ACCOUNTANT,"accounting, balance sheet, budgets, client, cl...","EMORY UNIVERSITY, Goizueta Business School 5 2..."
4,12065211,ACCOUNTANT,Aderant/CMSExcelQuickBooks ProSQLAccessÂ Peach...,Bachelor of Business Administration : Accounti...


In [4]:
jd_df.head()

Unnamed: 0,company_name,job_description
0,Google,minimum qualifications\nbachelors degree or eq...
1,Apple,description\nas an asc you will be highly infl...
2,Netflix,its an amazing time to be joining netflix as w...
3,Robert Half,description\n\nweb designers looking to expand...
4,TrackFive,at trackfive weve got big goals were on a miss...


In [5]:
resume_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4968 entries, 0 to 4967
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pdf_name   4968 non-null   int64 
 1   category   4968 non-null   object
 2   skills     2316 non-null   object
 3   education  2409 non-null   object
dtypes: int64(1), object(3)
memory usage: 155.4+ KB


In [6]:
jd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     15 non-null     object
 1   job_description  15 non-null     object
dtypes: object(2)
memory usage: 368.0+ bytes


### removing null values

In [7]:
resume_df.isnull().sum()

pdf_name        0
category        0
skills       2652
education    2559
dtype: int64

In [8]:
resume_df = resume_df.dropna()
resume_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2250 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pdf_name   2250 non-null   int64 
 1   category   2250 non-null   object
 2   skills     2250 non-null   object
 3   education  2250 non-null   object
dtypes: int64(1), object(3)
memory usage: 87.9+ KB


### tokenize_preprocess

In [9]:
def tokenize_preprocess(text):
    tokens = word_tokenize(text.lower())  
    tokens = [token for token in tokens if token not in stopwords.words('english')] 
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

In [10]:
tokenize_cols = ['skills', 'education'] 
for column in tokenize_cols:
    new_col = f'{column}_tokens'
    resume_df[new_col] = resume_df[column].apply(tokenize_preprocess)

In [11]:
jd_df['jd_tokens'] = jd_df['job_description'].apply(tokenize_preprocess) 

### embedding 

In [12]:
def get_embeddings(tokens):
    embeddings = [w2v_model[token] for token in tokens if token in w2v_model]
    if embeddings:
        return np.mean(embeddings, axis=0)  
    else:
        return np.zeros(w2v_model.vector_size)  

In [13]:
resume_df['skills_embeddings'] = resume_df['skills_tokens'].apply(get_embeddings)
resume_df['education_embeddings'] = resume_df['education_tokens'].apply(get_embeddings)
jd_df['jd_embeddings'] = jd_df['jd_tokens'].apply(get_embeddings)

In [14]:
resume_df.head()

Unnamed: 0,pdf_name,category,skills,education,skills_tokens,education_tokens,skills_embeddings,education_embeddings
0,10554236,ACCOUNTANT,Accounting; General Accounting; Accounts Payab...,Northern Maine Community College 1994 Associat...,"[accounting, general, accounting, account, pay...","[northern, maine, community, college, 1994, as...","[-0.047740392, 0.06525966, 0.07109724, -0.0138...","[-0.020326272, -0.0030079377, 0.002674592, 0.0..."
1,10674770,ACCOUNTANT,"accounting, accounts payable, Accounts Receiva...","Bachelor of Science : Accounting , May 2010 Un...","[accounting, account, payable, account, receiv...","[bachelor, science, accounting, may, 2010, uni...","[0.026716717, 0.021528218, -0.046174817, 0.043...","[-0.00047751033, 0.011287914, 0.04041784, 0.09..."
2,11163645,ACCOUNTANT,"accounts payables, accounts receivables, Accou...",Computer Applications Specialist Certificate P...,"[account, payable, account, receivables, accou...","[computer, application, specialist, certificat...","[0.0036406822, 0.009838997, -0.027495688, 0.02...","[-0.04904879, -0.032334547, 0.036433294, 0.018..."
3,11759079,ACCOUNTANT,"accounting, balance sheet, budgets, client, cl...","EMORY UNIVERSITY, Goizueta Business School 5 2...","[accounting, balance, sheet, budget, client, c...","[emory, university, goizueta, business, school...","[0.0153676085, 0.031360827, -0.0077594956, 0.0...","[0.047958374, -0.03894806, 0.07452488, 0.13410..."
4,12065211,ACCOUNTANT,Aderant/CMSExcelQuickBooks ProSQLAccessÂ Peach...,Bachelor of Business Administration : Accounti...,"[aderant/cmsexcelquickbooks, prosqlaccessâ, pe...","[bachelor, business, administration, accountin...","[-0.02050895, 0.007436717, -0.025012804, 0.021...","[-0.030963643, 0.019262696, 0.019260596, 0.067..."


In [15]:
jd_df.head()

Unnamed: 0,company_name,job_description,jd_tokens,jd_embeddings
0,Google,minimum qualifications\nbachelors degree or eq...,"[minimum, qualification, bachelor, degree, equ...","[-0.020387437, -0.028363038, -0.022050206, 0.0..."
1,Apple,description\nas an asc you will be highly infl...,"[description, asc, highly, influential, growin...","[-0.03359191, 0.03048288, -0.049520936, 0.0413..."
2,Netflix,its an amazing time to be joining netflix as w...,"[amazing, time, joining, netflix, continue, tr...","[-0.036164533, 0.022386337, -0.017780218, 0.02..."
3,Robert Half,description\n\nweb designers looking to expand...,"[description, web, designer, looking, expand, ...","[-0.023422241, -0.018634059, -0.023095967, 0.0..."
4,TrackFive,at trackfive weve got big goals were on a miss...,"[trackfive, weve, got, big, goal, mission, rev...","[0.013022461, 0.010071418, -0.003691656, 0.064..."


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

top_n = 5  

matching_resumes = {}

for _, jd_row in jd_df.iterrows():
    jd_embeddings = jd_row['jd_embeddings']
    resume_df['similarity'] = resume_df.apply(lambda row: cosine_similarity([jd_embeddings], [row['skills_embeddings'], row['education_embeddings']])[0][0], axis=1)
    top_matches = resume_df.nlargest(top_n, 'similarity')
    matching_resumes[jd_row['company_name']] = top_matches

In [17]:
for company_name, top_matches in matching_resumes.items():
    jd_row = jd_df[jd_df['company_name'] == company_name].iloc[0]  
    job_description = jd_row['job_description']
    
    print('*'*20, f"{company_name}", '*'*20)
    #print(f"{job_description}")
    
    for _, row in top_matches.iterrows():
        pdf_name = row['pdf_name']
        similarity_score = row['similarity']
        print(f"{pdf_name} : {similarity_score:.4f}")
    
    print("\n")  

******************** Google ********************
33381211 : 0.9277
13328680 : 0.8964
88907739 : 0.8947
13405733 : 0.8889
12488356 : 0.8883


******************** Apple ********************
67501448 : 0.8706
23032182 : 0.8564
13328680 : 0.8528
13386301 : 0.8518
18208580 : 0.8455


******************** Netflix ********************
71767359 : 0.9040
13328680 : 0.9011
19497420 : 0.8991
23673025 : 0.8963
14611516 : 0.8867


******************** Robert Half ********************
71767359 : 0.8689
29184740 : 0.8619
82929064 : 0.8578
13328680 : 0.8564
19497420 : 0.8498


******************** TrackFive ********************
71767359 : 0.8933
13964744 : 0.8866
27152464 : 0.8852
38698573 : 0.8706
33381211 : 0.8701


******************** DesignUps ********************
71767359 : 0.8635
13964744 : 0.8412
29184740 : 0.8358
26942552 : 0.8286
32067700 : 0.8283


******************** Equisolve, Inc. ********************
71767359 : 0.9220
29764492 : 0.9151
13964744 : 0.9071
19497420 : 0.8993
28628090 : 0.