## Matching jobs titles to resumes
The challenges in this mini project:<br>
1. The resumes and the jobs were in two different file types.
2. They had to be loaded into the jupyter NB using different methods. 
3. Both were loaded into dictionaries
4. They were converted to dataframes and cleaned.
5. Words were vectorized and then compared using consine similarity. 

In [150]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import glob
import docx2txt
import pickle
import re
import os

In [126]:
# Load job descriptions and resumes into pandas dictionary
resume = {}
for file in glob.glob("/Users/dahliashamir/Documents/GitHub/NLP_project/CVs/*.docx"):
    resume[file.split('.')[0]] = docx2txt.process(file)


In [131]:
resume_new = {}
for key in resume:
    key.replace(key, key.title())
    new_key = key.lstrip('/Users/dahliashamir/Documents/GitHub/NLP_project/')
    resume[new_key] = resume.pop(key)

In [132]:
resume.keys()

dict_keys(['-Designer-Resume-Sample-MSWord-Download', 'Business-Analyst-Resume-Example-MSWord-Download', 'Electrician-Resume-Sample-MSWord-Download', 'Accounting-Resume-Sample-MSWord-Download', '-Service-Representative-Resume-Sample-MSWord-Download', 'y-hotel-front-desk-resume-sample-MSWord-download', '-Analyst-Resume-Example-MSWord-Download', 'English-Tutor-Resume-Sample-MSWord-Download', 'Shipping-and-Receiving-Clerk-Resume-Sample-MSWord-Download', 'Experienced_Full-Stack_Developer', '-stylist-resume-sample-MSWord-Download', 'Music-Resume-Sample-MSWord-Download', 'School-Bus-Driver-Resume-Sample-MSWord-Download', 'keeping-Resume-Sample-MSWord-Download', '-janitor-resume-sample-MSWord-download', '-Labor-Resume-Sample-MSWord-Download', 'Sales-Associate-Resume-Sample-MSWord-Download (1)', 'Truck-Driver-Resume-Sample-MSWord-Download', 'Flight-Attendant-Resume-MSWord-Download', 'Bank-Teller-Resume-Sample-MSWord-Download', 'Bookkeeper-Resume-Sample-MSWord-Download', '-Driver-Resume-Sample-

In [133]:
#clean resume dictionary and prepare for resume dataframe
stop_words = set(stopwords.words('english'))

def resume_dict_to_df(text):
    # make lower case
    low_case = {key: value.lower() for (key, value) in text.items()}
    low_case_key = dict((k.lower(), v) for k, v in low_case.items()) 
    # remove underscore
    no_underscore_resume = {key: value.replace('__', ' ') for (key, value) in low_case_key.items()}
    # remove white spaces
    no_wspace_resume = {key: value.replace("   ", "") for (key, value) in no_underscore_resume.items()}
    # remove \n from values
    nresume = {key: value.replace("\n", "") for (key, value) in no_wspace_resume.items()}
    #dashes_in_key
    final_resume = {key.replace("-", " "): value for (key, value) in nresume.items()}
    resume_df = pd.DataFrame(final_resume, index=[0]).T
    resume_df.reset_index(inplace = True)
    resume_df.rename(columns = {'index':'doc_names', 0: 'words'}, inplace = True)
    return resume_df

resume_df = clean_resume_dict_text(resume)
resume_df.head(2)


Unnamed: 0,doc_names,words
0,designer resume sample msword download,graphic designer resume sample(xxx)-xxx-xxxx |...
1,business analyst resume example msword download,business analyst resume sample(xxx)-xxx-xxxx |...


In [134]:
# Uploading pickle files to dictionary.
directory = '/Users/dahliashamir/Documents/GitHub/NLP_project/positions'
os.chdir(directory)
# Create empty dictionary to save data
pos_dict = {}

# Loop over files and read pickles
for file in os.listdir(directory):
    if file.endswith('.pkl') :
        with open(file, 'rb') as f:
            pos_dict[file.split('.')[0]] = pickle.load(f)

In [135]:
#pos_dict had many unnecessary detail. This was simplified to job_kw dictionary (one key and one value).
job_kw = {}
for v in pos_dict.values():
    job_kw.update({v['title'] : v['basic_qualifications']})
    
#print(job_kw['Senior UX Designer, AWS Honeycode'])

In [136]:
# Convert resume dict to resume df
def postion_dict_to_df(job_kw):
    positions_df = pd.DataFrame(job_kw, index=[0]).T
    positions_df.reset_index(inplace = True)
    #positions_df.columns
    positions_df.rename(columns = {'index':'job_title', 0: 'job_description'}, inplace = True)
    return positions_df

positions_df = postion_dict_to_df(job_kw)

In [137]:
# Clean job descriptions and resumes by removing punctuation, stop words, and converting all text to lowercase.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = ''.join([word for word in text if word.isalnum() or word.isspace()])
    tokens = word_tokenize(text)
    text = [word for word in tokens if word not in stop_words]
    return ' '.join(text)

positions_df['clean_description'] = positions_df['job_description'].apply(lambda x: clean_text(x))
resume_df['clean_resume'] = resume_df['words'].apply(lambda x: clean_text(x))

In [149]:
# Match job descriptions to resumes using cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer()
tfidf_jobs = tfidf_vectorizer.fit_transform(positions_df['clean_description'])
tfidf_resumes = tfidf_vectorizer.transform(resume_df['clean_resume'])

matches = []
for i, resume in enumerate(tfidf_resumes):
    similarities = cosine_similarity(resume, tfidf_jobs)[0]
    job_index = similarities.argmax()
    job_title = positions_df.iloc[job_index]['job_title']
    matches.append({'resume_index': i, 'job_title': job_title})

# Print the matches
for match in matches:
    print(f"Resume {match['resume_index']} matches with job '{match['job_title']}'")
print(resume_df['clean_resume'])

Resume 0 matches with job 'EE Lead CTA'
Resume 1 matches with job 'Consumer Insights Manager, Payments Brand & Insights'
Resume 2 matches with job 'Hardware Development Manager, AWS Server Engineering'
Resume 3 matches with job 'Tax Manager, Prime Video, Digital Tax '
Resume 4 matches with job 'Cloud Support Eng. I (DEP)'
Resume 5 matches with job 'Partner Funding PM, AWS Partner Scalable GTM Migration & Funding Programs'
Resume 6 matches with job 'Business Intel Engineer'
Resume 7 matches with job 'Principal Research Scientist, Modeling and Optimization'
Resume 8 matches with job 'Warehouse Specialist, Failure Analysis'
Resume 9 matches with job 'Principal Solution Architect (SA), Application Modernization Lab, Application Modernization Lab'
Resume 10 matches with job 'Principal Business Development Lead – Greenfield GTM Strategy'
Resume 11 matches with job 'Global Latin Music Programmer, Music Industry'
Resume 12 matches with job 'Amazon Fresh Overnight Grocery Associate - Woodland H