## Analyzing Resumes using Python Packages
In this analysis, we will leverage several Python packages to process and analyze a collection of resumes. The goal is to extract relevant information and compare the resumes based on their similarity to a given job description. 

In [16]:
import numpy as np
import pandas as pd
import os
from collections import Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader

In [17]:
cv = pd.read_csv("Resume/Resume.csv")

In [18]:
cv.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [19]:
# Get the unique categories
unique_categories = cv['Category'].unique()

# Print unique categories and their count
print(f"Unique Categories: {unique_categories}")
print(f"Number of Unique Categories: {cv['Category'].nunique()}")

Unique Categories: ['HR' 'ICT' 'SALES' 'BANKING']
Number of Unique Categories: 4


In [20]:
first_resume_str = cv.iloc[0]['Resume_str']
print(first_resume_str)

         HR ADMINISTRATOR/MARKETING ASSOCIATE

HR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strategies, inventory control, loss preve

In [21]:
hr_resumes = cv[cv['Category'] == 'HR']

# Display the first few rows of HR resumes
print(hr_resumes.head())

         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [22]:
# Number of HR resumes
num_hr_resumes = hr_resumes.shape[0]

# Overview of HR resumes data
print(f"Number of HR resumes: {num_hr_resumes}")
print(hr_resumes.info())

Number of HR resumes: 110
<class 'pandas.core.frame.DataFrame'>
Index: 110 entries, 0 to 109
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           110 non-null    int64 
 1   Resume_str   110 non-null    object
 2   Resume_html  110 non-null    object
 3   Category     110 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.3+ KB
None


In [23]:
# Combine all HR resumes into one large text
all_text = ' '.join(hr_resumes['Resume_str'].astype(str))

# Tokenize the text and filter out common stop words
words = re.findall(r'\b\w+\b', all_text.lower())
stop_words = set(["and", "the", "with", "to", "a", "of", "in", "for", "on", "at", "by", "is", "as", "from", "that", "or"])
filtered_words = [word for word in words if word not in stop_words]

# Get the most common words
common_words = Counter(filtered_words).most_common(20)
print(f"Most common words in HR resumes: {common_words}")

Most common words in HR resumes: [('hr', 1056), ('management', 949), ('employee', 828), ('company', 756), ('state', 666), ('human', 599), ('city', 568), ('all', 538), ('training', 502), ('employees', 500), ('new', 457), ('name', 455), ('resources', 416), ('business', 390), ('benefits', 388), ('skills', 376), ('development', 344), ('payroll', 333), ('performance', 317), ('process', 314)]


In [24]:
# Define HR-related keywords
hr_keywords = ['recruitment', 'employee relations', 'HRIS', 'benefits', 'training', 'compensation', 'labor relations', 'performance management']

# Filter resumes that contain HR keywords
relevant_hr_resumes = hr_resumes[hr_resumes['Resume_str'].str.contains('|'.join(hr_keywords), case=False)]

# Display the first few relevant HR resumes
print(relevant_hr_resumes.head())

         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [25]:
# Sample HR job description
job_description = """
    We are looking for an experienced HR professional with a strong background in recruitment, employee relations,
    and HRIS. The ideal candidate should have excellent communication skills and experience in benefits management and training.
"""

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(relevant_hr_resumes['Resume_str'])

# Transform the job description to a vector
job_vector = vectorizer.transform([job_description])

# Calculate cosine similarity
similarities = cosine_similarity(job_vector, tfidf_matrix).flatten()

# Add similarity scores to the DataFrame
relevant_hr_resumes['Similarity'] = similarities

# Rank resumes by similarity score
ranked_resumes = relevant_hr_resumes.sort_values(by='Similarity', ascending=False)

# Display the top ranked resumes
print(ranked_resumes.head())

           ID                                         Resume_str  \
53   13520837           HR REPRESENTATIVE       Summary    A ...   
27   29297973           HR REPRESENTATIVE       Summary    Ex...   
31   18316239           HR GENERALIST       Professional Prof...   
105  30563572           HR MANAGER/BUSINESS PARTNER          ...   
90   17150707           HR SHARED SERVICES ANALYST           ...   

                                           Resume_html Category  Similarity  
53   <div class="fontsize fontface vmargins hmargin...       HR    0.229908  
27   <div class="fontsize fontface vmargins hmargin...       HR    0.169738  
31   <div class="fontsize fontface vmargins hmargin...       HR    0.163598  
105  <div class="fontsize fontface vmargins hmargin...       HR    0.160427  
90   <div class="fontsize fontface vmargins hmargin...       HR    0.160191  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_hr_resumes['Similarity'] = similarities


ICT

In [27]:
# Directory path
directory_path = 'data/data/ICT'

# List all files in the directory
files = os.listdir(directory_path)

# Count all files
total_files = len(files)

print(f"Total files in '{directory_path}': {total_files}")

Total files in 'data/data/ICT': 68


In [28]:
# Filter resumes with the 'INFORMATION-TECHNOLOGY' category
it_resumes = cv[cv['Category'] == 'ICT']

# Display the first few rows of IT resumes
print(it_resumes.head())

           ID                                         Resume_str  \
110  36856210           INFORMATION TECHNOLOGY         Summar...   
111  21780877           INFORMATION TECHNOLOGY SPECIALIST\tGS...   
112  33241454           INFORMATION TECHNOLOGY SUPERVISOR    ...   
113  25990239           INFORMATION TECHNOLOGY INSTRUCTOR    ...   
114  16899268           INFORMATION TECHNOLOGY MANAGER/ANALYS...   

                                           Resume_html Category  
110  <div class="fontsize fontface vmargins hmargin...      ICT  
111  <div class="fontsize fontface vmargins hmargin...      ICT  
112  <div class="fontsize fontface vmargins hmargin...      ICT  
113  <div class="fontsize fontface vmargins hmargin...      ICT  
114  <div class="fontsize fontface vmargins hmargin...      ICT  


In [29]:
# Define IT-related keywords from the job description
it_keywords = [
    'monitoring', 'network', 'server', 'storage', 'troubleshoot', 'cloud', 
    'user support', 'security', 'automation', 'documentation', 'vendor management',
    'infrastructure', 'virtualization', 'IT infrastructure', 'CompTIA', 'Microsoft'
]

# Filter resumes by these keywords
relevant_it_resumes = it_resumes[it_resumes['Resume_str'].str.contains('|'.join(it_keywords), case=False)]

# Display the filtered IT resumes
print(relevant_it_resumes.head())

           ID                                         Resume_str  \
110  36856210           INFORMATION TECHNOLOGY         Summar...   
111  21780877           INFORMATION TECHNOLOGY SPECIALIST\tGS...   
112  33241454           INFORMATION TECHNOLOGY SUPERVISOR    ...   
113  25990239           INFORMATION TECHNOLOGY INSTRUCTOR    ...   
114  16899268           INFORMATION TECHNOLOGY MANAGER/ANALYS...   

                                           Resume_html Category  
110  <div class="fontsize fontface vmargins hmargin...      ICT  
111  <div class="fontsize fontface vmargins hmargin...      ICT  
112  <div class="fontsize fontface vmargins hmargin...      ICT  
113  <div class="fontsize fontface vmargins hmargin...      ICT  
114  <div class="fontsize fontface vmargins hmargin...      ICT  


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine job description into a single string
job_description = """
    IT Support Specialist Job at Dataposit. Responsibilities include monitoring IT systems, troubleshooting issues, 
    implementing upgrades, providing user support, maintaining security, automating tasks, and managing vendors. 
    Qualifications include IT infrastructure support experience, networking, servers, storage, cloud platforms, 
    relevant certifications, and excellent communication skills.
"""

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the resumes and transform both the resumes and job description
tfidf_matrix = vectorizer.fit_transform(relevant_it_resumes['Resume_str'])
job_vector = vectorizer.transform([job_description])

# Calculate cosine similarity between job description and each resume
similarities = cosine_similarity(job_vector, tfidf_matrix).flatten()

# Add similarity scores to the DataFrame
relevant_it_resumes['Similarity'] = similarities

# Rank resumes by similarity score
ranked_it_resumes = relevant_it_resumes.sort_values(by='Similarity', ascending=False)

# Display top-ranked resumes
print(ranked_it_resumes.head())

           ID                                         Resume_str  \
125  24913648           INFORMATION TECHNOLOGY SPECIALIST    ...   
135  25959103           ADMINISTRATOR OF INFORMATION TECHNOLO...   
146  22450718           INFORMATION TECHNOLOGY SPECIALIST    ...   
175  10089434           INFORMATION TECHNOLOGY TECHNICIAN I  ...   
171  57002858           INFORMATION TECHNOLOGY MANAGER       ...   

                                           Resume_html Category  Similarity  
125  <div class="fontsize fontface vmargins hmargin...      ICT    0.173599  
135  <div class="fontsize fontface vmargins hmargin...      ICT    0.171244  
146  <div class="fontsize fontface vmargins hmargin...      ICT    0.142471  
175  <div class="fontsize fontface vmargins hmargin...      ICT    0.133627  
171  <div class="fontsize fontface vmargins hmargin...      ICT    0.132806  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_it_resumes['Similarity'] = similarities


In [32]:
# Directory containing PDF resumes
directory = 'data/data/ICT'

# List to store resumes
pdf_resumes = []

# Iterate over all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.pdf'):
        filepath = os.path.join(directory, filename)
        
        # Open the PDF file
        with open(filepath, 'rb') as file:
            reader = PdfReader(file)
            text = ""
            
            # Extract text from each page
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text() or ""
                
            # Append the extracted text to the list
            pdf_resumes.append((filename, text))

# Display the filenames and the first 500 characters of the first resume as a sample
for resume in pdf_resumes[:1]:
    print(f"Filename: {resume[0]}")
    print(f"Content: {resume[1][:500]}")

Filename: 10247517.pdf
Content: INFORMATION TECHNOLOGY MANAGER
Professional Summary
Possesses an extensive background in Information Technology Management, along with a Masters of Science degree and multiple certifications.
Excels in planning, implementing, and evaluating the systems, infrastructure, and staffing necessary to execute complex initiatives and meet
deadlines in dynamic, fast-paced environments; adept at overseeing and participating in the installation, configuration, maintenance, and upgrade of
networks, hardware


In [33]:
# Keywords to look for in the resumes
keywords = [
    'monitoring', 'network', 'server', 'storage', 'troubleshoot', 'cloud', 
    'user support', 'security', 'automation', 'documentation', 'vendor management',
    'infrastructure', 'virtualization', 'IT infrastructure', 'CompTIA', 'Microsoft'
]

# Function to count keyword occurrences
def count_keywords(text, keywords):
    keyword_count = {key: len(re.findall(rf'\b{key}\b', text, re.IGNORECASE)) for key in keywords}
    return sum(keyword_count.values())

# Analyze each resume for keyword occurrence
resume_analysis = [(filename, count_keywords(text, keywords)) for filename, text in pdf_resumes]

# Sort resumes by keyword count in descending order
sorted_resumes = sorted(resume_analysis, key=lambda x: x[1], reverse=True)

# Display the top resumes
for resume in sorted_resumes[:5]:
    print(f"Filename: {resume[0]}, Keyword Matches: {resume[1]}")

Filename: 20824105.pdf, Keyword Matches: 73
Filename: 31243710.pdf, Keyword Matches: 66
Filename: 24913648.pdf, Keyword Matches: 47
Filename: 15651486.pdf, Keyword Matches: 44
Filename: 18301617.pdf, Keyword Matches: 44


In [34]:
# Job description text
job_description = """
    IT Support Specialist Job at Dataposit. Responsibilities include monitoring IT systems, troubleshooting issues, 
    implementing upgrades, providing user support, maintaining security, automating tasks, and managing vendors. 
    Qualifications include IT infrastructure support experience, networking, servers, storage, cloud platforms, 
    relevant certifications, and excellent communication skills.
"""

# Combine resume texts into a list
resume_texts = [text for _, text in pdf_resumes]

# Vectorize the texts
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(resume_texts)
job_vector = vectorizer.transform([job_description])

# Calculate cosine similarity
similarities = cosine_similarity(job_vector, tfidf_matrix).flatten()

# Combine filenames with similarity scores
resume_similarity = [(filename, similarity) for filename, similarity in zip([filename for filename, _ in pdf_resumes], similarities)]

# Sort by similarity
ranked_resumes = sorted(resume_similarity, key=lambda x: x[1], reverse=True)

# Display the top ranked resumes
for resume in ranked_resumes[:5]:
    print(f"Filename: {resume[0]}, Similarity Score: {resume[1]}")


Filename: 24913648.pdf, Similarity Score: 0.17799488076362488
Filename: 25959103.pdf, Similarity Score: 0.17582032257658103
Filename: 57002858.pdf, Similarity Score: 0.1347462920681473
Filename: 16533554.pdf, Similarity Score: 0.13398807964885043
Filename: 26480367.pdf, Similarity Score: 0.13241146831504744
