# Cleaning Resume into cleaned_Resume.csv

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
resume_data = pd.read_csv('/Users/danielwang/EpicHire/Data-Science-Team/data/Resume.csv')
print("Original Resume Data:")
print(resume_data.head())

Original Resume Data:
         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [11]:
# Clean the resume dataset to only include the Resume_str
resume_data = resume_data[['ID', 'Resume_str', 'Category']]
print("Cleaned Resume Data:")
print(resume_data.head())

Cleaned Resume Data:
         ID                                         Resume_str Category
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...       HR
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...       HR
2  33176873           HR DIRECTOR       Summary      Over 2...       HR
3  27018550           HR SPECIALIST       Summary    Dedica...       HR
4  17812897           HR MANAGER         Skill Highlights  ...       HR


In [12]:
# Cleaned Resume
filtered_resume_path = '/Users/danielwang/EpicHire/Data-Science-Team/data/cleaned_Resume.csv' 
resume_data.to_csv(filtered_resume_path, index=False)
print(f"Filtered resume data saved to {filtered_resume_path}")

Filtered resume data saved to /Users/danielwang/EpicHire/Data-Science-Team/data/cleaned_Resume.csv


# Begin Matching datasets

In [10]:
# Job dataset loaded
job_data = pd.read_csv('/Users/danielwang/EpicHire/Data-Science-Team/data/Technology_job_postings.csv')
print("Job Descriptions Data:")
print(job_data.head())

Job Descriptions Data:
             Job Id                           Company  \
0  1089843540111562                 Icahn Enterprises   
1   398454096642776      PNC Financial Services Group   
2   481640072963533  United Services Automobile Assn.   
3   688192671473044                              Hess   
4   117057806156508                      Cairn Energy   

                      Job Title Job Description Qualifications     Experience  \
0  Digital Marketing Specialist             NaN         M.Tech  5 to 15 Years   
1                 Web Developer             NaN            BCA  2 to 12 Years   
2            Operations Manager             NaN            PhD  0 to 12 Years   
3              Network Engineer             NaN            PhD  4 to 11 Years   
4                 Event Manager             NaN            MBA  1 to 12 Years   

                        Role  \
0       Social Media Manager   
1     Frontend Web Developer   
2    Quality Control Manager   
3  Wireless Network

In [21]:
import re

# keywords = [
#     'developer', 'engineer', 'data', 'software', 'it', 'analyst', 'system',
#     'technician', 'programmer', 'social media', 'content', 'marketing',
#     'network', 'database', 'design', 'management', 'security'
# ]


# def preprocess_text(text):
#     words = str(text).lower().split()
#     return " ".join([word for word in words if word in keywords])

def extract_keywords_from_description(description):
    """
    Extracts keywords from the job description text by filtering out common stop words
    and punctuation, leaving unique, meaningful words.
    """
    # Basic text cleaning
    words = re.findall(r'\b\w+\b', str(description).lower())  # Find all word tokens and convert to lowercase
    keywords = set(words)  # Use a set to store unique words
    return " ".join(keywords)  # Convert back to a space-separated string

# Apply the function to extract keywords for each job in the job data
job_data['Job_Keywords'] = job_data['Job Description'].apply(extract_keywords_from_description)
print("Keywords extracted for each job description:")
print(job_data[['Job Title', 'Job_Keywords']].head())  # Display the first few rows for verification



Keywords extracted for each job description:
                      Job Title Job_Keywords
0  Digital Marketing Specialist          nan
1                 Web Developer          nan
2            Operations Manager          nan
3              Network Engineer          nan
4                 Event Manager          nan


In [12]:
job_data['Job_Keywords'] = job_data['Job Description'].apply(preprocess_text)
resume_data['Resume_Keywords'] = resume_data['Resume_str'].apply(preprocess_text)

# Check results
print("Processed Job Descriptions Data:")
print(job_data[['Job Id', 'Job Title', 'Job_Keywords']].head())

print("Processed Resume Data:")
print(resume_data[['ID', 'Resume_Keywords']].head())

Processed Job Descriptions Data:
             Job Id                     Job Title Job_Keywords
0  1089843540111562  Digital Marketing Specialist             
1   398454096642776                 Web Developer             
2   481640072963533            Operations Manager             
3   688192671473044              Network Engineer             
4   117057806156508                 Event Manager             
Processed Resume Data:
         ID                                    Resume_Keywords
0  16852973  management marketing marketing system manageme...
1  22323967  system marketing content design software manag...
2  33176873  management security security database manageme...
3  27018550  management management management management da...
4  17812897  management management data management manageme...


# Dont run cell below code it is old

In [18]:
# Matching function to compare resumes with job descriptions
def match_resumes_to_jobs(job_data, resume_data):
    matches = []

    # Vectorize the keywords to calculate similarity
    vectorizer = TfidfVectorizer()
    job_tfidf = vectorizer.fit_transform(job_data['Job_Keywords'])
    resume_tfidf = vectorizer.transform(resume_data['Resume_Keywords'])

    # Calculate cosine similarity between each resume and job description
    similarity_matrix = cosine_similarity(resume_tfidf, job_tfidf)

    # Get the keywords in the vocabulary to help identify matches
    vocab = vectorizer.get_feature_names_out()

    # Identify the best match for each resume
    for i, applicant_id in enumerate(resume_data['ID']):
        best_match_index = similarity_matrix[i].argmax()
        best_match_score = similarity_matrix[i][best_match_index]
        
        # Find matching keywords between resume and job description
        resume_keywords = set(resume_data.loc[i, 'Resume_Keywords'].split())
        job_keywords = set(job_data.loc[best_match_index, 'Job_Keywords'].split())
        matching_keywords = resume_keywords.intersection(job_keywords)
        
        # Convert matching keywords to a comma-separated string
        matching_keywords_str = ", ".join(matching_keywords)

        # Store the match information
        match_info = {
            'Applicant Id': applicant_id,
            'Matched Job Id': job_data.loc[best_match_index, 'Job Id'],
            'Matched Job Title': job_data.loc[best_match_index, 'Job Title'],
            'Match Score': best_match_score,
            'Matching Keywords': matching_keywords_str  # Add matching keywords
        }
        matches.append(match_info)

    # Return the results as a DataFrame
    return pd.DataFrame(matches)

# Run cell below instead

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def match_resumes_to_jobs(job_data, resume_data):
    matches = []
    vectorizer = TfidfVectorizer()

    # Transform job and resume keywords
    job_tfidf = vectorizer.fit_transform(job_data['Job_Keywords'])
    resume_tfidf = vectorizer.transform(resume_data['Resume_str'])  # Assuming resume text is in 'Resume_str'

    # Calculate cosine similarity between each resume and job description
    similarity_matrix = cosine_similarity(resume_tfidf, job_tfidf)
    vocab = vectorizer.get_feature_names_out()  # Get feature names for matching

    # Find the best match for each resume
    for i, applicant_id in enumerate(resume_data['ID']):
        best_match_index = similarity_matrix[i].argmax()
        best_match_score = similarity_matrix[i][best_match_index]
        
        # Get the specific matching keywords by comparing transformed vectors
        resume_keywords = set(resume_data.loc[i, 'Resume_str'].lower().split())
        job_keywords = set(job_data.loc[best_match_index, 'Job_Keywords'].split())
        matching_keywords = resume_keywords.intersection(job_keywords)
        
        matching_keywords_str = ", ".join(matching_keywords)

        # Store the match information
        match_info = {
            'Applicant Id': applicant_id,
            'Matched Job Id': job_data.loc[best_match_index, 'Job Id'],
            'Matched Job Title': job_data.loc[best_match_index, 'Job Title'],
            'Match Score': best_match_score,
            'Matching Keywords': matching_keywords_str  # Display the matching keywords
        }
        matches.append(match_info)

    # Return the results as a DataFrame
    return pd.DataFrame(matches)

In [23]:
# Running matching and show results
matches_df = match_resumes_to_jobs(job_data, resume_data)
print("Resume-Job Matches:")
print(matches_df.head())

Resume-Job Matches:
   Applicant Id    Matched Job Id                Matched Job Title  \
0      16852973   407980927519454                    HR Generalist   
1      22323967  1343787400703296                       Copywriter   
2      33176873   407980927519454                    HR Generalist   
3      27018550  2122997813566133  Customer Service Representative   
4      17812897   407980927519454                    HR Generalist   

   Match Score                                  Matching Keywords  
0     0.215434  training, programs, support, in, and, the, emp...  
1     0.230886  materials, brand, campaigns, creative, and, co...  
2     0.262458  training, support, provide, in, and, the, empl...  
3     0.224830  calls, a, over, call, and, experience, custome...  
4     0.332669  in, and, such, hr, training, to, various, prov...  


In [24]:
# Saving matches to a csv file
matches_df.to_csv('/Users/danielwang/EpicHire/Data-Science-Team/data/resume_job_matches.csv', index=False)
print("Matches saved to resume_job_matches.csv")

Matches saved to resume_job_matches.csv
