In [29]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import spacy
import re
import nltk

# Build and train models
import keras_tuner as kt
from transformers import BertTokenizer, TFBertForSequenceClassification
import faiss
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





In [12]:
# Load resume data
resume_df = pd.read_csv('Resume.csv')
resume_df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [13]:
# Load JD data
job_desc_df = pd.read_csv('job_description.csv')
job_desc_df.head()

Unnamed: 0.1,Unnamed: 0,Category,Description,Benefits,Requirement,Requirements
0,0,Business Analyst,Description\nJob Title: Junior Business Analys...,Benefits\nMethods is passionate about its peop...,Requirements\n• Confidence in communicating an...,
1,1,Business Analyst,Description\nThe Business Analyst (BA) will ca...,Benefits\nA competitive base salary\nBonus sch...,Requirements\nYou have…\nA methodical approach...,
2,2,Business Analyst,Description\nWe help our clients design and de...,Benefits\nWe have 2 promotion windows open eac...,"Requirements\nYou are open, curious, and excit...",
3,3,Business Analyst,Description\nFounded in 2015 and rapidly expan...,Benefits\nIn addition to a competitive package...,Requirements\nKey Responsibilities Include:\nP...,
4,4,Business Analyst,Description\nWe are seeking a skilled Business...,Benefits\nFood Allowance\nGovernment Benefits\...,Requirements\nBachelor's Degree in management ...,


## Data exploration and pre processing

In [14]:
# Check for unique values in the 'category' column
resume_df['Category'].unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [15]:
job_desc_df['Category'].unique()

array(['Business Analyst', 'Cloud', 'Data Scientist', 'HR',
       'Software Developer', 'UI/UX'], dtype=object)

In [16]:
#replace the column with IT
resume_df['Category'].replace({'INFORMATION-TECHNOLOGY': 'IT'}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  resume_df['Category'].replace({'INFORMATION-TECHNOLOGY': 'IT'}, inplace=True)


In [18]:
# Filter the data to include the HR and IT categories only
IT_resume = resume_df[resume_df['Category'] == 'HR']
HR_resume = resume_df[resume_df['Category'] == 'IT']

# Print the length of the HR and IT jobs
print(f"Found {len(HR_resume)} HR resumes")
print(f"Found {len(IT_resume)} IT resumes")

Found 120 HR resumes
Found 110 IT resumes


In [None]:
# Replace the Category column values with IT in the job description dataframe
job_desc_df['Category'].replace({'Cloud': 'IT', 'Data Scientist': 'IT', 'Software Developer': 'IT','UI/UX': 'IT'}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  job_desc_df['Category'].replace({'Cloud': 'IT', 'Data Scientist': 'IT', 'Software Developer': 'IT','UI/UX': 'IT'}, inplace=True)


In [20]:
# Merge the requirement and requirements columns into a single column
job_desc_df['All_Requirements'] = job_desc_df['Requirement'].fillna('') + ' ' + job_desc_df['Requirements'].fillna('')

# Drop the original columns
job_desc_df.drop(columns=['Requirement', 'Requirements'], inplace=True)

In [22]:
# Combine all relevant job description fields into one string
job_desc_df['job_txt'] = (job_desc_df[['Description', 'All_Requirements']].fillna('').agg(' '.join, axis=1))

# Drop unnecessary columns for clarity
job_desc_df = job_desc_df[['Category', 'job_txt']]

# Display preview
job_desc_df.head()

Unnamed: 0,Category,job_txt
0,Business Analyst,Description\nJob Title: Junior Business Analys...
1,Business Analyst,Description\nThe Business Analyst (BA) will ca...
2,Business Analyst,Description\nWe help our clients design and de...
3,Business Analyst,Description\nFounded in 2015 and rapidly expan...
4,Business Analyst,Description\nWe are seeking a skilled Business...


In [23]:
# Filter HR and IT job descriptions
HR_jobs = job_desc_df[job_desc_df['Category'] == 'HR']
IT_jobs = job_desc_df[job_desc_df['Category'] == 'IT']


# Print the length of the HR and IT jobs
print(f"Found {len(HR_jobs)} HR job descriptions")
print(f"Found {len(IT_jobs)} IT job descriptions")

Found 32 HR job descriptions
Found 239 IT job descriptions


In [27]:
# combine the filtered resume dataframes into one
filtered_resume = pd.concat([HR_resume, IT_resume], ignore_index=True)
# combine the filtered job description dataframes into one
filtered_job_desc = pd.concat([HR_jobs, IT_jobs], ignore_index=True)


## Extract information from the filtered resume dataset

In [None]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define keywords for fallback extraction (the keywords are used to extract the education, skills and certification from the resume)
education_keywords = ['bachelor', 'master', 'phd', 'b.sc', 'm.sc', 'mba', 'btech', 'mtech', 'degree']
certification_keywords = ['certified', 'certification', 'certificate', 'cfa', 'pmp', 'aws', 'scrum']
skill_keywords = [
    'python', 'sql', 'excel', 'machine learning', 'deep learning','project management', 'data analysis', 
    'c++', 'java', 'javascript','html', 'css', 'docker', 'kubernetes', 'cloud computing', 
    'lawson', 'paychex', 'kronox', 'adp', 'ms office', 'microsoft office', 'hr', 'human resources', 
    'recruitment', 'talent acquisition', 'employee relations', 'performance management', 'word', 'access',
    'publisher', 'outlook', 'powerpoint', 'sharepoint', 'quickbooks', 'payroll', 'benefits',
    'compliance', 'employee engagement', 'training and development', 'organizational development'   
]

def extract_features(text):
    doc = nlp(text)

    # Containers
    education = set()
    certifications = set()
    skills = set()
    experience_years = 0

    # Named Entity Recognition
    for ent in doc.ents:
        ent_text = ent.text.lower()

        # Check for education-related content
        if ent.label_ in ['ORG', 'EDUCATION', 'WORK_OF_ART']: 
            if any(kw in ent_text for kw in education_keywords):
                education.add(ent.text)

        # Check for certifications
        if any(cert in ent_text for cert in certification_keywords):
            certifications.add(ent.text)

        # Check for experience in years
        match = re.search(r'(\d{1,2})\+?\s?(years?|yrs?)', ent_text)
        if match:
            experience_years = max(experience_years, int(match.group(1)))

    # Check for skills based on keywords
    lower_text = text.lower()
    for skill in skill_keywords:
        if skill in lower_text:
            skills.add(skill)

    return pd.Series({
        'Education': ' | '.join(education),
        'Certifications': ' | '.join(certifications),
        'Experience (Years)': experience_years,
        'Skills': ', '.join(skills)
    })

# Apply to your dataframe
features_df = filtered_resume['Resume_str'].apply(extract_features)
final_df = pd.concat([filtered_resume, features_df], axis=1)

# Preview
final_df[['ID', 'Education', 'Skills', 'Experience (Years)', 'Certifications']].head()

Unnamed: 0,ID,Education,Skills,Experience (Years),Certifications
0,36856210,,"compliance, excel, hr, html",37,
1,21780877,,"hr, access, project management",0,
2,33241454,,"hr, project management, sharepoint, access, excel",5,Education Certification | Certificate of ...
3,25990239,,"hr, project management, outlook, powerpoint, h...",0,
4,16899268,,"data analysis, hr, project management",0,Certified Financial Planner | State Marketi...


## Modelling with S-BERT

In [30]:
# Load the pretrained BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed and normalize the datasets
def embed_and_normalize(texts):
    embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings

# Embed and normalize the resumes
hr_resume_embedds = embed_and_normalize(HR_resume['Resume_str'].tolist())
it_resume_embedds = embed_and_normalize(IT_resume['Resume_str'].tolist())

# Embed and normalize the job descriptions
hr_job_embedds = embed_and_normalize(HR_jobs['job_txt'].tolist())
it_job_embedds = embed_and_normalize(IT_jobs['job_txt'].tolist())

Batches: 100%|██████████| 4/4 [00:15<00:00,  3.82s/it]
Batches: 100%|██████████| 4/4 [00:08<00:00,  2.13s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
Batches: 100%|██████████| 8/8 [00:32<00:00,  4.12s/it]
