In [1]:
import os
import pdfplumber
import re


In [2]:
RESUME_FOLDER = ".."



In [3]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text


In [4]:
resumes = {}

for file in os.listdir(RESUME_FOLDER):
    if file.endswith(".pdf"):
        file_path = os.path.join(RESUME_FOLDER, file)
        resumes[file] = extract_text_from_pdf(file_path)

print("Total resumes loaded:", len(resumes))



Total resumes loaded: 5


In [5]:
for filename, text in resumes.items():
    print("="*80)
    print("RESUME:", filename)
    print("="*80)
    print(text[:1500])  # print first 1500 characters


RESUME: data_analyst_resume.pdf
Rahul Sharma
Email: rahul.sharma@email.com
Skills: Python, SQL, Power BI, Data Analysis, Statistics
Experience
Data Analyst Intern – Performed EDA, dashboards, reporting
Education
B.Sc Data Science

RESUME: ml_engineer_resume.pdf
Ananya Patel
Email: ananya.patel@email.com
Skills: Python, Machine Learning, NLP, TensorFlow, Streamlit
Experience
ML Engineer Intern – Built ML models and deployed apps
Education
B.Tech Computer Science

RESUME: power_bi_analyst_resume.pdf
Vikram Desai
Email: vikram.desai@email.com
Skills: Power BI, DAX, SQL, Excel, Data Visualization
Experience
BI Analyst – Created dashboards and business reports
Education
MBA Business Analytics

RESUME: sample_pdf.pdf
JACQUELINE THOMPSON
123 Anywhere St., Any City • 123-456-7890 • hello@reallygreatsite.com
www.reallygreatsite.com
SUMMARY
Results-oriented Engineering Executive with a proven track record of optimizing project outcomes.
Skilled in strategic project management and team leadership

In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [7]:
cleaned_resumes = {}

for filename, text in resumes.items():
    cleaned_resumes[filename] = clean_text(text)

for filename, text in cleaned_resumes.items():
    print("="*80)
    print("CLEANED RESUME:", filename)
    print("="*80)
    print(text[:1000])


CLEANED RESUME: data_analyst_resume.pdf
rahul sharma email rahul sharma email com skills python sql power bi data analysis statistics experience data analyst intern performed eda dashboards reporting education b sc data science
CLEANED RESUME: ml_engineer_resume.pdf
ananya patel email ananya patel email com skills python machine learning nlp tensorflow streamlit experience ml engineer intern built ml models and deployed apps education b tech computer science
CLEANED RESUME: power_bi_analyst_resume.pdf
vikram desai email vikram desai email com skills power bi dax sql excel data visualization experience bi analyst created dashboards and business reports education mba business analytics
CLEANED RESUME: sample_pdf.pdf
jacqueline thompson 123 anywhere st any city 123 456 7890 hello reallygreatsite com www reallygreatsite com summary results oriented engineering executive with a proven track record of optimizing project outcomes skilled in strategic project management and team leadership see

In [8]:
job_description = """
We are looking for a Data Analyst with strong skills in Python, SQL, data analysis,
machine learning basics, Power BI, and data visualization.
Experience with NLP, Streamlit, and GitHub is a plus.
"""


In [9]:
resume_texts = list(cleaned_resumes.values())
resume_names = list(cleaned_resumes.keys())


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=500
)

all_texts = resume_texts + [job_description.lower()]
tfidf_matrix = vectorizer.fit_transform(all_texts)


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

jd_vector = tfidf_matrix[-1]
resume_vectors = tfidf_matrix[:-1]

similarity_scores = cosine_similarity(resume_vectors, jd_vector)


In [12]:
import pandas as pd

results = pd.DataFrame({
    "Resume": resume_names,
    "Match Score (%)": (similarity_scores.flatten() * 100).round(2)
})

results = results.sort_values(by="Match Score (%)", ascending=False)
results


Unnamed: 0,Resume,Match Score (%)
4,sample_resume.pdf,45.81
0,data_analyst_resume.pdf,34.06
2,power_bi_analyst_resume.pdf,21.81
1,ml_engineer_resume.pdf,15.46
3,sample_pdf.pdf,0.83


Keyword-Level Explainability (WHY a resume matched)

In [13]:
import numpy as np

feature_names = np.array(vectorizer.get_feature_names_out())

def get_top_keywords(resume_vector, jd_vector, top_n=10):
    scores = (resume_vector.multiply(jd_vector)).toarray()[0]
    top_indices = scores.argsort()[-top_n:][::-1]
    return feature_names[top_indices]

for i, name in enumerate(resume_names):
    print(f"\nTop keywords for {name}:")
    print(get_top_keywords(resume_vectors[i], jd_vector))



Top keywords for data_analyst_resume.pdf:
['data' 'python' 'power' 'analysis' 'sql' 'bi' 'analyst' 'skills'
 'experience' 'track']

Top keywords for ml_engineer_resume.pdf:
['streamlit' 'nlp' 'machine' 'learning' 'python' 'skills' 'experience'
 'visualization' 'xxxxxxxxxx' 'track']

Top keywords for power_bi_analyst_resume.pdf:
['data' 'bi' 'visualization' 'power' 'sql' 'analyst' 'experience' 'skills'
 'xxxxxxxxxx' 'track']

Top keywords for sample_pdf.pdf:
['analysis' 'experience' 'skills' 'www' 'workflows' 'visualization'
 'vikram' 'worked' 'work' 'track']

Top keywords for sample_resume.pdf:
['data' 'machine' 'learning' 'streamlit' 'nlp' 'analysis' 'github'
 'experience' 'power' 'analyst']


In [14]:
#Save Vectorizer & Resume Representations
import joblib


joblib.dump(vectorizer, "../tfidf_vectorizer.joblib")
joblib.dump(results, "resume_match_results.joblib")


['resume_match_results.joblib']