In [1]:
import os
import PyPDF2
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text

In [4]:
def extract_sections(text):
    sections = {
        "skills": "",
        "experience": "",
        "projects": "",
        "other": ""
    }
    current = "other"
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        l = line.lower()
        if "skill" in l:
            current = "skills"
        elif "experience" in l:
            current = "experience"
        elif "project" in l:
            current = "projects"
        sections[current] += line + " "
    return sections


In [14]:
def weighted_score(resume_sections, jd_text):
    jd_emb = model.encode(jd_text, convert_to_tensor=True)
    weights = {
        "skills": 0.1,
        "experience": 0.6,
        "projects": 0.3
    }

    score = 0.0
    for sec, text in resume_sections.items():
        if text.strip() == "":
            continue
        emb = model.encode(text, convert_to_tensor=True)
        sim = util.pytorch_cos_sim(emb, jd_emb).item()
        score += weights.get(sec, 0.0) * sim
    return round(score * 100, 2)

In [15]:
resume_dir = "resumes/"
# jd_path = "JDs/AI_intern_Armada.txt"
jd_path = "JDs/Data_engineering_intern_LiveRamp.txt"

with open(jd_path, 'r', encoding='utf-8') as f:
    jd_text = f.read()

results = []



In [16]:
for file in os.listdir(resume_dir):
    if not file.endswith(".pdf"):
        continue
    fpath = os.path.join(resume_dir, file)
    text = extract_text_from_pdf(fpath)
    sections = extract_sections(text)
    score = weighted_score(sections, jd_text)
    results.append({"Resume": file, "Weighted Score": score})

results_df = pd.DataFrame(results).sort_values("Weighted Score", ascending=False)
results_df.reset_index(drop=True, inplace=True)

In [17]:
## latest
results_df

Unnamed: 0,Resume,Weighted Score
0,Dhruvraj_resume_May18.pdf,42.59
1,Dhruvraj_resume_MSDS.pdf,42.23
2,Dhruvraj_resume_USHunger.pdf,38.99
3,Dhruvraj_resume_Mar11.pdf,38.17
4,Dhruvraj_Resume_intern_rocket.pdf,38.17
5,Dhruvraj_resume_image_analytics.pdf,8.08
