In [1]:
import os
import PyPDF2
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text

In [4]:
def extract_sections(text):
    sections = {
        "skills": "",
        "experience": "",
        "projects": "",
        "other": ""
    }
    current = "other"
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        l = line.lower()
        if "skill" in l:
            current = "skills"
        elif "experience" in l:
            current = "experience"
        elif "project" in l:
            current = "projects"
        sections[current] += line + " "
    return sections


In [14]:
def weighted_score(resume_sections, jd_text):
    jd_emb = model.encode(jd_text, convert_to_tensor=True)
    weights = {
        "skills": 0.1,
        "experience": 0.6,
        "projects": 0.3
    }

    score = 0.0
    for sec, text in resume_sections.items():
        if text.strip() == "":
            continue
        emb = model.encode(text, convert_to_tensor=True)
        sim = util.pytorch_cos_sim(emb, jd_emb).item()
        score += weights.get(sec, 0.0) * sim
    return round(score * 100, 2)

In [15]:
resume_dir = "resumes/"
# jd_path = "JDs/AI_intern_Armada.txt"
jd_path = "JDs/Data_engineering_intern_LiveRamp.txt"

with open(jd_path, 'r', encoding='utf-8') as f:
    jd_text = f.read()

results = []



In [16]:
for file in os.listdir(resume_dir):
    if not file.endswith(".pdf"):
        continue
    fpath = os.path.join(resume_dir, file)
    text = extract_text_from_pdf(fpath)
    sections = extract_sections(text)
    score = weighted_score(sections, jd_text)
    results.append({"Resume": file, "Weighted Score": score})

results_df = pd.DataFrame(results).sort_values("Weighted Score", ascending=False)
results_df.reset_index(drop=True, inplace=True)

In [17]:
## latest
results_df

Unnamed: 0,Resume,Weighted Score
0,Dhruvraj_resume_May18.pdf,42.59
1,Dhruvraj_resume_MSDS.pdf,42.23
2,Dhruvraj_resume_USHunger.pdf,38.99
3,Dhruvraj_resume_Mar11.pdf,38.17
4,Dhruvraj_Resume_intern_rocket.pdf,38.17
5,Dhruvraj_resume_image_analytics.pdf,8.08


In [29]:
import ollama
from math import ceil

def summarize_batch_with_ollama(jd_text, resume_batch):
    batch_prompts = ""
    for idx, resume_text in enumerate(resume_batch, start=1):
        batch_prompts += f"\nResume {idx}:\n\"\"\"{resume_text[:2000]}\"\"\"\n"

    prompt = f"""
You are an AI assistant helping recruiters evaluate candidates by comparing their resumes with a job description. For each resume, provide:

1. A 40-word summary of how the resume aligns with the JD.
2. A bullet list of 5 key relevant skills.
3. A one-line highlight of the candidate’s relevant experience.

Job Description:
\"\"\"{jd_text}\"\"\"

{batch_prompts}

Format output as:

Resume 1:
Summary: ...
Relevant Skills:
- ...
Main Highlight: ...

Resume 2:
...
"""

    try:
        response = ollama.chat(
            model='llama3.2',
            messages=[{"role": "user", "content": prompt}]
        )
        return response['message']['content'].strip()
    except Exception as e:
        return f"Error: {e}"

In [30]:
def summarize_resumes_ollama(jd_text, resume_texts, output_path="resume_summaries.txt"):
    batch_size = 3
    results = []

    for i in range(0, len(resume_texts), batch_size):
        batch = resume_texts[i:i+batch_size]
        result = summarize_batch_with_ollama(jd_text, batch)
        results.append(result)

    all_summary_text = "\n\n".join(results)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(all_summary_text)

    print(f"✅ Summaries saved to {output_path}")
    return all_summary_text


In [31]:
resume_paths = [
    "resumes/Dhruvraj_resume_May18.pdf",
    "resumes/Dhruvraj_resume_MSDS.pdf",
    "resumes/Dhruvraj_resume_USHunger.pdf",
    "resumes/Dhruvraj_Resume_intern_rocket.pdf"
]

resume_texts = [extract_text_from_pdf(path) for path in resume_paths]
jd_text = open("JDs/Data_engineering_intern_LiveRamp.txt", "r", encoding="utf-8").read()

summarize_resumes_ollama(jd_text, resume_texts, "summary_ollama_output.txt")


✅ Summaries saved to summary_ollama_output.txt


"Here is the evaluation of the three resumes against the job description:\n\n**Resume 1**\n\nSummary: The resume aligns with the JD as it showcases Dhruvraj's experience in data pipeline development, ETL processes, and analytics solutions. The candidate has worked on various projects that demonstrate their proficiency in Python, SQL, and big data tools.\n\nRelevant Skills:\n- Data pipeline development\n- ETL processes\n- Analytics solutions\n- Big data tools (PySpark, Hadoop, Kafka)\n- Cloud computing (AWS Suite)\n\nMain Highlight: Dhruvraj's experience as a Data Analyst at Draup Business Solutions Bangalore, where he designed and deployed high-performance ETL pipelines using PySpark and SQL on AWS EMR, improving data integrity by 35%.\n\n**Resume 2**\n\nSummary: The resume aligns with the JD as it highlights Dhruvraj's expertise in machine learning, deep learning, and big data. The candidate has worked on various projects that demonstrate their proficiency in machine learning framewor

## below code is with the cross encoder:

In [11]:
import os
import PyPDF2
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import re

# Load BERT model
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Section name mappings
SECTION_MAP = {
    "skills": ["skills", "technical skills", "skill set"],
    "experience": ["experience", "professional experience", "technical experience", "work experience"],
    "projects": ["projects", "academic projects", "personal projects"]
}

# Weights for section-wise scoring
SECTION_WEIGHTS = {
    "skills": 0.1,
    "experience": 0.6,
    "projects": 0.2,
    "other": 0.1
}

# Stopwords to filter out for keyword overlap
STOPWORDS = set([
    "a", "an", "the", "and", "or", "as", "is", "are", "was", "were", "to", "of", "in", "on", "for", "by", "with",
    "that", "this", "at", "from", "it", "be", "which", "but", "if", "they", "has", "have", "had", "will", "would",
    "can", "could", "their", "its", "about"
])

def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text

def clean_line(line):
    return re.sub(r'[^a-zA-Z0-9\s]', '', line).strip().lower()

def extract_sections(text):
    sections = {key: "" for key in SECTION_WEIGHTS.keys()}
    current = "other"

    for line in text.splitlines():
        line_clean = clean_line(line)
        if not line_clean:
            continue
        for key, aliases in SECTION_MAP.items():
            if any(alias in line_clean for alias in aliases):
                current = key
                break
        sections[current] += line + " "
    return sections

def get_keyword_overlap_score(jd_text, resume_text):
    jd_keywords = set(w for w in re.findall(r'\w+', jd_text.lower()) if w not in STOPWORDS)
    resume_words = set(w for w in re.findall(r'\w+', resume_text.lower()) if w not in STOPWORDS)
    common = jd_keywords.intersection(resume_words)
    return round(len(common) / max(len(jd_keywords), 1), 4)  # normalized %

def compute_scores(text, sections, jd_text):
    jd_emb = bert_model.encode(jd_text, convert_to_tensor=True)
    bert_score = 0.0

    for sec, content in sections.items():
        if content.strip() == "":
            continue
        emb = bert_model.encode(content, convert_to_tensor=True)
        sim = util.pytorch_cos_sim(emb, jd_emb).item()
        bert_score += SECTION_WEIGHTS.get(sec, 0.0) * sim

    # Keyword Overlap
    overlap_score = get_keyword_overlap_score(jd_text, text)

    # Final Combined Score: 80% BERT + 20% Overlap
    final_score = 0.8 * bert_score + 0.2 * overlap_score

    return {
        "BERT Score": round(bert_score * 100, 2),
        "Keyword Overlap": round(overlap_score * 100, 2),
        "Combined Score": round(final_score * 100, 2)
    }

# Paths
resume_dir = "resumes/"
jd_path = "JDs/Data_engineering_intern_LiveRamp.txt"

with open(jd_path, 'r', encoding='utf-8') as f:
    jd_text = f.read()

results = []

for file in os.listdir(resume_dir):
    if not file.endswith(".pdf"):
        continue
    print(f"Processing: {file}")
    fpath = os.path.join(resume_dir, file)
    resume_text = extract_text_from_pdf(fpath)
    resume_sections = extract_sections(resume_text)
    scores = compute_scores(resume_text, resume_sections, jd_text)
    results.append({"Resume": file, **scores})

results_df = pd.DataFrame(results).sort_values("Combined Score", ascending=False)
results_df.reset_index(drop=True, inplace=True)
results_df

Processing: Dhruvraj_resume_Mar11.pdf
Processing: Dhruvraj_Resume_intern_rocket.pdf
Processing: Dhruvraj_resume_May18.pdf
Processing: Shefali_Bisht_Resume_BIE.pdf
Processing: Dhruvraj_resume_image_analytics.pdf
Processing: Dhruvraj_resume_USHunger.pdf
Processing: Dhruvraj_resume_MSDS.pdf


Unnamed: 0,Resume,BERT Score,Keyword Overlap,Combined Score
0,Shefali_Bisht_Resume_BIE.pdf,44.48,30.77,41.74
1,Dhruvraj_resume_Mar11.pdf,44.49,23.59,40.31
2,Dhruvraj_resume_MSDS.pdf,43.8,21.03,39.25
3,Dhruvraj_resume_May18.pdf,43.07,18.97,38.25
4,Dhruvraj_resume_USHunger.pdf,42.56,17.95,37.64
5,Dhruvraj_Resume_intern_rocket.pdf,42.26,16.92,37.19
6,Dhruvraj_resume_image_analytics.pdf,10.63,12.31,10.97







## Score of BERT, Similarity score by LLAMA3.2 and keyword overlap score combined:






In [19]:
import os, re, PyPDF2, pandas as pd, nltk, torch
from sentence_transformers import SentenceTransformer, util
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# ── one-time NLTK download ────────────────────────────────────────────────────
nltk.download('stopwords')

# ── models ───────────────────────────────────────────────────────────────────
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
llm        = ChatOllama(model='llama3.2')

# ── section headers & base weights (before dynamic re-allocation) ─────────────
SECTION_MAP = {
    "skills"    : ["skills", "technical skills", "skill set"],
    "experience": ["experience", "professional experience", "technical experience", "work experience"],
    "projects"  : ["projects", "academic projects", "personal projects"]
}
BASE_WEIGHTS = {          # will be copied & adjusted per-resume
    "skills"    : 0.10,
    "experience": 0.60,
    "projects"  : 0.20,
    "other"     : 0.10
}

STOPWORDS = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)

# ── helpers ───────────────────────────────────────────────────────────────────
def extract_text_from_pdf(path):
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        return " ".join(page.extract_text() or "" for page in reader.pages)

clean = lambda line: re.sub(r'[^A-Za-z0-9\s]', '', line).strip().lower()

def extract_sections(text: str) -> dict:
    sections, current = {k: "" for k in BASE_WEIGHTS}, "other"
    for raw in text.splitlines():
        line = clean(raw)
        if not line: continue
        for key, aliases in SECTION_MAP.items():
            if any(alias in line for alias in aliases):
                current = key; break
        sections[current] += raw + " "
    return sections

def keyword_overlap(jd, doc):
    jd_kw  = {w for w in re.findall(r'\w+', jd.lower())  if w not in STOPWORDS}
    doc_kw = {w for w in re.findall(r'\w+', doc.lower()) if w not in STOPWORDS}
    return len(jd_kw & doc_kw) / max(len(jd_kw), 1)

def llama_similarity(jd, exp_proj):
    prompt = (
        "You're a hiring assistant. Based only on the experience & projects below, "
        "rate the candidate's relevance to the job description on a scale 0-1. "
        "Respond with **only** the number.\n\n"
        f"Job Description:\n{jd}\n\nExperience & Projects:\n{exp_proj}"
    )
    reply = llm.invoke([HumanMessage(content=prompt)]).content
    match = re.search(r"(?:0?\.\d+|1(?:\.0+)?)", reply)
    return float(match.group()) if match else 0.0

def dynamic_weights(sec_dict):
    """Return a weight dict that shifts EXP→PROJ or PROJ→EXP if one is empty."""
    w = BASE_WEIGHTS.copy()
    has_exp = sec_dict["experience"].strip() != ""
    has_prj = sec_dict["projects"].strip()   != ""
    if not has_exp and has_prj:          # move 0.60 to projects
        w["projects"] += w["experience"]; w["experience"] = 0.0
    elif has_exp and not has_prj:        # move 0.20 to experience
        w["experience"] += w["projects"]; w["projects"]  = 0.0
    return w

def score_resume(fname, full_txt, secs, jd_txt):
    # ----- BERT section score -------------------------------------------------
    w = dynamic_weights(secs)
    jd_emb, bert_total = bert_model.encode(jd_txt, convert_to_tensor=True), 0.0
    for sec, txt in secs.items():
        if not txt.strip() or w[sec]==0: continue
        emb  = bert_model.encode(txt, convert_to_tensor=True)
        sim  = util.pytorch_cos_sim(emb, jd_emb).item()
        bert_total += w[sec] * sim      # weighted sum (still 0-1 range)

    # ----- keyword overlap ----------------------------------------------------
    kw_overlap = keyword_overlap(jd_txt, full_txt)

    # ----- LLaMA similarity (experience + projects) ---------------------------
    exp_proj   = secs["experience"] + "\n" + secs["projects"]
    llama_sc   = llama_similarity(jd_txt, exp_proj)

    # ----- final weighted combo ----------------------------------------------
    final = 0.60 * bert_total + 0.25 * llama_sc + 0.15 * kw_overlap
    return dict(
        Resume=fname,
        **{ "BERT Score": round(bert_total*100,2),
            "LLaMA Score": round(llama_sc*100,2),
            "Keyword Overlap": round(kw_overlap*100,2),
            "Combined Score": round(final*100,2) }
    )

# ── run pipeline ──────────────────────────────────────────────────────────────
RES_DIR, JD_FILE = "resumes/", "JDs/Data_engineering_intern_LiveRamp.txt"
jd_text = open(JD_FILE, encoding='utf-8').read()

results = []
for fn in os.listdir(RES_DIR):
    if not fn.endswith(".pdf"): continue
    print("→", fn)
    txt   = extract_text_from_pdf(os.path.join(RES_DIR, fn))
    secs  = extract_sections(txt)
    results.append(score_resume(fn, txt, secs, jd_text))

df = pd.DataFrame(results).sort_values("Combined Score", ascending=False).reset_index(drop=True)
df

[nltk_data] Downloading package stopwords to /Users/dhruv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


→ Dhruvraj_resume_Mar11.pdf
→ Dhruvraj_Resume_intern_rocket.pdf
→ Dhruvraj_resume_May18.pdf
→ Shefali_Bisht_Resume_BIE.pdf
→ Dhruvraj_resume_image_analytics.pdf
→ Dhruvraj_resume_USHunger.pdf
→ Dhruvraj_resume_MSDS.pdf


Unnamed: 0,Resume,BERT Score,LLaMA Score,Keyword Overlap,Combined Score
0,Dhruvraj_resume_May18.pdf,43.07,100.0,20.22,53.88
1,Dhruvraj_resume_USHunger.pdf,42.56,80.0,18.54,48.32
2,Dhruvraj_resume_Mar11.pdf,44.49,70.0,24.16,47.82
3,Dhruvraj_resume_MSDS.pdf,43.8,70.0,21.91,47.07
4,Shefali_Bisht_Resume_BIE.pdf,44.48,50.0,30.9,43.82
5,Dhruvraj_Resume_intern_rocket.pdf,42.26,50.0,17.98,40.55
6,Dhruvraj_resume_image_analytics.pdf,19.69,50.0,10.67,25.91


## score with BERT, LLAMA3.2 and keyword overlap with clean JD text for scoring:

In [None]:
## score with BERT, LLAMA3.2 and keyword overlap with clean JD text for scoring:

import os, re, PyPDF2, pandas as pd, nltk, torch
from sentence_transformers import SentenceTransformer, util
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download('stopwords')

bert_model = SentenceTransformer('all-MiniLM-L6-v2')
llm = ChatOllama(model='llama3.2')

SECTION_MAP = {
    "skills": ["skills", "technical skills", "skill set"],
    "experience": ["experience", "professional experience", "technical experience", "work experience"],
    "projects": ["projects", "academic projects", "personal projects"]
}
BASE_WEIGHTS = {
    "skills": 0.10,
    "experience": 0.50,
    "projects": 0.30,
    "other": 0.10
}

STOPWORDS = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)

JD_HEADERS = [
    "required qualifications", "preferred qualifications", "skills needed", "you will",
    "job responsibilities", "minimum requirements", "what you'll work on", "what you bring",
    "bonus point for", "key responsibilities", "requirements", "what you’ll do",
    "nice to have", "about you", "the following"
]

# ── helper functions ──────────────────────────────────────────────────────────
def extract_text_from_pdf(path):
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        return " ".join(page.extract_text() or "" for page in reader.pages)

def clean(line):
    return re.sub(r'[^A-Za-z0-9\s]', '', line).strip().lower()

def extract_sections(text: str) -> dict:
    sections, current = {k: "" for k in BASE_WEIGHTS}, "other"
    for raw in text.splitlines():
        line = clean(raw)
        if not line: continue
        for key, aliases in SECTION_MAP.items():
            if any(alias in line for alias in aliases):
                current = key; break
        sections[current] += raw + " "
    return sections

def keyword_overlap(jd, doc):
    jd_kw  = {w for w in re.findall(r'\w+', jd.lower())  if w not in STOPWORDS}
    doc_kw = {w for w in re.findall(r'\w+', doc.lower()) if w not in STOPWORDS}
    return len(jd_kw & doc_kw) / max(len(jd_kw), 1)

def llama_similarity(jd, exp_proj):
    prompt = (
        "You're a hiring assistant. Based only on the experience & projects below, "
        "rate the candidate's relevance to the job description on a scale 0-1. "
        "Respond with **only** the number.\n\n"
        f"Job Description:\n{jd}\n\nExperience & Projects:\n{exp_proj}"
    )
    reply = llm.invoke([HumanMessage(content=prompt)]).content
    match = re.search(r"(?:0?\.\d+|1(?:\.0+)?)", reply)
    return float(match.group()) if match else 0.0

def dynamic_weights(sec_dict):
    w = BASE_WEIGHTS.copy()
    has_exp = sec_dict["experience"].strip() != ""
    has_prj = sec_dict["projects"].strip() != ""
    if not has_exp and has_prj:
        w["projects"] += w["experience"]; w["experience"] = 0.0
    elif has_exp and not has_prj:
        w["experience"] += w["projects"]; w["projects"] = 0.0
    return w

def filter_jd_sections(text):
    lines = text.splitlines()
    collecting = False
    collected = []
    for line in lines:
        l = line.strip().lower()
        if any(header in l for header in JD_HEADERS):
            collecting = True
        if collecting:
            collected.append(line.strip())
    return "\n".join(collected)

def score_resume(fname, full_txt, secs, jd_txt):
    w = dynamic_weights(secs)
    jd_emb = bert_model.encode(jd_txt, convert_to_tensor=True)
    bert_total = 0.0
    for sec, txt in secs.items():
        if not txt.strip() or w[sec] == 0: continue
        emb = bert_model.encode(txt, convert_to_tensor=True)
        sim = util.pytorch_cos_sim(emb, jd_emb).item()
        bert_total += w[sec] * sim

    kw_overlap = keyword_overlap(jd_txt, full_txt)
    exp_proj = secs["experience"] + "\n" + secs["projects"]
    llama_sc = llama_similarity(jd_txt, exp_proj)
    final = 0.60 * bert_total + 0.25 * llama_sc + 0.15 * kw_overlap

    return dict(
        Resume=fname,
        **{
            "BERT Score": round(bert_total * 100, 2),
            "LLaMA Score": round(llama_sc * 100, 2),
            "Keyword Overlap": round(kw_overlap * 100, 2),
            "Combined Score": round(final * 100, 2)
        }
    )

# ── run pipeline ──────────────────────────────────────────────────────────────
RES_DIR, JD_FILE = "resumes/", "JDs/AI_intern_Armada.txt"#"JDs/Data_engineering_intern_LiveRamp.txt"
jd_raw = open(JD_FILE, encoding='utf-8').read()
jd_text = filter_jd_sections(jd_raw)
# print("\n📝 Filtered Job Description used for scoring:\n")
# print(jd_text)
# print("\n" + "="*80 + "\n")
results = []
for fn in os.listdir(RES_DIR):
    if not fn.endswith(".pdf"): continue
    print("→", fn)
    txt = extract_text_from_pdf(os.path.join(RES_DIR, fn))
    secs = extract_sections(txt)
    results.append(score_resume(fn, txt, secs, jd_text))

df = pd.DataFrame(results).sort_values("Combined Score", ascending=False).reset_index(drop=True)
df


[nltk_data] Downloading package stopwords to /Users/dhruv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



📝 Filtered Job Description used for scoring:

What You'll Do (Key Responsibilities)

Assist in building, training, and fine-tuning machine learning models.
Conduct research on AI trends, tools, and techniques.
Work with large datasets for data preprocessing, cleaning, and feature engineering.
Optimize and evaluate model performance using various metrics.
Support AI team members in deploying and integrating models into applications.
Write and document scripts, workflows, and processes.
Collaborate with cross-functional teams, including data engineers and software developers.
Stay updated on the latest AI advancements and research papers.

Required Qualifications

Pursuing or recently completed a degree in Computer Science, Data Science, Artificial Intelligence, or a related field.
Familiarity with programming languages such as Python, R, or Java.
Knowledge of AI/ML frameworks like TensorFlow, PyTorch, or Scikit-learn.
Experience with data manipulation using Pandas, NumPy, and SQL.
Unde

Unnamed: 0,Resume,BERT Score,LLaMA Score,Keyword Overlap,Combined Score
0,Dhruvraj_resume_MSDS.pdf,38.83,80.0,28.39,47.56
1,Dhruvraj_resume_May18.pdf,28.14,95.0,23.23,44.12
2,Dhruvraj_resume_USHunger.pdf,25.46,80.0,23.23,38.76
3,Dhruvraj_Resume_intern_rocket.pdf,28.24,70.0,21.94,37.74
4,Shefali_Bisht_Resume_BIE.pdf,35.62,50.0,25.81,37.74
5,Dhruvraj_resume_Mar11.pdf,29.52,60.0,23.87,36.29
6,Dhruvraj_resume_image_analytics.pdf,24.29,70.0,19.35,34.98
