In [14]:
import os
import PyPDF2
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
import json
import re

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()
# Sentence-embedding model; compute_bert_score encodes the resume and JD with it.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
# Chat model reached through Ollama; compute_deepseek_score_single_call invokes it.
# NOTE(review): presumably needs a running Ollama server with "deepseek-r1:7b" pulled — confirm.
llm = ChatOllama(model="deepseek-r1:7b")

In [20]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        file_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        One string with each page's extracted text separated by " ". A page
        whose ``extract_text()`` result is falsy contributes an empty string.
    """
    page_texts = []
    with open(file_path, 'rb') as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            text = page.extract_text()
            page_texts.append(text if text else "")
    return " ".join(page_texts)

def extract_sections(resume_text, max_heading_words=4):
    """Split resume text into its "experience" and "projects" sections.

    The text is scanned line by line. A line is treated as a section heading
    only when it is short (at most ``max_heading_words`` words). Without that
    restriction, an ordinary bullet that merely mentions a keyword (e.g.
    "improved the user experience" inside Projects, or "technical skills"
    inside Experience) would switch or close the current section.

    Args:
        resume_text: Full resume text; may be empty or None.
        max_heading_words: Longest line, in whitespace-separated words, that
            can be interpreted as a heading.

    Returns:
        dict with keys "projects" and "experience". Each value is the
        section's stripped lines (heading line included), each followed by a
        single space; "" when the section was not found.
    """
    sections = {"projects": "", "experience": ""}
    if not resume_text:
        return sections

    # Headings that end whichever section is currently being collected.
    stop_keywords = ("education", "skills", "summary", "technical", "certification")
    collected = {name: [] for name in sections}
    current = None

    for line in resume_text.splitlines():
        stripped = line.strip()
        lowered = stripped.lower()
        # Only short lines can be headings; long lines are body text even if
        # they happen to contain a section keyword.
        if 0 < len(lowered.split()) <= max_heading_words:
            if "experience" in lowered:
                current = "experience"
            elif "project" in lowered:
                current = "projects"
            elif any(keyword in lowered for keyword in stop_keywords):
                current = None
        if current:
            collected[current].append(stripped + " ")

    return {name: "".join(parts) for name, parts in collected.items()}

def compute_bert_score(resume_text, jd_text):
    """Score resume/JD similarity as cosine similarity scaled by 100.

    Both texts are encoded with the notebook-level ``bert_model``; the cosine
    similarity of the two embeddings is multiplied by 100 and rounded to two
    decimal places.
    """
    resume_vec, jd_vec = (
        bert_model.encode(text, convert_to_tensor=True)
        for text in (resume_text, jd_text)
    )
    similarity = util.pytorch_cos_sim(resume_vec, jd_vec).item()
    return round(similarity * 100, 2)

def _clamp_score(value):
    """Convert ``value`` to float and clamp it into the 0-100 range the prompt asks for."""
    return max(0.0, min(100.0, float(value)))


def _parse_deepseek_scores(reply):
    """Extract ``(experience_score, project_score)`` from a raw model reply.

    deepseek-r1 writes its reasoning inside <think>...</think> before the
    final answer. That reasoning can contain braces and stray numbers, so
    only the text after the last closing tag is parsed. If a <think> tag is
    left unclosed (truncated reply) everything from it onward is discarded.

    Flat ``{...}`` objects are tried from last to first; if none yields the
    expected keys, the label-based regexes are used as a fallback. Every
    score is clamped to 0-100.
    """
    answer = reply.rsplit("</think>", 1)[-1].split("<think>", 1)[0]

    json_error = "no JSON object found"
    for candidate in reversed(re.findall(r'\{[^{}]*\}', answer)):
        try:
            scores = json.loads(candidate)
            if "experience_score" not in scores and "project_score" not in scores:
                json_error = "JSON object has no score keys"
                continue
            return (
                _clamp_score(scores.get("experience_score", 0)),
                _clamp_score(scores.get("project_score", 0)),
            )
        except (ValueError, TypeError) as e:
            # Invalid JSON or a non-numeric score: remember why and keep looking.
            json_error = e

    print("Fallback to regex due to JSON error:", json_error)
    exp_match = re.search(r'experience[^:]*[:\-]?\s*(\d{1,3})', answer, re.I)
    proj_match = re.search(r'project[^:]*[:\-]?\s*(\d{1,3})', answer, re.I)
    exp_score = _clamp_score(exp_match.group(1)) if exp_match else 0.0
    proj_score = _clamp_score(proj_match.group(1)) if proj_match else 0.0
    return exp_score, proj_score


def compute_deepseek_score_single_call(jd_text, experience_text, project_text, max_section_chars=1500):
    """Ask the LLM to grade the experience and project sections against a JD.

    A single prompt requests both scores as JSON. The reply is printed, then
    parsed by ``_parse_deepseek_scores``.

    Args:
        jd_text: Job description text, inserted into the prompt unmodified.
        experience_text: Resume experience section; None is treated as "".
        project_text: Resume projects section; None is treated as "".
        max_section_chars: Each resume section is truncated to this many
            characters before being placed in the prompt.

    Returns:
        Mean of the two scores (each clamped to 0-100), rounded to two
        decimals. Returns 0 if the LLM call or parsing raises; the error is
        printed rather than propagated so a notebook run keeps going.
    """
    # Tolerate missing sections instead of failing on the slice.
    experience_excerpt = (experience_text or "")[:max_section_chars]
    project_excerpt = (project_text or "")[:max_section_chars]
    prompt = f"""
You are a resume evaluator.

Given the job description and the candidate's resume sections below, score the candidate on a scale of 0–100 for each section separately:
1. Experience
2. Projects

Return only the two scores in JSON format as:
{{"experience_score": 88, "project_score": 76}}

Job Description:
\"\"\"{jd_text}\"\"\"

Experience Section:
\"\"\"{experience_excerpt}\"\"\"

Project Section:
\"\"\"{project_excerpt}\"\"\"
"""
    try:
        response = llm.invoke(prompt)
        reply = response.content.strip()
        print("DeepSeek raw response:", reply)

        exp_score, proj_score = _parse_deepseek_scores(reply)
        return round((exp_score + proj_score) / 2, 2)

    except Exception as e:
        print("DeepSeek scoring error:", e)
        return 0


def combine_scores(bert_score, llm_score, w1=0.6, w2=0.4):
    """Blend the two scores as ``w1 * bert_score + w2 * llm_score``.

    Args:
        bert_score: Embedding-similarity score.
        llm_score: LLM-assigned score.
        w1: Weight applied to ``bert_score``.
        w2: Weight applied to ``llm_score``.

    Returns:
        The weighted sum rounded to two decimal places.
    """
    weighted_total = w1 * bert_score + w2 * llm_score
    return round(weighted_total, 2)

# Resume to score; the path is relative to the notebook's working directory.
resume_path = "resumes/Dhruvraj_resume_May18.pdf"
resume_text = extract_text_from_pdf(resume_path)
# Preview only the first 2000 characters of the extracted text.
# NOTE(review): this preview writes the candidate's contact details into the
# saved cell output — consider clearing the output before sharing the notebook.
print("Resume Text:\n", resume_text[:2000], "\n...")

# Job description is a plain-text file, read in full as UTF-8.
jd_path = "JDs/Data_engineering_intern_LiveRamp.txt"
with open(jd_path, 'r', encoding='utf-8') as f:
    jd_text = f.read()
print("JD Text:\n", jd_text)

Resume Text:
 Dhruvraj Singh Rathore
/ne(737)206-1179 |dhruvrajrathore2011@gmail.com |Linkedin |/gtbGithub
Education
Texas A&M University Aug. 2024 – Dec. 2025
Master of Science in Data Science, CGPA: 4.0 College Station, TX
SRM Institute of Science and Technology Jul. 2018 – May 2022
Bachelor of Technology in Computer Science, CGPA: 3.8 Chennai, India
Technical Skills
Programming & Data Science : Python, SQL, Pandas, NumPy, Matplotlib, Scikit-learn, Shell Script
Databases & Cloud Computing : MySQL, NoSQL, Redis, MongoDB, AWS Suite (EMR, S3, EC2, Lambda)
Big Data & Machine Learning : Spark, PySpark, LLMs (Large Language Models), BERT, Llama3.2, LangChain,
Predictive Analytics, AWS Sagemaker, RAG
Tools & Platforms : Git/GitHub, CI/CD, Apache Airflow, Docker, Power BI, SnowFlake, Data Built Tool
Experience
Data Analyst Dec. 2022 – Jun. 2024
Draup Business Solutions Bangalore, India
•Designed and deployed high-performance ETL pipelines using PySpark and SQL on AWS EMR , improving data
int

In [21]:
# Similarity of the whole resume text to the JD (cosine similarity scaled by 100).
bert_score = compute_bert_score(resume_text, jd_text)

# Pull out the two sections the LLM is asked to grade; missing keys fall back to "".
sections = extract_sections(resume_text)
experience_text = sections.get("experience", "")
project_text = sections.get("projects", "")

# Single LLM call; the returned value is the mean of the experience and project scores.
deepseek_score = compute_deepseek_score_single_call(jd_text, experience_text, project_text)

# Weighted blend using combine_scores' default weights (0.6 * BERT + 0.4 * LLM).
final_score = combine_scores(bert_score, deepseek_score)  # author's note: taking 36 seconds

DeepSeek raw response: <think>
Alright, I'm going to evaluate this candidate's experience and projects sections based on the job description provided.

First, looking at the Experience section: The candidate has two roles—Data Analyst and Data Scientist. They worked with PySpark and SQL, which aligns well with LiveRamp's ETL needs. The mention of Apache Airflow is a plus since that's relevant for data quality monitoring. Implementing OLAP models sounds beneficial for their analytical skills, though I should check if the performance improvements are clearly tied to data engineering tasks.

The transition from Data Analyst to Data Scientist shows growth and versatility. Using Python effectively in both roles adds value beyond SQL. The automation of JIRA tasks is a good sign for workflow efficiency, which ties into LiveRamp's need for streamlined processes. However, I'm not sure about their experience with big data tools like Spark or Kafka yet since the job emphasizes those.

Now moving 

In [22]:
# One-row summary table: which resume/JD pair was scored and the three scores.
df = pd.DataFrame(
    {
        "resume_file": [os.path.basename(resume_path)],
        "jd_file": [os.path.basename(jd_path)],
        "semantic_BERT_score": [bert_score],
        "DeepSeek_score": [deepseek_score],
        "Final_Combined_Score": [final_score],
    }
)

print("\nFinal Scoring DataFrame:\n")
# Bare last expression so the notebook renders the frame as a table.
df


Final Scoring DataFrame:



Unnamed: 0,resume_file,jd_file,semantic_BERT_score,DeepSeek_score,Final_Combined_Score
0,Dhruvraj_resume_May18.pdf,Data_engineering_intern_LiveRamp.txt,44.75,82.5,59.85


In [19]:
# NOTE(review): this cell duplicates the results-table cell directly above it.
# Its execution count (In [19]) is older than the function-definition and scoring
# cells (In [20]-[22]), and its saved Final_Combined_Score (63.62) differs from
# the 59.85 shown above for the same BERT/DeepSeek inputs — the output looks
# stale. Consider deleting this cell so the notebook survives Restart & Run All.
df = pd.DataFrame([{
    "resume_file": os.path.basename(resume_path),
    "jd_file": os.path.basename(jd_path),
    "semantic_BERT_score": bert_score,
    "DeepSeek_score": deepseek_score,
    "Final_Combined_Score": final_score
}])

print("\nFinal Scoring DataFrame:\n")
df


Final Scoring DataFrame:



Unnamed: 0,resume_file,jd_file,semantic_BERT_score,DeepSeek_score,Final_Combined_Score
0,Dhruvraj_resume_May18.pdf,Data_engineering_intern_LiveRamp.txt,44.75,82.5,63.62


In [9]:
# Print the name of every model returned by the client's list_models() call.
# NOTE(review): `genai` is not imported in any visible cell — presumably
# `google.generativeai`, configured with an API key; confirm and add the import
# to the imports cell, otherwise this cell fails on a fresh kernel.
models = genai.list_models()
for m in models:
    print(m.name)
    

models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-04-17
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash-preview-04-17-thinking
models/gemini-2.5-pro-preview-05-06
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview

In [13]:
# Smoke test for the Gemini model: open a chat, send one prompt, print the reply.
# NOTE(review): depends on `genai`, which is not imported in any visible cell.
# The saved output of this cell is a 429 ResourceExhausted (quota exceeded)
# error rather than a reply, so the check has not been shown to pass.
model = genai.GenerativeModel("models/gemini-1.5-pro")
chat = model.start_chat()
response = chat.send_message("Tell me a short story in exactly 50 words.")
print("Gemini response:\n")
print(response.text)

ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
violations {
}
violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 57
}
]