In [1]:
# Cell 1: Imports
import re
import numpy as np
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


In [2]:
# Cell 2: PDF Text Extraction with Section Parsing
def extract_resume_text(pdf_path):
    reader = PdfReader(pdf_path)
    text = "\n".join([page.extract_text() for page in reader.pages])
    
    sections = {
        'skills': [],
        'experience': [],
        'education': [],
        'projects': []
    }
    
    current_section = None
    for line in text.split('\n'):
        if re.match(r'(?i)^(skills|technical skills)', line):
            current_section = 'skills'
        elif re.match(r'(?i)^(experience|work history)', line):
            current_section = 'experience'
        elif re.match(r'(?i)^(education)', line):
            current_section = 'education'
        elif re.match(r'(?i)^(projects)', line):
            current_section = 'projects'
        elif current_section:
            sections[current_section].append(line.strip())
    
    full_text = text.replace('\n', ' ')
    section_texts = {k: ' '.join(v) for k, v in sections.items()}
    return full_text, section_texts


In [3]:
# Cell 3: Semantic Similarity Calculation
def calculate_semantic_similarity(text1, text2):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode([text1, text2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]


In [4]:
# Cell 4: Keyword Matching Function
def calculate_keyword_match(text1, text2):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]


In [5]:
# Cell 5: Section Alignment Scoring
def calculate_section_alignment(jd_text, resume_sections):
    vectorizer = TfidfVectorizer(stop_words='english')
    jd_vector = vectorizer.fit_transform([jd_text])
    
    section_weights = {
        'skills': 0.4,
        'experience': 0.4,
        'education': 0.15,
        'projects': 0.05
    }
    
    total_score = 0
    for section, weight in section_weights.items():
        if section in resume_sections and resume_sections[section]:
            section_vector = vectorizer.transform([resume_sections[section]])
            similarity = cosine_similarity(jd_vector, section_vector)[0][0]
            total_score += similarity * weight
            
    return total_score


In [8]:
# Cell 6: Main Execution Cell
def main(resume_path, jd_path):
    with open(jd_path, 'r') as f:
        jd_text = f.read()
    
    resume_full, resume_sections = extract_resume_text(resume_path)
    
    semantic_score = calculate_semantic_similarity(resume_full, jd_text)
    keyword_score = calculate_keyword_match(resume_full, jd_text)
    section_score = calculate_section_alignment(jd_text, resume_sections)
    
    final_score = (0.7 * semantic_score) + (0.2 * keyword_score) + (0.1 * section_score)
    
    print(f"Semantic Similarity: {semantic_score:.2f}")
    print(f"Keyword Match: {keyword_score:.2f}")
    print(f"Section Alignment: {section_score:.2f}")
    print(f"\nFinal Match Score: {final_score:.2f}/1.0")



In [9]:
# Cell 7: Sample Execution (Uncomment to use)
main("resumes/Dhruvraj_resume_May18.pdf", "JDs/Data_engineering_intern_LiveRamp.txt")


Semantic Similarity: 0.45
Keyword Match: 0.28
Section Alignment: 0.61

Final Match Score: 0.43/1.0


## another code

In [10]:
import re
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer, util


In [11]:
# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [12]:
# Function to extract text from PDF (resume)
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ''
    for page in reader.pages:
        text += page.extract_text() + '\n'
    return text.strip()

# Function to extract text from text file (JD)
def extract_text_from_txt(txt_path):
    with open(txt_path, 'r', encoding='utf-8') as f:
        return f.read().strip()


In [13]:
def extract_sections(text, is_resume=True):
    sections = {}
    if is_resume:
        sections['skills'] = extract_skills(text)
        sections['experience'] = extract_section(text, ['experience', 'work'])
        sections['education'] = extract_section(text, ['education'])
        sections['certifications'] = extract_section(text, ['certification', 'certificate'])
        sections['projects'] = extract_section(text, ['project'])
    else:
        sections['required_skills'] = extract_section(text, ['required skills', 'requirements', 'about you'])
        sections['responsibilities'] = extract_section(text, ['responsibilities'])
        sections['qualifications'] = extract_section(text, ['qualifications', 'education'])
    return sections

def extract_section(text, keywords):
    pattern = '|'.join([fr"{k}" for k in keywords])
    matches = re.split(pattern, text, flags=re.IGNORECASE)
    return matches[1] if len(matches) > 1 else text

def extract_skills(text):
    skills = re.findall(r"(?i)\b[A-Za-z0-9\+\#\.]+\b", text)
    return ', '.join([s for s in skills if len(s) > 1])


In [14]:
def semantic_score(jd_sections, resume_sections, verbose=True):
    pairs = [
        ('required_skills', 'skills'),
        ('responsibilities', 'experience'),
        ('qualifications', 'education'),
    ]
    scores = []

    if verbose:
        print("\n📘 Semantic Similarity by Section:")

    for jd_key, res_key in pairs:
        jd_text = jd_sections.get(jd_key, '').strip()
        res_text = resume_sections.get(res_key, '').strip()

        if jd_text and res_text:
            jd_emb = model.encode(jd_text, convert_to_tensor=True)
            res_emb = model.encode(res_text, convert_to_tensor=True)
            sim = util.cos_sim(jd_emb, res_emb).item()
            scores.append(sim)

            if verbose:
                print(f"\n🔹 JD Section: {jd_key} ↔ Resume Section: {res_key}")
                print(f"Similarity Score: {round(sim, 4)}")
        else:
            if verbose:
                print(f"\n🔹 JD Section: {jd_key} ↔ Resume Section: {res_key}")
                print("⚠️ One of the sections is empty. Skipping similarity computation.")

    final_semantic = np.mean(scores) if scores else 0.0
    if verbose:
        print(f"\n✅ Final Semantic Score (average): {round(final_semantic, 4)}")

    return final_semantic



In [15]:
def keyword_score(jd_text, resume_text, verbose=True):
    jd_keywords = set(re.findall(r"\b[a-zA-Z0-9\+\#\.]{2,}\b", jd_text.lower()))
    resume_keywords = set(re.findall(r"\b[a-zA-Z0-9\+\#\.]{2,}\b", resume_text.lower()))
    matched_keywords = jd_keywords.intersection(resume_keywords)
    unmatched_keywords = jd_keywords - resume_keywords

    if verbose:
        print("\n🟦 JD Keywords:", sorted(list(jd_keywords)))
        print("🟩 Matched Keywords:", sorted(list(matched_keywords)))
        print("🟥 Unmatched Keywords:", sorted(list(unmatched_keywords)))

    score = len(matched_keywords) / len(jd_keywords) if jd_keywords else 0.0
    return score

def section_overlap_score(jd_sections, resume_sections, verbose=True):
    overlap = 0
    total = 0

    if verbose:
        print("\n📘 Section Overlap Debug:")

    for key in ['required_skills', 'responsibilities', 'qualifications']:
        jd_text = jd_sections.get(key, '').lower()
        res_key = 'skills' if key == 'required_skills' else (
            'experience' if key == 'responsibilities' else 'education')
        res_text = resume_sections.get(res_key, '').lower()

        jd_words = set(jd_text.split())
        res_words = set(res_text.split())

        matched = jd_words.intersection(res_words)
        section_score = len(matched) / len(jd_words) if jd_words else 0.0

        if verbose:
            print(f"\n🔹 JD Section: {key} | Resume Section: {res_key}")
            print(f"JD Words ({len(jd_words)}): {sorted(list(jd_words))[:20]}...")
            print(f"Resume Words ({len(res_words)}): {sorted(list(res_words))[:20]}...")
            print(f"Matched ({len(matched)}): {sorted(list(matched))[:20]}...")
            print(f"Section Score: {round(section_score, 4)}")

        overlap += section_score
        total += 1

    return overlap / total if total else 0.0



In [16]:
def final_score(jd_text, resume_text):
    jd_sections = extract_sections(jd_text, is_resume=False)
    resume_sections = extract_sections(resume_text, is_resume=True)

    print("\n================ SEMANTIC MATCHING =================")
    sem_score = semantic_score(jd_sections, resume_sections, verbose=True)

    print("\n================ KEYWORD MATCHING ==================")
    key_score = keyword_score(jd_text, resume_text, verbose=True)

    print("\n================ SECTION OVERLAP ===================")
    sect_score = section_overlap_score(jd_sections, resume_sections, verbose=True)

    final = (0.6 * sem_score) + (0.2 * key_score) + (0.2 * sect_score)
    print("\n================ FINAL SCORE =======================")
    return {
        "semantic_score": round(sem_score, 4),
        "keyword_score": round(key_score, 4),
        "section_overlap_score": round(sect_score, 4),
        "final_score": round(final, 4)
    }



In [28]:
# Run this with your file paths
resume_path = "resumes/Dhruvraj_resume_May18.pdf"  # Replace if renamed
jd_path = "JDs/Data_engineering_intern_LiveRamp.txt"                # Upload your JD as a .txt file

resume_text = extract_text_from_pdf(resume_path)
jd_text = extract_text_from_txt(jd_path)

result = final_score(jd_text, resume_text)
result




📘 Semantic Similarity by Section:

🔹 JD Section: required_skills ↔ Resume Section: skills
Similarity Score: 0.4013

🔹 JD Section: responsibilities ↔ Resume Section: experience
Similarity Score: 0.471

🔹 JD Section: qualifications ↔ Resume Section: education
Similarity Score: 0.4959

✅ Final Semantic Score (average): 0.4561


🟦 JD Keywords: ['ability', 'about', 'across', 'actionable', 'activating', 'analysis', 'analysts', 'analytical', 'analytics', 'and', 'are', 'as', 'assist', 'attention', 'bachelor', 'banks', 'between', 'big', 'brand', 'brands', 'build', 'building', 'business', 'but', 'by', 'capabilities', 'cases', 'choice', 'clarity', 'collaborate', 'collaboration', 'collaboratively', 'companies', 'complete', 'complex', 'compliance', 'computer', 'concepts', 'connected', 'consumer', 'context', 'contribute', 'contributing', 'cross', 'currently', 'customer', 'dashboards', 'data', 'database', 'deepening', 'degree', 'design', 'designing', 'detail', 'developing', 'driven', 'enabling', 'e

{'semantic_score': 0.4561,
 'keyword_score': 0.2115,
 'section_overlap_score': 0.095,
 'final_score': 0.335}