In [1]:
import os
import re

# Define required skills from job description
required_skills = [
    "python", "django", "rest", "api", "aws", "git", "sql", "communication"
]

def clean_text(text):
    # Lowercase and remove non-word characters to match skills
    return re.findall(r'\b\w+\b', text.lower())

def load_job_description(filepath):
    # Load and clean job description text
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    return clean_text(content)

def load_resume(filepath):
    # Load and clean resume text
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    return clean_text(content)

def match_score(resume_words, required_skills):
    # Calculate how many required skills are matched in the resume
    resume_set = set(resume_words)
    matched = [skill for skill in required_skills if skill in resume_set]
    return len(matched), matched

def get_missing_skills(resume_words, required_skills):
    # Get skills that are missing in the resume
    resume_set = set(resume_words)
    missing = [skill for skill in required_skills if skill not in resume_set]
    return missing

def main():
    job_words = load_job_description("job.txt")
    print("Matching resumes based on required skills...\n")
    
    result = []
    for filename in os.listdir("resumes"):
        if filename.endswith(".txt"):
            path = os.path.join("resumes", filename)
            resume_words = load_resume(path)
            score, matched_skills = match_score(resume_words, required_skills)
            missing_skills = get_missing_skills(resume_words, required_skills)
            percent = round((score / len(required_skills)) * 100, 2)
            result.append((filename, percent, matched_skills, missing_skills))
    
    # Sort by highest match percentage
    result.sort(key=lambda x: x[1], reverse=True)
    
    print("\nDetailed report:\n")
    for i, (filename, percent, matched_skills, missing_skills) in enumerate(result, 1):
        print(f"Resume {i}: {filename}")
        print(f"  Match Percentage: {percent}%")
        print(f"  Matched Skills: {', '.join(matched_skills)}")
        print(f"  Missing Skills: {', '.join(missing_skills) if missing_skills else 'None'}\n")
    
    # Adding ranking system
    print("\nRanking based on match percentage:")
    for i, (filename, percent, matched_skills, missing_skills) in enumerate(result, 1):
        print(f"Rank {i}: {filename} – {percent}% match")
        print(f"   Matched skills: {', '.join(matched_skills)}")
        print(f"   Missing skills: {', '.join(missing_skills) if missing_skills else 'None'}\n")

if __name__ == "__main__":
    main()


Matching resumes based on required skills...


Detailed report:

Resume 1: resume2.txt
  Match Percentage: 100.0%
  Matched Skills: python, django, rest, api, aws, git, sql, communication
  Missing Skills: None

Resume 2: resume1.txt
  Match Percentage: 87.5%
  Matched Skills: python, django, rest, aws, git, sql, communication
  Missing Skills: api

Resume 3: resume3.txt
  Match Percentage: 87.5%
  Matched Skills: python, django, rest, aws, git, sql, communication
  Missing Skills: api


Ranking based on match percentage:
Rank 1: resume2.txt – 100.0% match
   Matched skills: python, django, rest, api, aws, git, sql, communication
   Missing skills: None

Rank 2: resume1.txt – 87.5% match
   Matched skills: python, django, rest, aws, git, sql, communication
   Missing skills: api

Rank 3: resume3.txt – 87.5% match
   Matched skills: python, django, rest, aws, git, sql, communication
   Missing skills: api



In [3]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
    --------------------------------------- 0.3/18.7 MB ? eta -:--:--
   - -------------------------------------- 0.8/18.7 MB 3.4 MB/s eta 0:00:06
   --- ------------------------------------ 1.6/18.7 MB 3.1 MB/s eta 0:00:06
   ---- ----------------------------------- 2.1/18.7 MB 2.9 MB/s eta 0:00:06
   ----- ---------------------------------- 2.6/18.7 MB 3.0 MB/s eta 0:00:06
   ------ --------------------------------- 3.1/18.7 MB 2.9 MB/s eta 0:00:06
   ------- -------------------------------- 3.4/18.7 MB 2.8 MB/s eta 0:00:06
   -------- ------------------------------- 3.9/18.7 MB 2.6 MB/s eta 0:00:06
   ---------- ----------------------------- 4.7/18.7 MB 2.6 MB/s eta 0:00:06
   ---------- ----------------------------- 5.0/18.7 MB 2.6 MB/s eta 0:00:06
   ----------- -

    extract-msg (<=0.29.*)
                 ~~~~~~~^

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import fitz  # PyMuPDF
from docx import Document

def extract_text(filepath):
    text = ""
    if filepath.endswith(".pdf"):
        with fitz.open(filepath) as pdf:
            text = "\n".join(page.get_text() for page in pdf)
    elif filepath.endswith(".docx"):
        doc = Document(filepath)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif filepath.endswith(".txt"):
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read()
    else:
        raise ValueError("Unsupported file type.")
    return text



import re

def extract_contact_details(text):
    email = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.findall(r'\b(?:\+?\d{1,3})?[-.\s]?\(?\d{2,4}\)?[-.\s]?\d{3,5}[-.\s]?\d{3,5}\b', text)
    
    # Name and Address (naive approach)
    lines = text.split('\n')
    possible_name = lines[0].strip()
    possible_address = next((line for line in lines if "address" in line.lower()), "Not Found")

    return {
        "name": possible_name,
        "email": email[0] if email else "Not Found",
        "phone": phone[0] if phone else "Not Found",
        "address": possible_address
    }


def extract_education(text):
    education_keywords = ["b.tech", "bachelor", "m.tech", "master", "phd", "msc", "bsc", "mba",
                          "diploma", "degree"]
    education_found = []
    for line in text.lower().split('\n'):
        if any(keyword in line for keyword in education_keywords):
            education_found.append(line.strip())

    return {
        "total_qualifications": len(education_found),
        "qualifications": education_found
    }



def extract_experience(text):
    domain_keywords = ["software", "developer", "manager", "sales", "marketing", "data science",
                       "machine learning", "finance"]
    experience_list = []
    
    for line in text.lower().split('\n'):
        if "experience" in line or any(domain in line for domain in domain_keywords):
            experience_list.append(line.strip())

    return {
        "total_experiences": len(experience_list),
        "experiences": experience_list
    }




file_path = "resumes/resume2.pdf"
resume_text = extract_text(file_path)

contact = extract_contact_details(resume_text)
education = extract_education(resume_text)
experience = extract_experience(resume_text)

print("Candidate Details:", contact)
print("Education:", education)
print("Experience:", experience)


Candidate Details: {'name': 'Jason Miller', 'email': 'email@email.com', 'phone': '\n3868683442', 'address': 'Not Found'}
Education: {'total_qualifications': 2, 'qualifications': ['participating in kaizen events, gemba walks, and 5s to remove barriers', 'associates degree in logistics and supply chain fundamentals,']}
Experience: {'total_experiences': 1, 'experiences': ['experienced amazon associate with five years’ tenure in a shipping yard']}
