<a href="https://colab.research.google.com/github/cherypallysaisurya/ResuVerse/blob/main/R_P(Batch_Matching).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy langchain
!python -m spacy download en_core_web_lg
!pip install transformers python-docx PyPDF2 Pillow


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━

In [None]:
import os
import re
import logging
import json
from typing import Dict, Tuple, Set, List
from docx import Document
from PyPDF2 import PdfReader
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

# Configure the root logger: INFO and above, each record prefixed with
# its timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


class ResumeAnalyzer:
    """Rank resumes against a job description.

    Skills are mined from the job description with spaCy (noun chunks,
    proper nouns and compounds), looked up in each resume, and combined
    with a TF-IDF cosine similarity between the two texts.
    """

    # File extensions (lower-case) that extract_text knows how to read.
    SUPPORTED_EXTENSIONS = (".pdf", ".docx")
    # Noun chunks containing any of these words are not treated as skills.
    _DETERMINERS = frozenset({"the", "a", "an", "this", "that", "these", "those"})
    # Noun chunks whose root has one of these POS tags are not skills.
    _REJECTED_ROOT_POS = frozenset({"PRON", "DET", "ADP"})
    # Minimum spaCy vector similarity for a semantic (non-literal) match.
    SIMILARITY_THRESHOLD = 0.85

    def __init__(self):
        self.nlp = spacy.load("en_core_web_lg")

    def extract_text(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        The extension check is case-insensitive. Any failure (unsupported
        format, unreadable file) is logged and reported as an empty string.
        """
        lowered_path = file_path.lower()
        try:
            if lowered_path.endswith(".pdf"):
                reader = PdfReader(file_path)
                page_texts = (page.extract_text() for page in reader.pages)
                # Pages without extractable text yield None/"" and are skipped.
                return "\n".join(page_text for page_text in page_texts if page_text)
            if lowered_path.endswith(".docx"):
                doc = Document(file_path)
                return "\n".join(para.text for para in doc.paragraphs)
            raise ValueError(f"Unsupported file format: {file_path}")
        except Exception as e:
            logging.error(f"Error extracting text from {file_path}: {str(e)}")
            return ""

    def preprocess_text(self, text: str) -> str:
        """Strip contact details and punctuation noise, then lower-case."""
        text = re.sub(r"\S+@\S+", "", text)  # Remove emails
        text = re.sub(r"\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{3,4}", "", text)  # Remove phone numbers
        # Keep word characters, whitespace and the punctuation . , : / -
        # The hyphen sits last so it is a literal, not a range operator
        # (the previous ",-:" was an accidental range that happened to keep
        # the same characters).
        text = re.sub(r"[^\w\s.,:/-]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
        return text.lower()

    def extract_skills_from_job_description(self, text: str) -> Set[str]:
        """Collect candidate skill phrases from a job description.

        Returns lower-cased noun chunks (at most three words, at least three
        characters, no determiners), proper-noun / all-caps tokens, and
        two-word compounds.
        """
        doc = self.nlp(text)
        skills = set()

        for chunk in doc.noun_chunks:
            skill = chunk.text.lower().strip()
            if len(skill.split()) > 3 or len(skill) < 3:
                continue
            if chunk.root.pos_ in self._REJECTED_ROOT_POS:
                continue
            if any(word.text.lower() in self._DETERMINERS for word in chunk):
                continue
            skills.add(skill)

        for token in doc:
            # NOTE: callers that lower-case the text first (as
            # process_multiple_resumes does) never trigger the isupper branch.
            if token.pos_ == "PROPN" or (token.text.isupper() and len(token.text) >= 2):
                skills.add(token.text.lower())
            if token.dep_ == "compound":
                compound = f"{token.text} {token.head.text}".lower()
                if len(compound.split()) <= 3:
                    skills.add(compound)

        return skills

    @staticmethod
    def _contains_term(text: str, term: str) -> bool:
        """Return True if *term* occurs in *text* as a whole word or phrase."""
        if not term:
            return False
        # Lookarounds instead of \b so terms ending in punctuation still match.
        return re.search(rf"(?<!\w){re.escape(term)}(?!\w)", text) is not None

    def find_matching_skills(self, text: str, required_skills: Set[str]) -> Set[str]:
        """Return the subset of *required_skills* found in the resume *text*.

        A skill matches when it appears literally as a whole word/phrase, or
        when its vector similarity to some noun chunk of the resume exceeds
        SIMILARITY_THRESHOLD. Whole-word matching prevents short skills such
        as "r" from matching inside unrelated words.
        """
        found_skills = set()
        text_lower = text.lower()
        chunks = None  # resume noun chunks, parsed only if a fallback is needed

        for skill in required_skills:
            if self._contains_term(text_lower, skill.lower()):
                found_skills.add(skill)
                continue

            if chunks is None:
                # Drop chunks without a vector: similarity on them is
                # meaningless and makes spaCy emit a warning.
                chunks = [chunk for chunk in self.nlp(text).noun_chunks if chunk.vector_norm]

            skill_doc = self.nlp(skill)
            if not skill_doc.vector_norm:
                continue
            if any(skill_doc.similarity(chunk) > self.SIMILARITY_THRESHOLD for chunk in chunks):
                found_skills.add(skill)

        return found_skills

    def calculate_cosine_similarity(self, text1: str, text2: str) -> float:
        """Return the TF-IDF cosine similarity of two texts in [0, 1].

        Returns 0.0 when either text is blank or when no usable vocabulary
        remains after stop-word removal.
        """
        if not text1.strip() or not text2.strip():
            return 0.0
        try:
            tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform([text1, text2])
        except ValueError:
            # Raised by scikit-learn for an empty vocabulary.
            return 0.0
        return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    def analyze_resume(self, resume_text: str, job_description: str, filename: str) -> Dict:
        """Score one resume against the job description.

        Returns a dict with the filename, both scores as percentages, the
        matching and missing skills (sorted for reproducible output) and the
        boolean ``is_fit`` verdict.
        """
        required_skills = self.extract_skills_from_job_description(job_description)
        matching_skills = self.find_matching_skills(resume_text, required_skills)
        missing_skills = required_skills - matching_skills

        similarity_score = self.calculate_cosine_similarity(resume_text, job_description)

        skill_match_ratio = len(matching_skills) / len(required_skills) if required_skills else 0.0
        is_fit = skill_match_ratio >= 0.7 and similarity_score > 0.6

        return {
            "filename": filename,
            "similarity_score": round(similarity_score * 100, 2),
            "skill_match_ratio": round(skill_match_ratio * 100, 2),
            "matching_skills": sorted(matching_skills),
            "missing_skills": sorted(missing_skills),
            "is_fit": is_fit
        }

    def process_multiple_resumes(self, resume_dir: str, job_description: str) -> List[Dict]:
        """Analyze every PDF/DOCX in *resume_dir* and rank the results.

        Files are visited in sorted order so ties rank reproducibly. A
        failure on one file is logged and does not stop the batch. Results
        are ordered by similarity score, then skill match ratio, descending.
        """
        results = []
        job_description = self.preprocess_text(job_description)

        for filename in sorted(os.listdir(resume_dir)):
            if not filename.lower().endswith(self.SUPPORTED_EXTENSIONS):
                continue
            try:
                file_path = os.path.join(resume_dir, filename)
                resume_text = self.preprocess_text(self.extract_text(file_path))
                results.append(self.analyze_resume(resume_text, job_description, filename))
            except Exception as e:
                logging.error(f"Error processing {filename}: {str(e)}")

        results.sort(key=lambda x: (x["similarity_score"], x["skill_match_ratio"]), reverse=True)

        return results


def main():
    """Interactively rank the resumes in a directory against a job description.

    Prompts for the resume directory and the job description, prints a
    per-resume report, writes the full ranking to a timestamped JSON file in
    the current directory, and finally highlights the top-ranked resume.
    """
    resume_dir = input("Enter the path to the resume directory: ").strip()
    job_description = input("Enter the job description: ").strip()

    # Validate before loading the (large) spaCy model.
    if not os.path.isdir(resume_dir):
        print(f"Error: resume directory not found: {resume_dir}")
        return

    analyzer = ResumeAnalyzer()

    # Top-level boundary: report any failure instead of dumping a traceback.
    try:
        ranked_resumes = analyzer.process_multiple_resumes(resume_dir, job_description)

        if not ranked_resumes:
            # Nothing to rank: skip the empty report and the empty JSON file.
            print(f"No PDF or DOCX resumes could be processed in {resume_dir}")
            return

        print("\n=== Resume Ranking Results ===")
        for rank, resume in enumerate(ranked_resumes, 1):
            print(f"\nRank {rank}: {resume['filename']}")
            print(f"Similarity Score: {resume['similarity_score']}%")
            print(f"Skill Match Ratio: {resume['skill_match_ratio']}%")
            print(f"Fit for Position: {'Yes' if resume['is_fit'] else 'No'}")

            print("\nMatching Skills:")
            for skill in resume["matching_skills"]:
                print(f"  ✓ {skill}")

            print("\nMissing Skills:")
            for skill in resume["missing_skills"]:
                print(f"  ✗ {skill}")
            print("-" * 50)

        output_file = f"resume_ranking_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(ranked_resumes, f, indent=2)

        print(f"\nDetailed results saved to {output_file}")

        best_resume = ranked_resumes[0]
        print("\n=== Best Matching Resume ===")
        print(f"Filename: {best_resume['filename']}")
        print(f"Similarity Score: {best_resume['similarity_score']}%")
        print(f"Skill Match Ratio: {best_resume['skill_match_ratio']}%")

    except Exception as e:
        print(f"Error: {str(e)}")


# Run the interactive workflow only when this module is the entry point.
if __name__ == "__main__":
    main()


Enter the path to the resume directory: /content/test
Enter the job description: Enlyte is the parent brand of Mitchell, Genex and Coventry, an organization unlike any other in the Property & Casualty industry, bringing together three great businesses with a shared vision of using technology innovation, clinical services and network solutions to help our customers and the people they serve. Our suite of products and services enable our employees to help people recover from challenging life events, while providing opportunities for meaningful impact and career growth.    Responsibilities  We are building next generation suite of smart product solutions using Computer Vision, Advanced Analytics and Artificial Intelligence [Deep Learning]. We are looking for engineers and technologists to help build the next generation of systems, tools and features for our cutting-edge products and platforms that support millions of transactions. This team is the focal point in our work, bringing the lat

  if skill_doc.similarity(chunk) > 0.85:



=== Resume Ranking Results ===

Rank 1: Sai_surya_rakuten.pdf
Similarity Score: 27.5%
Skill Match Ratio: 16.79%
Fit for Position: No

Matching Skills:
  ✓ tools
  ✓ deep learning
  ✓ machine learning theory
  ✓ ability
  ✓ computer vision
  ✓ predictive models
  ✓ techniques
  ✓ deep
  ✓ data
  ✓ understanding
  ✓ models
  ✓ systems
  ✓ communication skills
  ✓ cnns
  ✓ machine learning techniques
  ✓ python
  ✓ r
  ✓ advanced analytics
  ✓ treescoding skills
  ✓ learning
  ✓ master s degree
  ✓ data science
  ✓ machine learning

Missing Skills:
  ✗ computer science
  ✗ genex
  ✗ high data diversity
  ✗ errors.generating predictions
  ✗ recall
  ✗ selection
  ✗ platforms
  ✗ opportunity
  ✗ generation suite
  ✗ data diversity
  ✗ ensembles
  ✗ mathematics
  ✗ our customers
  ✗ decisions
  ✗ high response
  ✗ opportunities
  ✗ related degreegpa
  ✗ build models
  ✗ algorithms
  ✗ life events
  ✗ property casualty
  ✗ engineers
  ✗ new algorithmic ideasa
  ✗ labelled outcomes
  ✗ people

In [None]:
import os
import re
import logging
import json
from typing import Dict, Tuple, Set, List
from docx import Document
from PyPDF2 import PdfReader
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

# Configure the root logger: INFO and above, each record prefixed with
# its timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


class ResumeAnalyzer:
    """Rank resumes against a job description.

    Skills are taken from a fixed list of known technologies plus every noun
    and proper noun spaCy finds; the overlap between resume and job
    description is combined with a TF-IDF cosine similarity of the texts.
    """

    # File extensions (lower-case) that extract_text knows how to read.
    SUPPORTED_EXTENSIONS = (".pdf", ".docx")

    # Known technical skills. Lookarounds replace \b because \b cannot match
    # next to the non-word characters in "C++", "C#" and ".NET"; the dot in
    # ".NET" is escaped so it no longer matches an arbitrary character.
    _KNOWN_SKILL_RE = re.compile(
        r"(?<!\w)(JavaScript|ReactJS|Python|Java|C\+\+|C#|TypeScript|SQL|T-SQL|\.NET|ASP\.NET|"
        r"Node\.js|Azure|AWS|DevOps|OAuth2|JWT|REST|SOAP|GraphQL|Jenkins|Kubernetes|Docker|"
        r"Spring Boot|MongoDB|PostgreSQL|MySQL|Visual Studio|TFS|CI/CD|Agile|Jira|Scrum)(?!\w)",
        re.IGNORECASE,
    )

    def __init__(self):
        self.nlp = spacy.load("en_core_web_lg")

    def extract_text(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        The extension check is case-insensitive. Any failure (unsupported
        format, unreadable file) is logged and reported as an empty string.
        """
        lowered_path = file_path.lower()
        try:
            if lowered_path.endswith(".pdf"):
                reader = PdfReader(file_path)
                text = []
                for page in reader.pages:
                    # Extract once per page; the call is expensive.
                    page_text = page.extract_text()
                    if page_text:
                        text.append(page_text)
                return "\n".join(text)
            if lowered_path.endswith(".docx"):
                doc = Document(file_path)
                return "\n".join(para.text for para in doc.paragraphs)
            raise ValueError(f"Unsupported file format: {file_path}")
        except Exception as e:
            logging.error(f"Error extracting text from {file_path}: {str(e)}")
            return ""

    def preprocess_text(self, text: str) -> str:
        """Strip contact details and punctuation noise, then lower-case."""
        text = re.sub(r"\S+@\S+", "", text)  # Remove emails
        text = re.sub(r"\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{3,4}", "", text)  # Remove phone numbers
        # Keep word characters, whitespace and . , : / + # -
        # "/", "+" and "#" are kept so "CI/CD", "C++" and "C#" survive to
        # skill matching. The hyphen sits last so it is a literal rather
        # than the accidental ",-:" range of the earlier pattern.
        text = re.sub(r"[^\w\s.,:/+#-]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
        return text.lower()

    @classmethod
    def _match_known_skills(cls, text: str) -> Set[str]:
        """Return the lower-cased known technical skills mentioned in *text*."""
        return {match.lower() for match in cls._KNOWN_SKILL_RE.findall(text)}

    def extract_relevant_skills(self, text: str) -> Set[str]:
        """Return candidate skills found in *text*, lower-cased.

        The result is the union of the known technical skills matched by
        regex and every token spaCy tags as NOUN or PROPN. No further
        filtering of generic nouns is applied.
        """
        doc = self.nlp(text)
        noun_terms = {token.text.lower() for token in doc if token.pos_ in {"NOUN", "PROPN"}}
        return self._match_known_skills(text) | noun_terms

    def find_matching_skills(self, resume_text: str, job_description: str) -> Tuple[Set[str], Set[str]]:
        """Split the job's skills into those present in and absent from the resume.

        Returns ``(matching_skills, missing_skills)``; the two sets are
        disjoint and together make up the skills found in the job description.
        """
        required_skills = self.extract_relevant_skills(job_description)
        resume_skills = self.extract_relevant_skills(resume_text)

        matching_skills = resume_skills & required_skills
        missing_skills = required_skills - matching_skills

        return matching_skills, missing_skills

    def calculate_cosine_similarity(self, text1: str, text2: str) -> float:
        """Return the TF-IDF cosine similarity of two texts in [0, 1].

        Returns 0.0 when either text is blank or when no usable vocabulary
        remains after stop-word removal.
        """
        if not text1.strip() or not text2.strip():
            return 0.0
        try:
            tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform([text1, text2])
        except ValueError:
            # Raised by scikit-learn for an empty vocabulary.
            return 0.0
        return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    def analyze_resume(self, resume_text: str, job_description: str, filename: str) -> Dict:
        """Score one resume against the job description.

        Returns a dict with the filename, both scores as percentages, the
        matching and missing skills (sorted for reproducible output) and the
        boolean ``is_fit`` verdict.
        """
        matching_skills, missing_skills = self.find_matching_skills(resume_text, job_description)
        similarity_score = self.calculate_cosine_similarity(resume_text, job_description)

        required_count = len(matching_skills | missing_skills)
        # A job description with no detectable skills scores 0, not 100%.
        skill_match_ratio = len(matching_skills) / required_count if required_count else 0.0
        is_fit = skill_match_ratio >= 0.7 and similarity_score > 0.6

        return {
            "filename": filename,
            "similarity_score": round(similarity_score * 100, 2),
            "skill_match_ratio": round(skill_match_ratio * 100, 2),
            "matching_skills": sorted(matching_skills),
            "missing_skills": sorted(missing_skills),
            "is_fit": is_fit
        }

    def process_multiple_resumes(self, resume_dir: str, job_description: str) -> List[Dict]:
        """Analyze every PDF/DOCX in *resume_dir* and rank the results.

        Files are visited in sorted order so ties rank reproducibly. A
        failure on one file is logged and does not stop the batch. Results
        are ordered by similarity score, then skill match ratio, descending.
        """
        results = []
        job_description = self.preprocess_text(job_description)

        for filename in sorted(os.listdir(resume_dir)):
            if not filename.lower().endswith(self.SUPPORTED_EXTENSIONS):
                continue
            try:
                file_path = os.path.join(resume_dir, filename)
                resume_text = self.preprocess_text(self.extract_text(file_path))
                results.append(self.analyze_resume(resume_text, job_description, filename))
            except Exception as e:
                logging.error(f"Error processing {filename}: {str(e)}")

        results.sort(key=lambda x: (x["similarity_score"], x["skill_match_ratio"]), reverse=True)
        return results


def main():
    """Interactively rank the resumes in a directory against a job description.

    Prompts for the resume directory and the job description, prints a
    per-resume report, writes the full ranking to a timestamped JSON file in
    the current directory, and finally highlights the top-ranked resume.
    """
    resume_dir = input("Enter the path to the resume directory: ").strip()
    job_description = input("Enter the job description: ").strip()

    # Validate before loading the (large) spaCy model.
    if not os.path.isdir(resume_dir):
        print(f"Error: resume directory not found: {resume_dir}")
        return

    analyzer = ResumeAnalyzer()

    # Top-level boundary: report any failure instead of dumping a traceback.
    try:
        ranked_resumes = analyzer.process_multiple_resumes(resume_dir, job_description)

        if not ranked_resumes:
            # Nothing to rank: skip the empty report and the empty JSON file.
            print(f"No PDF or DOCX resumes could be processed in {resume_dir}")
            return

        print("\n=== Resume Ranking Results ===")
        for rank, resume in enumerate(ranked_resumes, 1):
            print(f"\nRank {rank}: {resume['filename']}")
            print(f"Similarity Score: {resume['similarity_score']}%")
            print(f"Skill Match Ratio: {resume['skill_match_ratio']}%")
            print(f"Fit for Position: {'Yes' if resume['is_fit'] else 'No'}")

            print("\nMatching Skills:")
            for skill in resume["matching_skills"]:
                print(f"  ✓ {skill}")

            print("\nMissing Skills:")
            for skill in resume["missing_skills"]:
                print(f"  ✗ {skill}")
            print("-" * 50)

        output_file = f"resume_ranking_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(ranked_resumes, f, indent=2)

        print(f"\nDetailed results saved to {output_file}")

        best_resume = ranked_resumes[0]
        print("\n=== Best Matching Resume ===")
        print(f"Filename: {best_resume['filename']}")
        print(f"Similarity Score: {best_resume['similarity_score']}%")
        print(f"Skill Match Ratio: {best_resume['skill_match_ratio']}%")

    except Exception as e:
        print(f"Error: {str(e)}")


# Run the interactive workflow only when this module is the entry point.
if __name__ == "__main__":
    main()


Enter the path to the resume directory: /content/drive/MyDrive/resumes
Enter the job description: Posting Title: Software Developer - .NET and ReactJS The role of the Software Developer is to assist in providing technical and operational support to projects and programs. This role involves working under direct supervision to maintains, adapts and updates existing systems to meet user requirements and to enhance program efficiency. Acentra Health supports a high-volume healthcare data system that is accessed through multiple web portals.  What you will do: •	Maintains, adapts, and updates existing systems to meet user requirements and to enhance program efficiency. Researches and documents user requirements and system specifications. •	Designs and develops program logic addressing specific programming needs. Translates business requirements and functional specifications into detailed system and program design specifications. Assumes responsibility for program design activities including