In [None]:
%pip install pypdf2
%pip install pdfplumber
%pip install pymupdf
%pip install langchain langchain-core langchain-community
%pip install  langchain-google-genai



# 1. Creating a PDF Extractor

In [None]:
from pathlib import Path
from typing import Any, Dict, Optional

import pdfplumber
import PyPDF2

class PDFParser:
    """Extract raw text from PDF files.

    pdfplumber is tried first; PyPDF2 is used as a fallback when pdfplumber
    yields fewer than ``MIN_TEXT_LENGTH`` characters.
    """

    # Extractions shorter than this many characters are treated as failures.
    MIN_TEXT_LENGTH = 50

    def __init__(self):
        pass

    def extract_text_pdfplumber(self, pdf_path: Path) -> str:
        """Extract text using pdfplumber (better for tables).

        Args:
            pdf_path: Location of the PDF file.

        Returns:
            The text of all pages joined by newlines and stripped, or ``""``
            when pdfplumber raises (the error is printed, not propagated).
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # A page without extractable text yields None; treat it as empty.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
        except Exception as e:
            print(f"Error with pdfplumber: {e}")
            return ""
        # Join with newlines so the last word of one page does not fuse with
        # the first word of the next page.
        return "\n".join(page_texts).strip()

    def extract_text_pypdf2(self, pdf_path: Path) -> str:
        """Fallback: Extract text using PyPDF2.

        Args:
            pdf_path: Location of the PDF file.

        Returns:
            The text of all pages joined by newlines and stripped, or ``""``
            when PyPDF2 raises (the error is printed, not propagated).
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                page_texts = [page.extract_text() or "" for page in reader.pages]
        except Exception as e:
            print(f"Error with PyPDF2: {e}")
            return ""
        return "\n".join(page_texts).strip()

    def extract_text(self, pdf_path: Path) -> Dict[str, Any]:
        """Main extraction method with fallback.

        Args:
            pdf_path: Path (or path string) of the PDF to read.

        Returns:
            A dict with ``filename``, ``raw_text``, ``text_length`` and
            ``success`` (always ``True`` when a value is returned).

        Raises:
            FileNotFoundError: ``pdf_path`` does not exist.
            ValueError: neither backend produced at least
                ``MIN_TEXT_LENGTH`` characters.
        """
        pdf_path = Path(pdf_path)

        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        text = self.extract_text_pdfplumber(pdf_path)

        # Too little text usually means the primary backend failed; retry
        # with the second backend.
        if len(text) < self.MIN_TEXT_LENGTH:
            text = self.extract_text_pypdf2(pdf_path)

        if len(text) < self.MIN_TEXT_LENGTH:
            raise ValueError(f"Failed to extract text from {pdf_path}")

        return {
            "filename": pdf_path.name,
            "raw_text": text,
            "text_length": len(text),
            "success": True
        }

In [None]:
# Parse the sample resume; `extracted_cv` holds the dict returned by
# PDFParser.extract_text (filename, raw_text, text_length, success).
parser = PDFParser()
extracted_cv = parser.extract_text(pdf_path="Jillani Resume.pdf")


In [None]:
# Report how many characters of text were extracted from the PDF.
print(f"Extracted {extracted_cv['text_length']} characters")

Extracted 13919 characters


In [None]:
# Show the metadata plus a short preview instead of the whole resume text:
# the full text is long and contains personal contact details.
{**extracted_cv, "raw_text": extracted_cv["raw_text"][:300] + "..."}

{'filename': 'Jillani Resume.pdf',
 'raw_text': 'Muhammad Ghulam Jillani\nLinkedIn | +92-321-1174167 | +92-321-1179584 | JillaniPortfolio.com | m.g.jillani123@gmail.com | Kaggle | GitHub | Medium\nProfessional Summary __________________________________________________________________________________\nSenior Data Scientist and Machine Learning Engineer specializing in Generative AI, LLMs, and Autonomous AI Systems, with a proven record\nof transforming SaaS and PaaS platforms through innovative enterprise AI and agentic AI solutions. Expertise in optimizing workflows,\nstreamlining data pipelines, and building scalable AI architectures to solve complex business challenges. Recognized as a 24x LinkedIn Top\nVoice, Top 100 Global Kaggle Master, and KaggleX BIPOC Mentor, contributing to the NVIDIA Developer Program, Google Developer Group,\nand AWS AI Community. Demonstrated leadership in AI-driven product innovation, LLMOps strategies, and multimodal AI, delivering impactful\nresults with

# 2. Setting up the extractor

## 2.1 Defining the Model for Resume Information

In [None]:
from pydantic import BaseModel, Field
from typing import List, Optional, Dict
import json

In [None]:
# Candidate location. Every field is optional and defaults to None.
class Location(BaseModel):
    city: Optional[str] = None
    countryCode: Optional[str] = None  # e.g. "US" in the sample output
    region: Optional[str] = None

# One position held by the candidate. `company` and `position` are required;
# everything else is optional.
class WorkExperience(BaseModel):
    company: str
    position: str
    # Dates are kept as free-form strings; CVExtractor.total_experience parses
    # them with CVExtractor.parse_date.
    startDate: Optional[str] = None
    endDate: Optional[str] = None
    summary: Optional[str] = None
    highlights: Optional[List[str]] = []

# One education entry. Only `institution` is required. These fields are the
# ones JobRecommender.calculate_education_match reads to build the text it embeds.
class Education(BaseModel):
    institution: str
    degree: Optional[str] = None
    field: Optional[str] = None
    startDate: Optional[str] = None
    endDate: Optional[str] = None
    gpa: Optional[str] = None  # stored as text, not as a number

# Top-level structured CV record that the LLM output is parsed into.
class CVData(BaseModel):
    name: str = Field(description="Full name of the candidate")
    # Optional fields carry an explicit default of None. Under Pydantic v2 an
    # Optional annotation without a default is still a *required* field, so a
    # CV lacking an email, phone or summary would otherwise fail validation.
    email: Optional[str] = Field(default=None, description="Email address")
    phone: Optional[str] = Field(default=None, description="Phone number")
    location: Optional[Location] = None
    summary: Optional[str] = Field(default=None, description="Professional summary or objective")
    work: List[WorkExperience] = Field(default_factory=list)
    education: List[Education] = Field(default_factory=list)
    skills: List[str] = Field(default_factory=list)
    languages: List[str] = Field(default_factory=list)
    certifications: List[str] = Field(default_factory=list)
    # Overwritten by CVExtractor.extract with a value computed from `work`.
    total_experience: float = Field(default=0.0)

## 2.2 Creating the extractor

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from google.colab import userdata
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Take the Gemini key from the Colab secret store only when the environment
# does not already provide GOOGLE_API_KEY.
if os.environ.get("GOOGLE_API_KEY") is None:
    os.environ["GOOGLE_API_KEY"] = userdata.get("GEMINI_API_KEY")

class CVExtractor:
    """LLM-based CV information extraction.

    Builds a prompt -> Gemini chat model -> Pydantic parser chain that turns
    raw resume text into a ``CVData``-shaped dict, then computes the total
    years of experience from the extracted work history.
    """

    # Date layouts accepted by parse_date, tried in order. The month-name
    # layouts (%b / %B) depend on the active locale.
    DATE_FORMATS = (
        "%m/%Y", "%Y-%m-%d", "%d/%m/%Y", "%Y/%m/%d",
        "%Y-%m", "%b %Y", "%B %Y", "%Y",
    )
    # Values meaning "this position is still ongoing".
    ONGOING_TOKENS = frozenset({"currently", "present"})

    def __init__(self, llm_model: str = "models/gemini-2.5-flash-lite",
                 temperature: float = 0.0):
        """Create the model client, output parser and prompt.

        Args:
            llm_model: Gemini model identifier.
            temperature: Sampling temperature. Defaults to 0.0 because
                structured extraction should be as repeatable as possible.
        """
        self.llm = ChatGoogleGenerativeAI(
            model=llm_model,
            temperature=temperature,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )
        self.parser = PydanticOutputParser(pydantic_object=CVData)

        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert CV/Resume parser. Extract structured information from resumes.
            Be thorough but accurate. If information is not present, leave fields empty.

            {format_instructions}"""),
            ("user", "Extract information from this CV:\n\n{cv_text}")
        ])

    def extract(self, cv_text: str) -> Dict:
        """Extract structured data from CV text.

        Args:
            cv_text: Raw resume text.

        Returns:
            The parsed CV as a dict with ``total_experience`` recomputed from
            the work history. If the structured chain fails for any reason,
            the error is printed and the result of ``_fallback_extraction``
            is returned instead.
        """
        chain = self.prompt | self.llm | self.parser

        try:
            result = chain.invoke({
                "cv_text": cv_text,
                "format_instructions": self.parser.get_format_instructions()
            })

            cv = result.model_dump()
            cv['total_experience'] = self.total_experience(cv['work'])

            return cv

        except Exception as e:
            print(f"Extraction error: {e}")
            return self._fallback_extraction(cv_text)

    def parse_date(self, date_str):
        """Parse a date string into a datetime object.

        Args:
            date_str: A date in one of ``DATE_FORMATS``, an "ongoing" token
                such as 'Currently' or 'Present', or an empty value.

        Returns:
            ``datetime.now()`` for empty values and ongoing tokens, the parsed
            datetime for recognised layouts, and ``None`` otherwise.
        """
        if not date_str:
            return datetime.now()

        cleaned = str(date_str).strip()
        if not cleaned or cleaned.lower() in self.ONGOING_TOKENS:
            return datetime.now()

        for fmt in self.DATE_FORMATS:
            try:
                return datetime.strptime(cleaned, fmt)
            except ValueError:
                continue
        # No layout matched.
        return None

    def total_experience(self, work_list):
        """Calculate total experience in years from a list of job dicts.

        Jobs with a missing or unparseable start date, an unparseable end
        date, or an end that is not after the start are skipped. A missing end
        date is treated as "still ongoing". Overlapping positions are merged
        so that concurrent jobs are counted once.

        Args:
            work_list: Iterable of dicts with ``startDate`` / ``endDate`` keys.

        Returns:
            Years of experience rounded to two decimals.
        """
        intervals = []
        for job in work_list or []:
            raw_start = job.get("startDate")
            if not raw_start:
                # parse_date maps empty values to "now", which would make the
                # job's duration negative; an unknown start cannot be counted.
                continue
            start = self.parse_date(raw_start)
            end = self.parse_date(job.get("endDate"))
            if start is None or end is None or end <= start:
                continue
            intervals.append((start, end))

        intervals.sort()

        total_days = 0
        span_start = span_end = None
        for start, end in intervals:
            if span_end is None or start > span_end:
                # Gap before this job: close the running span and open a new one.
                if span_end is not None:
                    total_days += (span_end - span_start).days
                span_start, span_end = start, end
            elif end > span_end:
                # Overlapping job that extends the running span.
                span_end = end
        if span_end is not None:
            total_days += (span_end - span_start).days

        # 365.25 accounts for leap years.
        return round(total_days / 365.25, 2)

    def _fallback_extraction(self, cv_text: str) -> Dict:
        """Simple fallback extraction.

        Asks the model for free-form JSON and parses it leniently.

        Args:
            cv_text: Raw resume text.

        Returns:
            The parsed JSON object, or ``{"name": "Unknown", "raw_text": cv_text}``
            when the reply is not a JSON object.
        """
        messages = [
            {"role": "system", "content": "Extract CV information as JSON. Include name, email, phone, skills, work experience, education."},
            {"role": "user", "content": cv_text}
        ]

        response = self.llm.invoke(messages)
        unknown = {"name": "Unknown", "raw_text": cv_text}

        try:
            json_str = response.content.strip()
            # Strip a Markdown code fence if the model wrapped its answer in one.
            if "```json" in json_str:
                json_str = json_str.split("```json")[1].split("```")[0]
            elif json_str.startswith("```"):
                json_str = json_str.split("```")[1]
            parsed = json.loads(json_str)
        except (AttributeError, IndexError, TypeError, ValueError):
            # Non-string content, malformed fence or invalid JSON.
            return unknown

        return parsed if isinstance(parsed, dict) else unknown


In [None]:
extractor = CVExtractor()
# Pass only the resume text: `extracted_cv` is the whole result dict
# (filename, raw_text, text_length, success), while `extract` takes the CV
# text as a string.
cv_json = extractor.extract(extracted_cv["raw_text"])

In [None]:
# Pretty-print the structured CV returned by the extractor.
print(json.dumps(cv_json, indent=2))

{
  "name": "Muhammad Ghulam Jillani",
  "email": "m.g.jillani123@gmail.com",
  "phone": "+92-321-1174167, +92-321-1179584",
  "location": {
    "city": "New York",
    "countryCode": "US",
    "region": "United States"
  },
  "summary": "Senior Data Scientist and Machine Learning Engineer specializing in Generative AI, LLMs, and Autonomous AI Systems, with a proven record\nof transforming SaaS and PaaS platforms through innovative enterprise AI and agentic AI solutions. Expertise in optimizing workflows,\nstreamlining data pipelines, and building scalable AI architectures to solve complex business challenges. Recognized as a 24x LinkedIn Top\nVoice, Top 100 Global Kaggle Master, and KaggleX BIPOC Mentor, contributing to the NVIDIA Developer Program, Google Developer Group,\nand AWS AI Community. Demonstrated leadership in AI-driven product innovation, LLMOps strategies, and multimodal AI, delivering impactful\nresults with cutting-edge technologies across industries.",
  "work": [
   

## 2.3 Creating the vector embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple

class VectorStore:
    """Manage embeddings and vector similarity search with cosine similarity."""

    def __init__(self, embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Load the sentence-transformers model and record its output size.

        Args:
            embedding_model: Name of the SentenceTransformer model to load.
        """
        self.model = SentenceTransformer(embedding_model)
        self.dimension = self.model.get_sentence_embedding_dimension()

    def generate_embedding(self, text: str) -> List[float]:
        """Generate a normalized embedding for one text, as a list of floats."""
        embedding = self.model.encode(text, normalize_embeddings=True)
        return embedding.tolist()

    def generate_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate normalized embeddings for multiple texts (batches of 32)."""
        embeddings = self.model.encode(texts, normalize_embeddings=True, batch_size=32)
        return embeddings.tolist()

    def cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Calculate cosine similarity between two vectors.

        Args:
            vec1: First vector.
            vec2: Second vector of the same length.

        Returns:
            The cosine of the angle between the vectors, or 0.0 when either
            vector has zero length (the similarity is undefined there and
            would otherwise come out as NaN).
        """
        first = np.asarray(vec1, dtype=float)
        second = np.asarray(vec2, dtype=float)
        denominator = np.linalg.norm(first) * np.linalg.norm(second)
        if denominator == 0:
            return 0.0
        return float(np.dot(first, second) / denominator)

In [None]:
# Embedding helper used by the CV and job embedding cells below.
store = VectorStore()

In [None]:
# Embed the structured CV as JSON text. The "embedding" key is left out so
# that re-running this cell after the vector has been attached to `cv_json`
# still embeds the same text.
cv_json_string = json.dumps(
    {key: value for key, value in cv_json.items() if key != "embedding"}
)
cv_embeddings = store.generate_embedding(cv_json_string)

In [None]:
# Length of the CV embedding vector.
len(cv_embeddings)

384

In [None]:
# Attach the vector to the CV dict; JobRecommender.match_cv_to_jobs reads
# cv_data['embedding'].
cv_json["embedding"] = cv_embeddings

## 2.4 Job Recommender

In [None]:
class JobRecommender:
    """Match CVs with relevant jobs.

    The match score blends four signals: embedding similarity between the
    whole CV and the whole job, case-insensitive exact skill overlap,
    years-of-experience fit and education similarity.
    """

    # Weights of the four signals in the blended match score (sum to 1.0).
    SEMANTIC_WEIGHT = 0.45
    SKILLS_WEIGHT = 0.30
    EXPERIENCE_WEIGHT = 0.15
    EDUCATION_WEIGHT = 0.10

    def __init__(self, vector_store: Optional["VectorStore"] = None):
        """Create the recommender.

        Args:
            vector_store: Existing VectorStore to reuse. When omitted a new
                one is created, which loads the embedding model again.
        """
        self.vector_store = vector_store if vector_store is not None else VectorStore()

    def calculate_skills_match(self, cv_skills: List[str], job_skills: List[str]) -> Dict:
        """Calculate skill overlap.

        Skills are compared as whole strings after trimming and lower-casing.

        Returns:
            Dict with ``score`` (share of job skills found in the CV, 0 when
            the job lists none), ``matched_skills`` and ``missing_skills``
            (both lower-cased, in job order).
        """
        cv_skills_lower = {s.strip().lower() for s in (cv_skills or [])}
        job_skills_lower = [s.strip().lower() for s in (job_skills or [])]

        matched = [s for s in job_skills_lower if s in cv_skills_lower]
        missing = [s for s in job_skills_lower if s not in cv_skills_lower]

        score = len(matched) / len(job_skills_lower) if job_skills_lower else 0

        return {
            "score": round(score, 3),
            "matched_skills": matched,
            "missing_skills": missing
        }

    def calculate_experience_match(self, candidate_exp: float, required_years) -> float:
        """Calculate experience match score.

        Args:
            candidate_exp: Candidate's years of experience.
            required_years: ``[min, max]`` in years, where ``max`` may be
                ``None`` for an open-ended range. A bare number is treated as
                an open-ended minimum and ``None`` as "no requirement".

        Returns:
            1.0 inside the range; otherwise a score that falls linearly with
            the relative distance from the nearest bound, floored at 0.0.
        """
        if required_years is None:
            return 1.0
        if isinstance(required_years, (int, float)):
            min_exp, max_exp = required_years, None
        else:
            min_exp, max_exp = required_years
        min_exp = min_exp or 0
        # Negative experience is meaningless; clamping also guarantees
        # min_exp > 0 whenever the under-qualified branch divides by it.
        candidate_exp = max(0.0, float(candidate_exp or 0))

        if candidate_exp < min_exp:
            return max(0.0, 1 - (min_exp - candidate_exp) / min_exp)

        if max_exp is None or candidate_exp <= max_exp:
            return 1.0

        if max_exp <= 0:
            # Above a zero upper bound: the relative overshoot is unbounded.
            return 0.0
        return max(0.0, 1 - (candidate_exp - max_exp) / max_exp)

    def calculate_education_match(self, cv_education: List[Dict], required_edu: dict) -> float:
        """Calculate education match as an embedding similarity.

        The candidate's education entries and the job's education
        requirement are each rendered as text, embedded, and compared.

        Args:
            cv_education: Education dicts (institution, degree, field, gpa,
                startDate, endDate).
            required_edu: Dict with ``required_degree``,
                ``degree_restriction`` and ``required_field``; anything that
                is not a dict is treated as "no requirement".

        Returns:
            Cosine similarity of the two texts; 1.0 when the job states no
            education requirement and 0.0 when it does but the CV lists no
            education.
        """
        if not isinstance(required_edu, dict):
            required_edu = {}

        lines = []

        for edu in cv_education or []:
            institution = edu.get("institution", "")
            degree = edu.get("degree", "")
            field = edu.get("field", "")
            gpa = f"GPA: {edu['gpa']}" if edu.get("gpa") else ""
            start = edu.get("startDate") or ""
            end = edu.get("endDate") or ""
            date_range = f"{start} to {end}".strip() if start or end else ""

            # filter(None, ...) drops both empty strings and None values.
            line = " ".join(filter(None, [institution, degree, field, gpa, date_range]))
            if line:
                lines.append(line)

        candidate_education = "\n".join(lines)

        required_degree = required_edu.get("required_degree", "")
        restriction = required_edu.get("degree_restriction", "")
        required_field = required_edu.get("required_field", "")

        job_parts = []

        if required_degree:
            degree_part = f"Required Degree {required_degree}"
            if restriction:
                degree_part += f" (Restriction: {restriction})"
            job_parts.append(degree_part)

        if required_field:
            job_parts.append(f"Required Field {required_field}")

        job_education = " ".join(job_parts)

        # Comparing against an empty text would give an arbitrary score.
        if not job_education:
            return 1.0
        if not candidate_education:
            return 0.0

        candidate_education_embeddings = self.vector_store.generate_embedding(candidate_education)
        job_education_embeddings = self.vector_store.generate_embedding(job_education)

        return self.vector_store.cosine_similarity(candidate_education_embeddings, job_education_embeddings)

    def match_cv_to_jobs(self, cv_data: Dict, jobs: List[Dict], top_k: int = 10) -> List[Dict]:
        """Generate job recommendations for a CV.

        Args:
            cv_data: Structured CV; must contain ``embedding`` and may contain
                ``skills``, ``total_experience`` and ``education``.
            jobs: Job dicts; each must contain ``title``, ``company`` and
                ``embedding`` and may contain ``skills``,
                ``experience_years`` and ``education``.
            top_k: Maximum number of recommendations to return.

        Returns:
            Up to ``top_k`` recommendation dicts, best match first.
        """
        recommendations = []

        for job in jobs:
            semantic_score = self.vector_store.cosine_similarity(
                cv_data['embedding'],
                job['embedding']
            )

            skills_match = self.calculate_skills_match(
                cv_data.get('skills') or [],
                job.get('skills') or []
            )

            cv_years = cv_data.get("total_experience") or 0
            # A missing requirement is passed as None ("no requirement").
            required_years = job.get('experience_years')
            exp_score = self.calculate_experience_match(cv_years, required_years)

            edu_score = self.calculate_education_match(
                cv_data.get('education') or [],
                job.get('education') or {}
            )

            match_score = (
                self.SEMANTIC_WEIGHT * semantic_score +
                self.SKILLS_WEIGHT * skills_match['score'] +
                self.EXPERIENCE_WEIGHT * exp_score +
                self.EDUCATION_WEIGHT * edu_score
            )

            explanation = self._generate_explanation(
                match_score, skills_match, cv_years, required_years
            )

            recommendations.append({
                "job_title": job['title'],
                "company": job['company'],
                "match_score": round(match_score, 3),
                "matching_factors": {
                    "skills_match": round(skills_match['score'], 3),
                    "experience_match": round(exp_score, 3),
                    "education_match": round(edu_score, 3),
                    "semantic_similarity": round(semantic_score, 3)
                },
                "matched_skills": skills_match['matched_skills'],
                "missing_skills": skills_match['missing_skills'],
                "explanation": explanation
            })

        recommendations.sort(key=lambda x: x['match_score'], reverse=True)
        return recommendations[:top_k]

    def _generate_explanation(self, match_score: float, skills_match: Dict,
                            cv_years: int, required_years: int) -> str:
        """Generate a human-readable explanation for a match score.

        ``required_years`` is accepted for interface compatibility but is not
        used in the wording.
        """
        if match_score >= 0.9:
            return f"Excellent match with {len(skills_match['matched_skills'])} matching skills and {cv_years}+ years experience"
        elif match_score >= 0.7:
            return f"Strong match with {len(skills_match['matched_skills'])} core skills aligned"
        else:
            return f"Potential match but may need development in {len(skills_match['missing_skills'])} areas"

In [None]:
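# Dummy job postings for exercising the recommender.
# "experience_years" is [min, max] in years; "education" holds the required degree, its restriction and the required field.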
job_list = [
    {
        "title": "Software Developer",
        "company": "TechNova Solutions",
        "skills": ["JavaScript", "React", "Node.js", "Git", "Docker"],
        "education": {
            "required_degree": "Bachelor",
            "degree_restriction": "minimum",
            "required_field": "Computer Science"
        },
        "experience_years": [2, 4],
        "description": "Develop and maintain web applications, collaborate with the design team to build interactive user interfaces, and write clean, scalable code. Strong experience with modern JavaScript frameworks like React and Node.js is required."
    },
    {
        "title": "AI Engineer",
        "company": "TechNova Solutions",
        "skills": ["Chroma", "LLM", "Prompt Engineering", "Cloud","Git", "PyTorch", "Gemini"],
        "education": {
            "required_degree": "Bachelor",
            "degree_restriction": "minimum",
            "required_field": "Computer Science or related"
        },
        "experience_years": [4, 6],
        "description": "Build and train AI LLM models, collaborate with the team to deploy, write clean and scalable code. Strong experience with modern Python frameworks like numpy and pandas is required."
    },
    {
        "title": "Marketing Specialist",
        "company": "Creative Media Group",
        "skills": ["SEO", "Google Analytics", "Content Creation", "Social Media Marketing"],
        "education": {
            "required_degree": "Bachelor",
            "degree_restriction": "minimum",
            "required_field": "Marketing"
        },
        "experience_years": [1, 3],
        "description": "Assist in developing marketing strategies, create and optimize content for various platforms, monitor and analyze marketing data, and work closely with sales teams to enhance brand presence."
    },
    {
        "title": "Graphic Designer",
        "company": "Artify Studios",
        "skills": ["Adobe Photoshop", "Illustrator", "Creative Suite", "Typography"],
        "education": {
            "required_degree": "Bachelor",
            "degree_restriction": "minimum",
            "required_field": "Graphic Design"
        },
        "experience_years": [1, 5],
        "description": "Design visual concepts for web, print, and digital media. Work closely with clients and other creative teams to create engaging designs that meet project requirements."
    },
    {
        "title": "Customer Support Representative",
        "company": "QuickTech Solutions",
        "skills": ["Customer Service", "Problem-Solving", "Communication", "CRM Software"],
        "education": {
            "required_degree": "High School",
            "degree_restriction": "minimum",
            "required_field": ""
        },
        "experience_years": [0, 2],
        "description": "Provide excellent customer service via phone, email, and chat. Address customer inquiries, troubleshoot technical issues, and escalate concerns as necessary to ensure customer satisfaction."
    },
    {
        "title": "Project Manager",
        "company": "Global Enterprise Inc.",
        "skills": ["Project Management", "Budgeting", "Team Leadership", "Microsoft Office Suite"],
        "education": {
            "required_degree": "Bachelor",
            "degree_restriction": "minimum",
            "required_field": "Business Administration"
        },
        "experience_years": [3, 7],
        "description": "Oversee and manage various company projects from inception to completion. Ensure that projects are completed on time, within budget, and according to specifications. Coordinate between multiple teams and stakeholders."
    }
]



In [None]:
# Embed each job posting as JSON text and attach the vector to the job dict.
# The "embedding" key is left out of the text so that re-running this cell
# does not feed the previous vector back into the embedding.
for job in job_list:
  job_fields = {key: value for key, value in job.items() if key != "embedding"}
  job_json_string = json.dumps(job_fields)
  job_embeddings = store.generate_embedding(job_json_string)
  job['embedding'] = job_embeddings

In [None]:
# Skills extracted from the CV; match_cv_to_jobs compares these with each
# job's "skills" list.
cv_json['skills']

['Python',
 'Scikit-Learn',
 'TensorFlow',
 'Keras',
 'PyTorch',
 'NLTK',
 'Hugging Face Transformers',
 'OpenCV',
 'FastAPI',
 'Flask',
 'Streamlit',
 'Pandas',
 'NumPy',
 'Matplotlib',
 'Plotly',
 'Seaborn',
 'PySpark',
 'OpenAI API',
 'REST APIs',
 'GraphQL',
 'Neo4j',
 'Docker',
 'GitHub Actions',
 'CI/CD Pipelines',
 'SQL',
 'NoSQL',
 'Vector Databases (Pinecone, Faiss, Chroma DB)',
 'Machine Learning',
 'Deep Learning',
 'Generative AI',
 'LLMs (GPT, Gemini, LLaMA, Falcon,DeepSeek)',
 'Natural Language Processing (NLP)',
 'Time Series Analysis',
 'Model Deployment',
 'Prompt Engineering',
 'LangChain',
 'RAG (Retrieval-Augmented Generation)',
 'LlamaIndex',
 'LangGraph',
 'PhiData',
 'LangServer',
 'AutoGen',
 'LangSmith',
 'AutoML',
 'AI-Driven Process Automation',
 'Predictive Modeling',
 'Statistical Analysis',
 'Big Data Technologies',
 'Data Visualization',
 'AWS (SageMaker, Lambda, Bedrock, EC2)',
 'Azure (Azure ML, Azure AI, App Services)',
 'GCP (Vertex AI, Cloud Function

In [None]:
# Called without arguments, JobRecommender builds its own VectorStore,
# separate from `store`.
recommender = JobRecommender()

In [None]:
# Spot check: 1 year of experience against a minimum of 5 with no upper bound.
recommender.calculate_experience_match(1, [5, None])

0.19999999999999996

In [None]:
# Score every job in job_list against the CV; results come back sorted by
# match_score, best first (top_k defaults to 10).
recommender.match_cv_to_jobs(cv_data=cv_json, jobs = job_list)

{'required_degree': 'Bachelor', 'degree_restriction': 'minimum', 'required_field': 'Computer Science'}
{'required_degree': 'Bachelor', 'degree_restriction': 'minimum', 'required_field': 'Computer Science or related'}
{'required_degree': 'Bachelor', 'degree_restriction': 'minimum', 'required_field': 'Marketing'}
{'required_degree': 'Bachelor', 'degree_restriction': 'minimum', 'required_field': 'Graphic Design'}
{'required_degree': 'High School', 'degree_restriction': 'minimum', 'required_field': ''}
{'required_degree': 'Bachelor', 'degree_restriction': 'minimum', 'required_field': 'Business Administration'}


[{'job_title': 'AI Engineer',
  'company': 'TechNova Solutions',
  'match_score': 0.509,
  'matching_factors': {'skills_match': 0.286,
   'experience_match': 0.632,
   'education_match': 0.546,
   'semantic_similarity': 0.609},
  'matched_skills': ['prompt engineering', 'pytorch'],
  'missing_skills': ['chroma', 'llm', 'cloud', 'git', 'gemini'],
  'explanation': 'Potential match but may need development in 5 areas'},
 {'job_title': 'Project Manager',
  'company': 'Global Enterprise Inc.',
  'match_score': 0.41,
  'matching_factors': {'skills_match': 0.25,
   'experience_match': 0.827,
   'education_match': 0.511,
   'semantic_similarity': 0.355},
  'matched_skills': ['team leadership'],
  'missing_skills': ['project management',
   'budgeting',
   'microsoft office suite'],
  'explanation': 'Potential match but may need development in 3 areas'},
 {'job_title': 'Software Developer',
  'company': 'TechNova Solutions',
  'match_score': 0.277,
  'matching_factors': {'skills_match': 0.2,
  