In [11]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,   
)

In [12]:
# Data ingestion via PyPDFLoader
print("PyPDFLoader")
try:
    py_pdf_loader = PyPDFLoader("data/MLE_CV_2025.pdf")
    py_pdf_docs = py_pdf_loader.load()
    print(py_pdf_docs)
    print(f" Loaded {len(py_pdf_docs)} pages from PDF document")
    print(f" Page 1 content first 100 tokens: {py_pdf_docs[0].page_content[:100]}")
    print(f" Meta data: {py_pdf_docs[0].metadata}")
except Exception as e:
    print(f"Error: {e}")

PyPDFLoader
Error: File path data/MLE_CV_2025.pdf is not a valid file or url


In [13]:
# Data ingestion via PyMuPDF Loader
print("PyMuPDFLoader")
try:
    py_mupdf_loader = PyMuPDFLoader("data/MLE_CV_2025.pdf")
    py_mupdf_loader = py_pdf_loader.load()
    print(py_mupdf_loader)
    print(f" Loaded {len(py_mupdf_loader)} pages from PDF document")
    print(f" Page 1 content first 100 tokens: {py_mupdf_loader[0].page_content[:100]}")
    print(f" Meta data: {py_mupdf_loader[0].metadata}")
except Exception as e:
    print(f"Error: {e}")

PyMuPDFLoader
Error: File path data/MLE_CV_2025.pdf is not a valid file or url


### Comparision between both loaders
- Not much differences in our use case, our PDF file is too small and highly structured
- PDF file contains mostly text data(non images)
- For PDF that are standardized PDFloader will suffice


In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict
from langchain.docstore.document import Document

In [15]:
class PDFProcessor:
    """Advance PDF parsing"""
    def __init__(self, chunk_size=500, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=["\n\n", "\n", " ", ""])
    
    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Process a PDF and return a list of chunked documents."""
        
        # Load the PDF
        loader = PyPDFLoader(pdf_path)
        
        # Load the document and split it into chunks in one step
        documents = loader.load_and_split(text_splitter=self.text_splitter)
        
        return documents

In [16]:
preprocessor = PDFProcessor()
pdf_document = preprocessor.process_pdf(pdf_path="../data/MLE_CV_2025.pdf")

In [17]:
pdf_document

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-06-23T11:08:34+00:00', 'title': 'MLE_CV_2025', 'moddate': '2025-06-23T11:08:33+00:00', 'keywords': 'DAGc8XXwA3s,BAFdjIIJdRo,0', 'author': 'yong quan', 'source': '../data/MLE_CV_2025.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='AI Singapore – AI Apprentice | Jan 2024 - Oct 2024\nEnhanced clinician efficiency by 15% through an end-to-end ML pipeline (20 data sources, 4 engineers) that enabled\nprioritization of low-confidence daily predictions.\nEliminated the need for clinicians to manually extract symptoms from medical reports, enhancing efficiency by\ncollaborating on fine-tuning a biomedical BERT LLM that improved feature F1 score by 25%.'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-06-23T11:08:34+00:00', 'title': 'MLE_CV_2025', 'moddate': '2025-06-23T11:08:33+00:00', 'keywords': 'DAGc8XXwA3s,BAFdjIIJdRo,0', 'author': 'yong quan', 'source

In [18]:
class CVParser:
    def __init__(self, text: str):
        self.text = text.lower()  # normalize for matching

    def parse(self) -> Dict:
        return {
            "summary": self.extract_section(["summary", "profile", "professional summary", "about me"]),
            "education": self.extract_section(["education", "academic", "academic background", "certification", "qualifications", "degrees", \
                "education and certification"]),
            "experience": self.extract_section(["experience", "work history", "employment", "career", "professional experience"]),
            "skills": self.extract_section(["skills", "technologies", "technical skills", "competencies"]),
            "projects": self.extract_section(["projects", "project", "portfolio", "achievements"]),
        }

    def extract_section(self, headers: list) -> str:
        """Return text under a section header until the next header"""
        lines = self.text.splitlines()
        section_lines = []
        current_section = None

        for line in lines:
            clean_line = line.strip()
            if not clean_line:
                continue

            # Check if the line matches a section header
            if any(h.lower() in clean_line for h in headers):
                current_section = True
                continue

            # Append lines under current section
            if current_section:
                # Stop if we hit another known section header
                if any(h.lower() in clean_line for h in sum([v for v in [headers]], [])):
                    break
                section_lines.append(line)

        return "\n".join(section_lines).strip()


In [19]:
processor = PDFProcessor()
docs = processor.process_pdf("../data/MLE_CV_2025.pdf")
full_text = " ".join([doc.page_content for doc in docs])
parser = CVParser(full_text)
metadata = parser.parse()


In [20]:
metadata

{'summary': 'ai/ml engineer with a unique blend of expertise in python, deep learning, llms (rag), and mlops, coupled with a strong\nbackground in financial compliance and product analytics. i leverage deep analytical rigor and practical experience to develop and deploy data-driven solutions, particularly in regulated environments, as demonstrated by a 15% improvement in clinical\ndecision-making efficiency through end-to-end ml pipelines.\nrag-powered chatbot for pdf search\ndesigned and deployed a rag llm chatbot on streamlit cloud, saving 70% of user time with instant pdf/web\nanswers, expanding search by 2x with arxiv, wikipedia, duckduckgo tooling. answers, expanding search by 2x with arxiv, wikipedia, duckduckgo tooling.\nutilized langchain, groq, pypdfloader, faiss vectorstore, and huggingface embeddings for efficient querying.\nnetwork security phishing project end to end deployment\nimplemented a traffic classifier with a 90% f1-score, which reduced analyst time by 50% for pri

In [21]:
from typing import Dict, List

class JDSectionExtractor:
    """Extract sections from a job description without regex"""

    def __init__(self, text: str):
        self.text = text

        # Section groups with flexible headers
        self.section_groups = {
            "company_info": ["about us", "company", "who we are"],
            "role": ["your role", "role", "position overview", "job purpose", "what you will be doing"],
            "responsibilities": ["key responsibilities", "responsibilities", "duties", "what you'll do"],
            "qualifications": ["qualifications", "requirements", "who you are", "skills"],
            "benefits": ["what we offer", "perks", "compensation", "why join us", "benefits"],
        }

    def extract_sections(self) -> Dict[str, str]:
        sections: Dict[str, List[str]] = {}
        current_section = None

        # Split JD into lines
        lines = self.text.splitlines()

        for line in lines:
            clean_line = line.strip()

            if not clean_line:  # skip empty lines
                continue

            # Check if the line matches any known section header
            lowered = clean_line.lower()
            found_section = None
            for section_name, headers in self.section_groups.items():
                if any(lowered.startswith(h) for h in headers):
                    found_section = section_name
                    break

            # If we hit a header, start a new section
            if found_section:
                current_section = found_section
                sections[current_section] = []
            elif current_section:
                # Append line under the current section
                sections[current_section].append(clean_line)

        # Join lines back into paragraphs
        return {k: "\n".join(v).strip() for k, v in sections.items()}


In [22]:
# Sample via LinkedIn

jd_text = """About the job
About us 

RE-LIVE is a next-generation real estate insights platform focused on streamlining and modernizing the valuation process for multiple commercial and residential property types. Our mission is to enable access to property intelligence through clean UI, data integrations, and dynamic reporting, with a strong emphasis on usability and accuracy. This role will contribute directly to transforming how real estate is evaluated and reported.


Your Role

As our Machine Learning Engineer, you'll spearhead the development of predictive models for real estate pricing, incorporating a rich variety of structured and unstructured data sources. You’ll work across the full ML lifecycle—from data acquisition and preprocessing to modeling, evaluation, and front-end visualization.


Key Responsibilities

Develop and deploy machine learning models to predict real estate prices and investment potential.
Identify and transform relevant signals from heterogeneous datasets to support accurate property value predictions.
Conduct NLP-based analysis of textual data (e.g. user reviews, market reports, real estate articles) to enrich model inputs.
Collect, clean, merge, and manage large and diverse datasets from APIs, web scraping, public sources, and commercial databases.
Design and implement interactive dashboards to visualize trends, model predictions, and insights in a user-friendly manner.
Collaborate with valuation experts and product designers to integrate insights into our platform.


Qualifications

Bachelor's or Master's degree in Computer Science, Data Science, or related field.
Strong academic background or demonstrable track record of high-impact, self-driven work in data science or machine learning.
Strong proficiency in Python and data science libraries like Pandas, Scikit-learn, NumPy.
Experience with deep learning frameworks (e.g. PyTorch, TensorFlow) for regression and NLP tasks.
Hands-on experience building interactive dashboards using Dash, Plotly, or Streamlit.
Familiarity with geospatial data and tools like GeoPandas, Shapely, or Kepler.gl is a plus.
Bonus points for knowledge of PostgreSQL/PostGIS, Elasticsearch, or LLMs for contextual insights.


What We Offer

Opportunity to build and scale a product that will redefine real estate investing in Asia and beyond.
Flexible working hours.
Collaborative, innovation-driven environment with direct access to decision-makers.
Competitive compensation and performance incentives."""

In [23]:
extractor = JDSectionExtractor(jd_text)
jd_sections = extractor.extract_sections()

for title, content in jd_sections.items():
    print(f"--- {title.upper()} ---\n{content}\n")

--- COMPANY_INFO ---
RE-LIVE is a next-generation real estate insights platform focused on streamlining and modernizing the valuation process for multiple commercial and residential property types. Our mission is to enable access to property intelligence through clean UI, data integrations, and dynamic reporting, with a strong emphasis on usability and accuracy. This role will contribute directly to transforming how real estate is evaluated and reported.

--- ROLE ---
As our Machine Learning Engineer, you'll spearhead the development of predictive models for real estate pricing, incorporating a rich variety of structured and unstructured data sources. You’ll work across the full ML lifecycle—from data acquisition and preprocessing to modeling, evaluation, and front-end visualization.

--- RESPONSIBILITIES ---
Develop and deploy machine learning models to predict real estate prices and investment potential.
Identify and transform relevant signals from heterogeneous datasets to support a

### Pros and Cons of parsers
#### Parsing of CV
- Given a structured CV, parsing will work great however if the CV has different style and canvas, manual parsing will bound to fail
- Move on to use LLM to parse the whole CV into a JSON output

#### Parsing of JD
- Works well as expected, as JD typcially only consists highly smiliar make up terms

#### Summary
- In this use case, CVS are typically 1-2 pages which will usually not hit token limit, therefore instead of chunking we can actually let LLM handles the full text in one prompt
- Have to carefully engineer good prompts to extract necessarily information headers