In [1]:
from langchain_ollama import OllamaLLM
import json
import sys
import os
import pprint
import re
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
def check_project_root():
    """Confirm that the parent of the current working directory is on ``sys.path``.

    Prints a confirmation line when the directory is present and returns ``None``.

    Raises:
        ImportError: if the absolute path of the parent directory is not an
            entry of ``sys.path``.
    """
    parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
    if parent_dir in sys.path:
        print(f"Project root '{parent_dir}' successfully added to Python path")
        return
    raise ImportError(f"Project root '{parent_dir}' not found in Python path")

# Run the path check immediately; a failure is reported as text so the
# remaining cells can still be executed.
try:
    check_project_root()
except ImportError as path_error:
    print(f"Error: {path_error}")

Project root 'c:\Users\yq198\Desktop\DANNY_AI\CV_Agent' successfully added to Python path


In [3]:
# Ollama model tags used in this notebook.
llm_deepseek_model, llm_qwen_model, llm_gemma3_model = (
    "deepseek-r1:latest",
    "qwen3:latest",
    "gemma3:270m",
)

# One Ollama client per model tag, each created with temperature=0.5.
llm_deepseek, llm_qwen, llm_gemma3 = (
    OllamaLLM(model=model_tag, temperature=0.5)
    for model_tag in (llm_deepseek_model, llm_qwen_model, llm_gemma3_model)
)

In [4]:
# NOTE(review): this import presumably relies on the sys.path entry appended in
# the first cell, so it has to run after that cell.
from src.document_ingestion.pdf_processor import PDFProcessor

# Process the CV PDF and join the page_content of every returned document
# into a single space-separated string.
processor = PDFProcessor()
docs = processor.process_pdf("../data/MLE_CV_2025.pdf")
cv_text = " ".join(doc.page_content for doc in docs)

In [5]:
# Show the extracted CV text followed by a separator line.
print(cv_text, "-------", sep="\n")

AI Singapore – AI Apprentice | Jan 2024 - Oct 2024
Enhanced clinician efficiency by 15% through an end-to-end ML pipeline (20 data sources, 4 engineers) that enabled
prioritization of low-confidence daily predictions.
Eliminated the need for clinicians to manually extract symptoms from medical reports, enhancing efficiency by
collaborating on fine-tuning a biomedical BERT LLM that improved feature F1 score by 25%. collaborating on fine-tuning a biomedical BERT LLM that improved feature F1 score by 25%.
Developed a customized date aggregation module using the Least Squares method to handle irregularities in time-
series data, resulting in a 10% improvement in explainability boosting model F1-score.
Containerized the ML pipeline with Docker and integrated FastAPI for testing, ensuring OS-agnostic deployment and
robust validation across Windows, Linux, and Mac. robust validation across Windows, Linux, and Mac.
Mentored 8 junior apprentices in Neural Networks and NLP, fostering collaborati

In [6]:
# Prompt that asks a model to turn the raw CV text into structured JSON.
# This is an f-string: literal JSON braces are doubled ({{ }}) and {cv_text} is
# interpolated when this cell runs, so re-run the cell whenever cv_text changes.
# The schema fence is closed explicitly so the guidelines and the CV text are
# not read as part of the example code block.
cv_prompt = f"""
You are a CV parsing assistant.

Extract the following fields from this CV text and return ONLY valid JSON inside a fenced code block (```json ... ```).
Do not output anything else.
Do not come out with your own information, all the information must only be extracted from the source itself.

Schema:
```json
{{
  "summary": "As per below guidelines",
  "education": [
    {{
      "institution": "As per below guidelines",
      "degree": "As per below guidelines",
      "start_date": "mm/yyyy or yyyy",
      "end_date": "mm/yyyy or yyyy"
    }}
  ],
  "experience": [
    {{
      "job_title": "As per below guidelines",
      "company": "As per below guidelines (leave empty if freelance/self-employed)",
      "start_date": "mm/yyyy or yyyy or yyyy/mm",
      "end_date": "mm/yyyy or yyyy or yyyy/mm",
      "description": "As per below guidelines"
    }}
  ],
  "skills": ["As per below guidelines"],
  "projects": [
    {{
      "project_title": "As per below guidelines",
      "description": "As per below guidelines"
    }}
  ]
}}
```

Use the following guidelines:

1. "summary": include text corresponding to ["summary", "profile", "professional summary", "about me"]
2. "education": include text corresponding to ["education", "academic", "certification", "degrees"]
3. "experience": include text corresponding to ["experience", "work history", "employment", "career", "professional experience"]
  - If a role is freelance/self-employed:
    - output the entire title in "job_title"
    - leave "company" as an empty string ""
  - Otherwise "company" should always have an input
4. "skills": return full set of skills which includes text corresponding to ["skills", "technologies", "technical skills", "competencies", "tools", "Tools & Technologies"] or similar headings
5. "projects": return full set of projects if any which includes text corresponding to ["projects", "project", "portfolio"]

Return fully populated JSON. Do not use ellipses `...`. If a section is missing, return [] or "".


Here is the CV text:

"{cv_text}"

"""

In [7]:
# gemma3: send the CV prompt, then show the raw completion in pprint's
# wrapped-string layout so the text can be inspected.
cv_parsed_text_gemma = llm_gemma3.invoke(cv_prompt)
print(pprint.pformat(cv_parsed_text_gemma))


('```json\n'
 '{\n'
 '  "summary": "As per below guidelines",\n'
 '  "education": [\n'
 '  {\n'
 '    "institution": "As per below guidelines",\n'
 '    "degree": "As per below guidelines",\n'
 '    "start_date": "mm/yyyy or yyyy",\n'
 '    "end_date": "mm/yyyy or yyyy"\n'
 '  },\n'
 '  {\n'
 '    "institution": "As per below guidelines",\n'
 '    "degree": "As per below guidelines",\n'
 '    "start_date": "mm/yyyy or yyyy",\n'
 '    "end_date": "mm/yyyy or yyyy"\n'
 '  },\n'
 '  {\n'
 '    "institution": "As per below guidelines",\n'
 '    "degree": "As per below guidelines",\n'
 '    "start_date": "mm/yyyy or yyyy",\n'
 '    "end_date": "mm/yyyy or yyyy"\n'
 '  },\n'
 '  {\n'
 '    "institution": "As per below guidelines",\n'
 '    "degree": "As per below guidelines",\n'
 '    "start_date": "mm/yyyy or yyyy",\n'
 '    "end_date": "mm/yyyy or yyyy"\n'
 '  },\n'
 '  {\n'
 '    "institution": "As per below guidelines",\n'
 '    "degree": "As per below guidelines",\n'
 '    "start_date":

In [8]:
# deepseek-r1: send the CV prompt, then show the raw completion in pprint's
# wrapped-string layout so the text can be inspected.
cv_parsed_text_deepseek = llm_deepseek.invoke(cv_prompt)
print(pprint.pformat(cv_parsed_text_deepseek))


('<think>\n'
 "Okay, let's tackle this CV extraction problem step by step. The user wants "
 'me to extract specific fields from the given CV text and return them in a '
 'JSON format following their schema exactly.\n'
 '\n'
 'First, I need to understand all sections of the provided CV text that '
 'correspond to each field in the schema. For "summary", it seems there are '
 'multiple mentions like ["professional summary", "about me"] - specifically, '
 'phrases like "AI/ML Engineer..." appear at different points but essentially '
 "describe the person's profile.\n"
 '\n'
 "Looking at education: There's a clear section with University of London and "
 'Corporate Finance Institute mentioned. The degree is Bachelor’s Degree in '
 'Banking and Finance from University of London.\n'
 '\n'
 'For experience, several roles are listed:\n'
 '1. AI Singapore - Associate AI Engineer\n'
 '2. CriAT (Credit Risk Analytics Startup) - Product Analyst\n'
 '3. Various Top Tier Banks & Deloitte - Private 

In [9]:
def clean_text_into_json(raw_input):
    """Extract the JSON object from a raw LLM completion and parse it.

    Steps:
      1. Drop reasoning text wrapped in ``<think>...</think>``. If a closing
         ``</think>`` remains without an opening tag, everything before it is
         discarded as well.
      2. Prefer the contents of a ```json fenced block (tag matched
         case-insensitively); otherwise take the text from the first ``{`` to
         the last ``}``.
      3. Parse with ``json.loads``. If that fails, retry by decoding a single
         JSON value that starts at the candidate's opening ``{`` so trailing
         prose containing braces does not break parsing.

    Args:
        raw_input: Completion text returned by the model.

    Returns:
        The parsed JSON object, or ``None`` when the input is not a non-empty
        string or no valid JSON object can be recovered. Failures are reported
        with ``print`` rather than raised.
    """
    if not isinstance(raw_input, str) or not raw_input.strip():
        print("Failed to parse JSON: model output is empty or not a string")
        return None

    # 1. Remove <think> blocks if they exist
    cleaned_text = re.sub(r"<think>.*?</think>", "", raw_input, flags=re.DOTALL)
    # A closing tag with no matching opening tag: keep only what follows it.
    _, dangling_close, after_reasoning = cleaned_text.rpartition("</think>")
    if dangling_close:
        cleaned_text = after_reasoning

    # 2. Extract the JSON inside ```json ... ```
    match = re.search(
        r"```json\s*(\{.*?\})\s*```", cleaned_text, flags=re.DOTALL | re.IGNORECASE
    )
    if match:
        json_text = match.group(1)
        start = match.start(1)
    else:
        # fallback: extract from first { to last }
        start = cleaned_text.find("{")
        if start == -1:
            # str.find returns -1 when absent; never slice with that sentinel.
            print("Failed to parse JSON: no '{' found in model output")
            print("Raw JSON text:")
            print(cleaned_text)
            return None
        end = cleaned_text.rfind("}")
        json_text = cleaned_text[start:end + 1] if end > start else cleaned_text[start:]

    # 3. Load JSON
    try:
        cv_json = json.loads(json_text)
    except json.JSONDecodeError as e:
        try:
            # Decode exactly one JSON value beginning at the opening brace and
            # ignore whatever follows it. Only this one position is tried so a
            # truncated payload is never replaced by one of its nested objects.
            cv_json, _ = json.JSONDecoder().raw_decode(cleaned_text, start)
        except json.JSONDecodeError:
            print("Failed to parse JSON:", e)
            print("Raw JSON text:")
            print(json_text)
            cv_json = None

    return cv_json


In [10]:
# Parse the DeepSeek completion; the bare call lets the notebook display the
# value returned by clean_text_into_json (None when parsing fails).
clean_text_into_json(cv_parsed_text_deepseek)

{'summary': 'AI/ML Engineer with a unique blend of expertise in Python, Deep Learning, LLMs (RAG), and MLOps, coupled with a strong background in financial compliance and product analytics. I leverage deep analytical rigor and practical experience to develop and deploy data-driven solutions, particularly in regulated environments, as demonstrated by a 15% improvement in clinical decision-making efficiency through end-to-end ML pipelines.',
 'education': [{'institution': 'University of London',
   'degree': 'Bachelor’s Degree in Banking and Finance'}],
 'experience': [{'job_title': 'AI/ML & Analytics: Associate AI Engineer (AI Singapore)',
   'company': '',
   'start_date': 'Jan 2024',
   'end_date': 'Oct 2024',
   'description': ''},
  {'job_title': 'RAG-Powered Chatbot for PDF Search, Network Security Phishing Project End to End Deployment (traffic classifier with F1-score), NLP Kindle Review Sentiment Classification (engineered pipeline and achieved accuracy)',
   'company': '',
   '

In [11]:
# qwen3: send the CV prompt, then show the raw completion in pprint's
# wrapped-string layout so the text can be inspected.
cv_parsed_text_qwen = llm_qwen.invoke(cv_prompt)
print(pprint.pformat(cv_parsed_text_qwen))


('<think>\n'
 "Okay, let me start by carefully reading through the user's query and the "
 'provided CV text. The user wants me to extract specific fields into a JSON '
 'structure based on the given schema and guidelines. \n'
 '\n'
 'First, I need to identify the sections in the CV text that correspond to '
 'each field. The summary is mentioned under "PROFESSIONAL SUMMARY" and the '
 'text there is the summary. \n'
 '\n'
 'For education, the CV mentions "University of London: Bachelor’s Degree in '
 'Banking and Finance". I need to extract the institution, degree, start and '
 "end dates. Wait, the dates aren't explicitly mentioned here. The user's "
 "schema requires start_date and end_date, but the CV text doesn't have those. "
 "Maybe there's a mistake here. Let me check again. The education section in "
 'the CV text is under "EDUCATION AND CERTIFICATIONS", but the only entry '
 "there is the University of London degree. There's no mention of start or end "
 "dates. So perhaps th

In [12]:
# Parse the Qwen completion; the bare call lets the notebook display the
# value returned by clean_text_into_json (None when parsing fails).
clean_text_into_json(cv_parsed_text_qwen)

{'summary': 'AI/ML Engineer with a unique blend of expertise in Python, Deep Learning, LLMs (RAG), and MLOps, coupled with a strong background in financial compliance and product analytics. I leverage deep analytical rigor and practical experience to develop and deploy data-driven solutions, particularly in regulated environments, as demonstrated by a 15% improvement in clinical decision-making efficiency through end-to-end ML pipelines.',
 'education': [{'institution': 'University of London',
   'degree': 'Bachelor’s Degree in Banking and Finance',
   'start_date': '',
   'end_date': ''}],
 'experience': [{'job_title': 'AI Apprentice',
   'company': 'AI Singapore',
   'start_date': 'Jan 2024',
   'end_date': 'Oct 2024',
   'description': 'Enhanced clinician efficiency by 15% through an end-to-end ML pipeline (20 data sources, 4 engineers) that enabled prioritization of low-confidence daily predictions. Eliminated the need for clinicians to manually extract symptoms from medical report

### Challenges faced
- Took a lot of time crafting the right prompts for the LLMs to ingest the CV and output the appropriate fields as JSON.
- Tried several open-source models; so far qwen seems to be the best based on its JSON outputs.
- While deepseek performs well, its output always includes its behind-the-scenes reasoning, so a customised regex had to be created to remove the reasoning block.
- Tried various temperatures aside from the default; at 0.5 qwen again seems to be the best, managing to capture all the information.

### Moving on to crafting a prompt for the JD (we can start with LinkedIn)

In [13]:

# Sample job-description text used as the input when building jd_prompt below.
sample_linkedin_text= """About the job
About us 

RE-LIVE is a next-generation real estate insights platform focused on streamlining and modernizing the valuation process for multiple commercial and residential property types. Our mission is to enable access to property intelligence through clean UI, data integrations, and dynamic reporting, with a strong emphasis on usability and accuracy. This role will contribute directly to transforming how real estate is evaluated and reported.


Your Role

As our Machine Learning Engineer, you'll spearhead the development of predictive models for real estate pricing, incorporating a rich variety of structured and unstructured data sources. You’ll work across the full ML lifecycle—from data acquisition and preprocessing to modeling, evaluation, and front-end visualization.


Key Responsibilities

Develop and deploy machine learning models to predict real estate prices and investment potential.
Identify and transform relevant signals from heterogeneous datasets to support accurate property value predictions.
Conduct NLP-based analysis of textual data (e.g. user reviews, market reports, real estate articles) to enrich model inputs.
Collect, clean, merge, and manage large and diverse datasets from APIs, web scraping, public sources, and commercial databases.
Design and implement interactive dashboards to visualize trends, model predictions, and insights in a user-friendly manner.
Collaborate with valuation experts and product designers to integrate insights into our platform.


Qualifications

Bachelor's or Master's degree in Computer Science, Data Science, or related field.
Strong academic background or demonstrable track record of high-impact, self-driven work in data science or machine learning.
Strong proficiency in Python and data science libraries like Pandas, Scikit-learn, NumPy.
Experience with deep learning frameworks (e.g. PyTorch, TensorFlow) for regression and NLP tasks.
Hands-on experience building interactive dashboards using Dash, Plotly, or Streamlit.
Familiarity with geospatial data and tools like GeoPandas, Shapely, or Kepler.gl is a plus.
Bonus points for knowledge of PostgreSQL/PostGIS, Elasticsearch, or LLMs for contextual insights.


What We Offer

Opportunity to build and scale a product that will redefine real estate investing in Asia and beyond.
Flexible working hours.
Collaborative, innovation-driven environment with direct access to decision-makers.
Competitive compensation and performance incentives.
"""

In [14]:
# Prompt that asks a model to turn a job description into structured JSON.
# This is an f-string: literal JSON braces are doubled ({{ }}) and
# {sample_linkedin_text} is interpolated when this cell runs.
# The schema fence is closed explicitly so the guidelines and the JD text are
# not read as part of the example code block.
jd_prompt = f"""
You are a Job Description (JD) parsing assistant.

Extract the following fields from this JD text and return ONLY valid JSON inside a fenced code block (```json ... ```).
Do not output anything else.

Schema:
```json
{{
  "job_title": "As per below guidelines",
  "department": "As per below guidelines (leave empty if not specified)",
  "location": "As per below guidelines (leave empty if not specified)",
  "employment_type": "As per below guidelines (Full-time, Part-time, Contract, etc.)",
  "experience_required": "As per below guidelines (years of experience or descriptive requirements)",
  "responsibilities": ["As per below guidelines"],
  "skills_required": ["As per below guidelines (programming languages, tools, frameworks, technologies)"],
  "qualifications": ["As per below guidelines (required degrees, certifications)"],
  "preferred_qualifications": ["As per below guidelines (optional or bonus skills)"],
  "other_notes": "As per below guidelines (benefits, company culture, perks, etc.)"
}}
```


Use the following guidelines:
1. job_title: Extract the main role being hired for, e.g., "Machine Learning Engineer".
2. department: Include the team or department if explicitly mentioned, otherwise leave as empty string "".
3. location: Extract physical or remote location if specified, otherwise leave empty.
4. employment_type: Identify if the role is Full-time, Part-time, Contract, Internship, etc.
5. experience_required: Include years of experience or descriptive requirements (e.g., "2+ years of experience in data science or machine learning").
6. responsibilities: Extract all key tasks, duties, or responsibilities mentioned under sections like "Responsibilities", "Key Responsibilities", "Role Overview". Each responsibility should be a separate string in the array.
7. skills_required: Extract all technical skills, programming languages, tools, frameworks, or technologies explicitly mentioned in the JD. Each skill should be a separate string.
8. qualifications: Include required degrees, certifications, or educational qualifications. Each should be a separate string.
9. preferred_qualifications: Include optional or bonus qualifications or skills. Each should be a separate string.
10. other_notes: Include any additional information like benefits, company culture, perks, flexible working arrangements, or miscellaneous notes not captured in other fields.

Return fully populated JSON. Do not use ellipses `...`. If a section is missing, return [] for arrays or "" for string fields.

Here is the JD text:

"{sample_linkedin_text}"

"""

In [None]:
# Send the JD prompt to the qwen3 client and keep the raw completion for parsing.
jd_parsed_text_qwen= llm_qwen.invoke(jd_prompt)

In [None]:
# Parse the Qwen JD completion; the bare call lets the notebook display the
# value returned by clean_text_into_json (None when parsing fails).
clean_text_into_json(jd_parsed_text_qwen)

{'experience_required': 'Strong academic background or demonstrable track record of high-impact, self-driven work in data science or machine learning.',
 'qualifications': "Bachelor's or Master's degree in Computer Science, Data Science, or related field.",
 'skills_required': 'Strong proficiency in Python and data science libraries like Pandas, Scikit-learn, NumPy. Experience with deep learning frameworks (e.g. PyTorch, TensorFlow) for regression and NLP tasks. Hands-on experience building interactive dashboards using Dash, Plotly, or Streamlit.',
 'preferred_qualifications': 'Familiarity with geospatial data and tools like GeoPandas, Shapely, or Kepler.gl. Knowledge of PostgreSQL/PostGIS, Elasticsearch, or LLMs for contextual insights.'}