# Load the Dataset

In [1]:
import glob
import json
import os
import re
from datetime import datetime

import pdfplumber

In [2]:
pdf_path = '../data/Richard Hendriks.pdf' # as agreed in whatsapp

pdf_text = ''
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:
            pdf_text += page_text + '\n'

print(pdf_text)

4/19/25, 10:19 PM Richard Hendriks
richard.hendriks@mail.com
Richard Hendriks
http://richardhendricks.example.com
San Francisco, California
SUMMARY
Richard hails from Tulsa. He has earned degrees from the University of Oklahoma and Stanford. (Go Sooners and
Cardinal!) Before starting Pied Piper, he worked for Hooli as a part time software developer. While his work focuses on
applied information theory, mostly optimizing lossless compression schema of both the length-limited and adaptive
variants, his non-work interests range widely, everything from quantum computing to chaos theory. He could tell you about
it, but THAT would NOT be a “length-limited” conversation!
SKILLS AND TECH
Web Development: HTML, CSS, Javascript.
Compression: Mpeg, MP4, GIF.
EXPERIENCE
Pied Piper CEO/President Dec 2013 — Dec 2014
Build an algorithm for artist to detect if their music was violating copy right infringement laws
Successfully won Techcrunch Disrupt
Optimized an algorithm that holds the current world 

# Break the text into sections by splitting on the section headings
## An LLM could do this too, but it would be too slow for the demo

In [3]:
# Extract the 'SUMMARY' section from the resume text by grabbing everything from the beginning up to 'SKILLS AND TECH'
basics_extracted = pdf_text.split('SKILLS AND TECH')[0]
print(basics_extracted)

4/19/25, 10:19 PM Richard Hendriks
richard.hendriks@mail.com
Richard Hendriks
http://richardhendricks.example.com
San Francisco, California
SUMMARY
Richard hails from Tulsa. He has earned degrees from the University of Oklahoma and Stanford. (Go Sooners and
Cardinal!) Before starting Pied Piper, he worked for Hooli as a part time software developer. While his work focuses on
applied information theory, mostly optimizing lossless compression schema of both the length-limited and adaptive
variants, his non-work interests range widely, everything from quantum computing to chaos theory. He could tell you about
it, but THAT would NOT be a “length-limited” conversation!



In [4]:
# Extract the 'SKILLS AND TECH' section from the resume text by grabbing everything from 'SKILLS AND TECH' to 'EXPERIENCE'
skills_extracted = pdf_text.split('EXPERIENCE')[0].split('SKILLS AND TECH')[1]
print(skills_extracted)


Web Development: HTML, CSS, Javascript.
Compression: Mpeg, MP4, GIF.



In [5]:
# Extract the 'EXPERIENCE' section from the resume text by grabbing everything from 'EXPERIENCE' to 'VOLUNTEERING'
work_extracted = pdf_text.split('VOLUNTEERING')[0].split('EXPERIENCE')[1]
print(work_extracted)


Pied Piper CEO/President Dec 2013 — Dec 2014
Build an algorithm for artist to detect if their music was violating copy right infringement laws
Successfully won Techcrunch Disrupt
Optimized an algorithm that holds the current world record for Weisman Scores



In [6]:
# Extract the 'PROJECTS' section from the resume text by grabbing everything from 'PROJECTS' to 'EDUCATION'
projects_extracted = pdf_text.split('EDUCATION')[0].split('PROJECTS')[1]
print(projects_extracted)


Miss Direction: Won award at AIHacks 2016. Built by all women team of newbie programmers. Using modern
technologies such as GoogleMaps, Chrome Extension and Javascript.



In [7]:
# Extract the 'EDUCATION' section from the resume text by grabbing everything from 'EDUCATION' to 'PUBLICATIONS'
education_extracted = pdf_text.split('PUBLICATIONS')[0].split('EDUCATION')[1]
print(education_extracted)


University of Oklahoma Jun 2011 — Jan 2014
Bachelor - Information Technology, GPA: 4.0



# Load the Model

In [8]:
!pip install --upgrade \
  langchain \
  langchain-openai \
  openai \
  pydantic==1.10.13

Collecting langchain
  Using cached langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.3.14-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain)
  Using cached langchain_core-0.3.54-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Using cached langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
INFO: pip is looking at multiple versions of langchain to determine which version is compatible with other requirements. This could take a while.
Collecting langchain
  Using cached langchain-0.3.22-py3-none-any.whl.metadata (7.8 kB)
  Using cached langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
  Using cached langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
  Using cached langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
  Using cached langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
  Using cached langchain-0.3.1

In [9]:
from langchain.chat_models import ChatOpenAI

In [10]:
from getpass import getpass
openai_key = getpass("OpenAI Key: ")

In [11]:
import os
os.environ["OPENAI_API_KEY"] = openai_key

In [12]:
llm = ChatOpenAI(temperature=0, model="gpt-4o") 

  llm = ChatOpenAI(temperature=0, model="gpt-4o")


## LANGCHAIN

### Using LangChain's LCEL architecture
LangChain recommends composing pipelines with LCEL (LangChain Expression Language) rather than the legacy `Chain` classes.

## Importing LangChain Libraries.

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [14]:
# Define JSON schemas for conversion.
basics_schema = '''{
    "name": "",
    "label": "",
    "image": "",
    "email": "",
    "phone": "",
    "url": "",
    "summary": "",
    "location": {
        "address": "",
        "postalCode": "",
        "city": "",
        "countryCode": "",
        "region": ""
    },
    "profiles": [
        {
            "network": "",
            "username": "",
            "url": ""
        }
    ]
}'''

work_schema = '''[
    {
        "name": "",
        "location": "",
        "description": "",
        "position": "",
        "url": "",
        "startDate": "",
        "endDate": "",
        "summary": "",
        "highlights": []
    }
]'''

education_schema = '''[
    {
        "institution": "",
        "url": "",
        "area": "",
        "studyType": "",
        "startDate": "",
        "endDate": "",
        "score": "",
        "courses": []
    }
]'''

skills_schema = '''[
    {
        "name": "",
        "level": "",
        "keywords": []
    }
]'''

projects_schema = '''[
    {
        "name": "",
        "description": "",
        "highlights": [],
        "keywords": [],
        "startDate": "",
        "endDate": "",
        "url": "",
        "roles": [],
        "entity": "",
        "type": ""
    }
]'''

In [15]:
conversion_prompt = (
    "Convert the extracted text into a JSON object or array that strictly adheres to the provided JSON schema. "
    "Use only the information present in the text; for any field not mentioned, set its value to an empty string. "
    "Output only the JSON array with no extra commentary or code.\n"
    "### Schema:\n{schema}\n"
    "### Extracted Text:\n{text}"
)

In [16]:
conversion_chain = (
    {"text": RunnablePassthrough(), "schema": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(conversion_prompt)
    | llm
    | StrOutputParser()
)

### "Basics" Section

In [17]:
%%time

basics_json_response = conversion_chain.invoke({"text": basics_extracted, "schema": basics_schema})

print(basics_json_response)

```json
{
    "name": "Richard Hendriks",
    "label": "",
    "image": "",
    "email": "richard.hendriks@mail.com",
    "phone": "",
    "url": "http://richardhendricks.example.com",
    "summary": "Richard hails from Tulsa. He has earned degrees from the University of Oklahoma and Stanford. (Go Sooners and Cardinal!) Before starting Pied Piper, he worked for Hooli as a part time software developer. While his work focuses on applied information theory, mostly optimizing lossless compression schema of both the length-limited and adaptive variants, his non-work interests range widely, everything from quantum computing to chaos theory. He could tell you about it, but THAT would NOT be a “length-limited” conversation!",
    "location": {
        "address": "",
        "postalCode": "",
        "city": "San Francisco",
        "countryCode": "",
        "region": "California"
    },
    "profiles": [
        {
            "network": "",
            "username": "",
            "url": ""
  

In [18]:
def extract_json_object(response: str) -> str:
    """
    Extracts the JSON substring from the LLM response.
    """
    match = re.search(r'(\{.*\})', response, re.DOTALL)
    if match:
        return match.group(1)
    return response


In [19]:
basics_json_parsed = json.loads(extract_json_object(basics_json_response))
basics_json_parsed


{'name': 'Richard Hendriks',
 'label': '',
 'image': '',
 'email': 'richard.hendriks@mail.com',
 'phone': '',
 'url': 'http://richardhendricks.example.com',
 'summary': 'Richard hails from Tulsa. He has earned degrees from the University of Oklahoma and Stanford. (Go Sooners and Cardinal!) Before starting Pied Piper, he worked for Hooli as a part time software developer. While his work focuses on applied information theory, mostly optimizing lossless compression schema of both the length-limited and adaptive variants, his non-work interests range widely, everything from quantum computing to chaos theory. He could tell you about it, but THAT would NOT be a “length-limited” conversation!',
 'location': {'address': '',
  'postalCode': '',
  'city': 'San Francisco',
  'countryCode': '',
  'region': 'California'},
 'profiles': [{'network': '', 'username': '', 'url': ''}]}

### "Work" Section

In [20]:
%%time

work_json_response = conversion_chain.invoke({"text": work_extracted, "schema": work_schema})

print(work_json_response)

```json
[
    {
        "name": "Pied Piper",
        "location": "",
        "description": "",
        "position": "CEO/President",
        "url": "",
        "startDate": "Dec 2013",
        "endDate": "Dec 2014",
        "summary": "",
        "highlights": [
            "Build an algorithm for artist to detect if their music was violating copy right infringement laws",
            "Successfully won Techcrunch Disrupt",
            "Optimized an algorithm that holds the current world record for Weisman Scores"
        ]
    }
]
```
CPU times: user 15.7 ms, sys: 3.44 ms, total: 19.2 ms
Wall time: 1.92 s


In [21]:
def extract_json_array(response: str) -> str:
    """
    Extracts the JSON array substring from the LLM response.
    """
    match = re.search(r'(\[.*\])', response, re.DOTALL)
    if match:
        return match.group(1)
    return response

In [22]:
work_json_parsed = json.loads(extract_json_array(work_json_response))
work_json_parsed


[{'name': 'Pied Piper',
  'location': '',
  'description': '',
  'position': 'CEO/President',
  'url': '',
  'startDate': 'Dec 2013',
  'endDate': 'Dec 2014',
  'summary': '',
  'highlights': ['Build an algorithm for artist to detect if their music was violating copy right infringement laws',
   'Successfully won Techcrunch Disrupt',
   'Optimized an algorithm that holds the current world record for Weisman Scores']}]

### "Education" Section

In [23]:
%%time

education_json_response = conversion_chain.invoke({"text": education_extracted, "schema": education_schema})

print(education_json_response)

```json
[
    {
        "institution": "University of Oklahoma",
        "url": "",
        "area": "Information Technology",
        "studyType": "Bachelor",
        "startDate": "Jun 2011",
        "endDate": "Jan 2014",
        "score": "4.0",
        "courses": []
    }
]
```
CPU times: user 10.5 ms, sys: 3.02 ms, total: 13.5 ms
Wall time: 937 ms


In [24]:
education_json_parsed = json.loads(extract_json_array(education_json_response))
education_json_parsed


[{'institution': 'University of Oklahoma',
  'url': '',
  'area': 'Information Technology',
  'studyType': 'Bachelor',
  'startDate': 'Jun 2011',
  'endDate': 'Jan 2014',
  'score': '4.0',
  'courses': []}]

### "Skills" Section

In [25]:
%%time

skills_json_response = conversion_chain.invoke({"text": skills_extracted, "schema": skills_schema})

print(skills_json_response)

```json
[
    {
        "name": "Web Development",
        "level": "",
        "keywords": ["HTML", "CSS", "Javascript"]
    },
    {
        "name": "Compression",
        "level": "",
        "keywords": ["Mpeg", "MP4", "GIF"]
    }
]
```
CPU times: user 15.1 ms, sys: 3.51 ms, total: 18.6 ms
Wall time: 1.22 s


In [26]:
skills_json_parsed = json.loads(extract_json_array(skills_json_response))
skills_json_parsed


[{'name': 'Web Development',
  'level': '',
  'keywords': ['HTML', 'CSS', 'Javascript']},
 {'name': 'Compression', 'level': '', 'keywords': ['Mpeg', 'MP4', 'GIF']}]

### "Projects" Section

In [27]:
%%time

projects_json_response = conversion_chain.invoke({"text": projects_extracted, "schema": projects_schema})

print(projects_json_response)

```json
[
    {
        "name": "Miss Direction",
        "description": "",
        "highlights": [
            "Won award at AIHacks 2016",
            "Built by all women team of newbie programmers"
        ],
        "keywords": [
            "GoogleMaps",
            "Chrome Extension",
            "Javascript"
        ],
        "startDate": "",
        "endDate": "",
        "url": "",
        "roles": [],
        "entity": "",
        "type": ""
    }
]
```
CPU times: user 13.7 ms, sys: 2.79 ms, total: 16.5 ms
Wall time: 1.35 s


In [28]:
projects_json_parsed = json.loads(extract_json_array(projects_json_response))
projects_json_parsed


[{'name': 'Miss Direction',
  'description': '',
  'highlights': ['Won award at AIHacks 2016',
   'Built by all women team of newbie programmers'],
  'keywords': ['GoogleMaps', 'Chrome Extension', 'Javascript'],
  'startDate': '',
  'endDate': '',
  'url': '',
  'roles': [],
  'entity': '',
  'type': ''}]

## Put together the final JSON

In [29]:
# put it all together

resume_json = {
    "$schema": "https://raw.githubusercontent.com/jsonresume/resume-schema/v1.0.0/schema.json",
    "basics": basics_json_parsed,
    "work": work_json_parsed,
    "education": education_json_parsed,
    "skills": skills_json_parsed,
    "projects": projects_json_parsed,
    "meta": {
        "canonical": "https://raw.githubusercontent.com/jsonresume/resume-schema/v1.0.0/sample.resume.json",
        "version": "v1.0.0",
        "lastModified": "2017-12-24T15:53:00"
    }
}

resume_json

{'$schema': 'https://raw.githubusercontent.com/jsonresume/resume-schema/v1.0.0/schema.json',
 'basics': {'name': 'Richard Hendriks',
  'label': '',
  'image': '',
  'email': 'richard.hendriks@mail.com',
  'phone': '',
  'url': 'http://richardhendricks.example.com',
  'summary': 'Richard hails from Tulsa. He has earned degrees from the University of Oklahoma and Stanford. (Go Sooners and Cardinal!) Before starting Pied Piper, he worked for Hooli as a part time software developer. While his work focuses on applied information theory, mostly optimizing lossless compression schema of both the length-limited and adaptive variants, his non-work interests range widely, everything from quantum computing to chaos theory. He could tell you about it, but THAT would NOT be a “length-limited” conversation!',
  'location': {'address': '',
   'postalCode': '',
   'city': 'San Francisco',
   'countryCode': '',
   'region': 'California'},
  'profiles': [{'network': '', 'username': '', 'url': ''}]},
 'w

In [30]:
output_file = '../data/Richard Hendricks.json'
with open(output_file, 'w') as f:
    json.dump(resume_json, f, indent=4)
    print(f"Resume JSON saved to {output_file}")

Resume JSON saved to ../data/Richard Hendricks.json
