# Load the Dataset

In [1]:
import pdfplumber
import glob
import json
import re

In [2]:
pdf_resume_path = '../data/INFORMATION-TECHNOLOGY'
pdf_resume_files = glob.glob(f'{pdf_resume_path}/*.pdf')

pdf_resume_files

['../data/INFORMATION-TECHNOLOGY/18176523.pdf',
 '../data/INFORMATION-TECHNOLOGY/25857360.pdf',
 '../data/INFORMATION-TECHNOLOGY/39718499.pdf',
 '../data/INFORMATION-TECHNOLOGY/40018190.pdf',
 '../data/INFORMATION-TECHNOLOGY/31243710.pdf',
 '../data/INFORMATION-TECHNOLOGY/25990239.pdf',
 '../data/INFORMATION-TECHNOLOGY/52246737.pdf',
 '../data/INFORMATION-TECHNOLOGY/19201175.pdf',
 '../data/INFORMATION-TECHNOLOGY/36434348.pdf',
 '../data/INFORMATION-TECHNOLOGY/25207620.pdf',
 '../data/INFORMATION-TECHNOLOGY/12635195.pdf',
 '../data/INFORMATION-TECHNOLOGY/26746496.pdf',
 '../data/INFORMATION-TECHNOLOGY/23666211.pdf',
 '../data/INFORMATION-TECHNOLOGY/24038620.pdf',
 '../data/INFORMATION-TECHNOLOGY/29051656.pdf',
 '../data/INFORMATION-TECHNOLOGY/33381211.pdf',
 '../data/INFORMATION-TECHNOLOGY/10840430.pdf',
 '../data/INFORMATION-TECHNOLOGY/28697203.pdf',
 '../data/INFORMATION-TECHNOLOGY/38753827.pdf',
 '../data/INFORMATION-TECHNOLOGY/27372171.pdf',
 '../data/INFORMATION-TECHNOLOGY/2488910

In [3]:
# use 33241454.pdf as an example

pdf_33241454_path = '../data/INFORMATION-TECHNOLOGY/33241454.pdf'

pdf_33241454_text = ''
with pdfplumber.open(pdf_33241454_path) as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:
            pdf_33241454_text += page_text + '\n'

print(pdf_33241454_text)

INFORMATION TECHNOLOGY SUPERVISOR
Summary
Seeking a position as an Information Technology Specialist. Over 5 years of information technology experience in the U.S. Army, including over 1
year of supervisory experience. Trained personnel in the set-up of IT equipment, ensuring all equipment is properly connected and functioning.
Regularly troubleshoot and install various IT equipment and systems. Accountable for the maintenance and inventory of over $1 million worth of IT
and other communications equipment with zero losses or damages. Possess a Security and Microsoft Certification and a Secret Security
Clearance.
Highlights
Excellent communication techniques
Complex problem solver
Manufacturing systems integration
Advanced critical thinking
Multidisciplinary exposure
SharePoint
Design instruction creation
Microsoft Excel, Project and Visio
Project management
Accomplishments
Army Achievement Medal for impacting the communications network and overall success of Key Resolve 13 in Yong San,

## LANGCHAIN

### Using the new LCEL Architecture from LangChain.
LangChain recommends using LCEL (LangChain Expression Language) over Chains. 

Now we create the retriever object, the responsible to return the data contained in the ChromaDB Database.

In [4]:
# !pip install huggingface_hub==0.26.0

In [5]:
# !pip install --upgrade huggingface_hub

In [6]:
from getpass import getpass
hf_key = getpass("Hugging Face Key: ")

In [7]:
!huggingface-cli login --token $hf_key

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `langchain` has been saved to /Users/ebengunadi/.cache/huggingface/stored_tokens
Your token has been saved to /Users/ebengunadi/.cache/huggingface/token
Login successful.
The current active token is: `langchain`


## Importing LangChain Libraries.

In [8]:
from langchain.llms import HuggingFacePipeline
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [9]:
from langchain import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain.llms import HuggingFacePipeline
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import torch
from torch import cuda

In [11]:
#In a MAC Silicon the device must be 'mps'
# device = torch.device('mps') #to use with MAC Silicon
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [12]:
device

'cpu'

##Load the Model .

In [None]:
#You can try with any llama model, but you will need more GPU and memory as you
#increase the size of the model.
# model_id = "meta-llama/Llama-3.2-1B-Instruct" # Agreed per the 3/11/25 meeting, but not powerful enough!
model_id = "meta-llama/Llama-3.2-3B-Instruct" 

In [14]:
# !pip install --upgrade transformers 

In [15]:
# !pip install accelerate

In [16]:
from accelerate import init_empty_weights

In [17]:
%%time

# begin initializing HF items, need auth token for these
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_key
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    device_map='auto',
    token=hf_key
)
model.eval()
print(f"Model loaded on {device}")


Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.32s/it]
Some parameters are on the meta device because they were offloaded to the disk.


Model loaded on cpu
CPU times: user 4.63 s, sys: 5.1 s, total: 9.73 s
Wall time: 13.4 s


In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          use_aut_token=hf_key)


In [60]:
# Set up the HuggingFace pipeline with your desired settings.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256, # ideally would be 1024, but takes too long given the resource constraints
    temperature=0,         # Deterministic output
    top_p=1.0,             # Allow full probability distribution
    do_sample=False,       # Greedy decoding
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.0,  # Lowered if necessary
    return_full_text=False,
    device_map="auto"
)
hf_llm = HuggingFacePipeline(pipeline=pipe)

Device set to use mps


Test LLM 

In [61]:
%%time
test_prompt = f"""
In the provided resume text, please extract the "Summary" section exactly as it appears. Follow these rules carefully:

1. Identify the first occurrence of a line that contains the word "Summary" by itself (ignoring any leading or trailing whitespace and case).
2. Starting from the line immediately after the "Summary" header, capture all subsequent lines until you encounter a new section header.
3. A new section header is defined as either:
   - a line that is entirely in uppercase letters (ignoring punctuation), or
   - a line that exactly matches (ignoring case) one of these words: Highlights, Experience, Education, Skills, Accomplishments.
4. Preserve the original formatting (line breaks, etc.) of the extracted text.
5. If no "Summary" section is found, return exactly: "No summary section found."

### Resume Text:
{pdf_33241454_text}
"""

# Then run your LLM call:
test_response = hf_llm(test_prompt)
print(test_response)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


### Extracted Summary Section:
```
Seeking a position as an Information Technology Specialist. Over 5 years of information technology experience in the U.S. Army, including over 1
year of supervisory experience. Trained personnel in the set-up of IT equipment, ensuring all equipment is properly connected and functioning.
Regularly troubleshoot and install various IT equipment and systems. Accountable for the maintenance and inventory of over $1 million worth of IT
and other communications equipment with zero losses or damages. Possess a Security and Microsoft Certification and a Secret Security
Clearance.
```
CPU times: user 8.92 s, sys: 25.2 s, total: 34.1 s
Wall time: 1min 13s


In [62]:
# --- Optimized Extraction Prompts ---
# BASICS: Look for headings like "BASICS", "Personal Information", or "Contact Details"
basics_extraction_prompt = (
    "Extract the BASICS section (also known as 'personal information' or 'contact details') "
    "from the resume text below. Return exactly the raw text of the first block that clearly represents this section. "
    "Do not generate any new content or commentary. If the section is not found, return exactly 'No basics section found.'\n\n"
    "Resume Text:\n{text}"
)

# WORK: Look for headings such as "WORK", "Experience", "Professional Experience", or "Employment History"
work_extraction_prompt = (
    "Extract the WORK section (also known as 'experience', 'professional experience', or 'employment history') "
    "from the resume text below. Return exactly the raw text of the first block that clearly represents this section. "
    "Do not generate any new details or formatting.'\n\n"
    "Resume Text:\n{text}"
)

# EDUCATION: Include synonyms such as 'Education', 'Academic Background', 'Academic History', or 'Qualifications'
education_extraction_prompt = (
    "Extract the EDUCATION section (also known as 'academic background', 'academic history', or 'qualifications') "
    "from the resume text below. Return exactly the raw text of the first block that clearly represents this section. "
    "Do not generate any new content or commentary. If the section is not found, return exactly 'No education section found.'\n\n"
    "Resume Text:\n{text}"
)

# SKILLS: Include synonyms like 'Skills', 'Competencies', 'Expertise', 'Technical Skills', or 'Proficiencies'
skills_extraction_prompt = (
    "Extract the SKILLS section (also known as 'competencies', 'expertise', 'technical skills', or 'proficiencies') "
    "from the resume text below. Return exactly the raw text of the first block that clearly represents this section. "
    "Do not generate any new details or commentary. If the section is not found, return exactly 'No skills section found.'\n\n"
    "Resume Text:\n{text}"
)

# PROJECTS: Include synonyms like 'Projects', 'Portfolio', or 'Work Samples'
projects_extraction_prompt = (
    "Extract the PROJECTS section (also known as 'portfolio' or 'work samples') "
    "from the resume text below. Return exactly the raw text of the first block that clearly represents this section. "
    "Do not generate any new content or commentary. If the section is not found, return exactly 'No projects section found.'\n\n"
    "Resume Text:\n{text}"
)

In [63]:
# Create extraction chains.
basics_extraction_chain = (
    {"text": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(basics_extraction_prompt)
    | hf_llm
    | StrOutputParser()
)
work_extraction_chain = (
    {"text": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(work_extraction_prompt)
    | hf_llm
    | StrOutputParser()
)
education_extraction_chain = (
    {"text": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(education_extraction_prompt)
    | hf_llm
    | StrOutputParser()
)
skills_extraction_chain = (
    {"text": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(skills_extraction_prompt)
    | hf_llm
    | StrOutputParser()
)
projects_extraction_chain = (
    {"text": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(projects_extraction_prompt)
    | hf_llm
    | StrOutputParser()
)

In [64]:
%%time

basics_extracted = basics_extraction_chain.invoke({"text": pdf_33241454_text})
print(basics_extracted)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.





No basics section found.
CPU times: user 519 ms, sys: 1.67 s, total: 2.19 s
Wall time: 7.6 s


In [65]:
%%time

work_extracted = work_extraction_chain.invoke({"text": pdf_33241454_text})
print(work_extracted)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.





WORK SECTION
'Information Technology Supervisor 01/2011 to 05/2014 Company Name City, State\nSupervise up to 10 personnel at one time, delegating tasks, conducting performance evaluations and providing corrective counseling as\nnecessary.\nTrain personnel in the set-up and proper use of IT related equipment while adhering to all policies and procedures.\nResponsible for the inventory of over $1 million worth of network communications equipment.\nTasked by President of the United States to act as supervisor and maintain signal communications for Fort Bragg army base.\nInformation Technology Technician 01/2009 to 01/2011 Company Name City, State\nMaintained communications equipment in order to effectively relay confidential and secret information.\nUtilized electronic test equipment to troubleshoot malfunctioning communications equipment and complete repairs as necessary.\nRegularly set up and added computer systems to a communication network, installing operation systems, accessing s

In [66]:
%%time 

education_extracted = education_extraction_chain.invoke({"text": pdf_33241454_text})
print(education_extracted)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.





'Education\nCertification, Windows 7, Microsoft, Fort Bragg, NC, 2012\n*Certification, Security, Comptia, Yong San, Korea, 2012\n*Distinguished Graduate Certificate, Information Technology (Network Communications) Course 2009 U.S. Army City, State\nCertificate, IT Network and Cisco Routing, IT Field Services Branch 2009 City, State\nAssociate of Science : Radiography 2008 Northwest Florida State College City, State Radiography\nCertificate 2001 IT Tech Prep, Trumbull Career and Technical Center State\nDiploma 2001 Warren G. Harding City, State\nSkills\nArmy, Cisco, counseling, customer assistance, database, documentation, Information Technology, inventory, IP, LAN, Windows 7, Network,\npersonnel, policies, protocols, repairs, Routing, San, supervisor, test equipment, troubleshoot, WAN' 

No education section found.
CPU times: user 13.9 s, sys: 35.2 s, total: 49 s
Wall time: 1min 55s


In [67]:
%%time 

skills_extracted = skills_extraction_chain.invoke({"text": pdf_33241454_text})
print(skills_extracted)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.





No skills section found.
CPU times: user 596 ms, sys: 1.75 s, total: 2.34 s
Wall time: 7.86 s


In [68]:
%%time 

projects_extracted = projects_extraction_chain.invoke({"text": pdf_33241454_text})
print(projects_extracted)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.





No projects section found.
CPU times: user 625 ms, sys: 1.32 s, total: 1.95 s
Wall time: 6.26 s


In [82]:
# Define JSON schemas for conversion.
basics_schema = '''{
    "name": "",
    "label": "",
    "image": "",
    "email": "",
    "phone": "",
    "url": "",
    "summary": "",
    "location": {
        "address": "",
        "postalCode": "",
        "city": "",
        "countryCode": "",
        "region": ""
    },
    "profiles": [
        {
            "network": "",
            "username": "",
            "url": ""
        }
    ]
}'''

work_schema = '''[
    {
        "name": "",
        "location": "",
        "description": "",
        "position": "",
        "url": "",
        "startDate": "",
        "endDate": "",
        "summary": "",
        "highlights": []
    }
]'''

education_schema = '''[
    {
        "institution": "",
        "url": "",
        "area": "",
        "studyType": "",
        "startDate": "",
        "endDate": "",
        "score": "",
        "courses": []
    }
]'''

skills_schema = '''[
    {
        "name": "",
        "level": "",
        "keywords": []
    }
]'''

projects_schema = '''[
    {
        "name": "",
        "description": "",
        "highlights": [],
        "keywords": [],
        "startDate": "",
        "endDate": "",
        "url": "",
        "roles": [],
        "entity": "",
        "type": ""
    }
]'''

In [101]:
# --- Optimized Conversion Prompts ---
# These prompts convert the extracted raw text into the desired JSON format without any extra commentary.
basics_conversion_prompt = (
    "Convert the extracted text into a JSON object that strictly adheres to the provided JSON schema. "
    "Use only the information present in the text; for any field not mentioned, set its value to an empty string. "
    "Output only the JSON object with no extra commentary or code.\n\n"
    "Schema: {schema}\n\n"
    "Extracted Text:\n{text}"
)

work_conversion_prompt = (
    "Convert the extracted text into a JSON array that strictly adheres to the provided JSON schema. "
    "Use only the details provided in the text; for any field not mentioned, set its value to an empty string (or an empty array as required). "
    "Output only the JSON array with no extra commentary or code.\n\n"
    "Schema: {schema}\n\n"
    "Extracted Text:\n{text}"
)

education_conversion_prompt = (
    "Convert the extracted text into a JSON array that strictly adheres to the provided JSON schema. "
    "Use only the details provided in the text; for any field not mentioned, set its value to an empty string. "
    "Output only the JSON array with no extra commentary or code.\n\n"
    "Schema: {schema}\n\n"
    "Extracted Text:\n{text}\n\n"
    "Note: For the 'courses' field, just return an array of names like ['Course 1', 'Course 2']."
)

skills_conversion_prompt = (
    "Convert the extracted text into a JSON array that strictly adheres to the provided JSON schema. "
    "Use only the details provided in the text; for any field not mentioned, set its value to an empty string. "
    "Output only the JSON array with no extra commentary or code.\n\n"
    "Schema: {schema}\n\n"
    "Extracted Text:\n{text}"
)

projects_conversion_prompt = (
    "Convert the extracted text into a JSON array that strictly adheres to the provided JSON schema. "
    "Use only the details provided in the text; for any field not mentioned, set its value to an empty string. "
    "Output only the JSON array with no extra commentary or code.\n\n"
    "Schema: {schema}\n\n"
    "Extracted Text:\n{text}"
)

In [102]:
# Create conversion chains.
basics_conversion_chain = (
    {"text": RunnablePassthrough(), "schema": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(basics_conversion_prompt)
    | hf_llm
    | StrOutputParser()
)
work_conversion_chain = (
    {"text": RunnablePassthrough(), "schema": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(work_conversion_prompt)
    | hf_llm
    | StrOutputParser()
)
education_conversion_chain = (
    {"text": RunnablePassthrough(), "schema": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(education_conversion_prompt)
    | hf_llm
    | StrOutputParser()
)
skills_conversion_chain = (
    {"text": RunnablePassthrough(), "schema": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(skills_conversion_prompt)
    | hf_llm
    | StrOutputParser()
)
projects_conversion_chain = (
    {"text": RunnablePassthrough(), "schema": RunnablePassthrough()}
    | ChatPromptTemplate.from_template(projects_conversion_prompt)
    | hf_llm
    | StrOutputParser()
)

In [85]:
%%time

# if the extracted text startsWith "No.*section found" is in the extracted text, just return the schema
# else run the conversion chain

if re.match(r"^No\s+\w+\s+section\s+found\.?", basics_extracted.strip(), re.IGNORECASE) is not None:
    basics_json_response = basics_schema
else:
    basics_json_response = basics_conversion_chain.invoke({"text": basics_extracted, "schema": basics_schema})

print(basics_json_response)

{
    "name": "",
    "label": "",
    "image": "",
    "email": "",
    "phone": "",
    "url": "",
    "summary": "",
    "location": {
        "address": "",
        "postalCode": "",
        "city": "",
        "countryCode": "",
        "region": ""
    },
    "profiles": [
        {
            "network": "",
            "username": "",
            "url": ""
        }
    ]
}
CPU times: user 389 µs, sys: 145 µs, total: 534 µs
Wall time: 513 µs


In [86]:
def extract_json_object(response: str) -> str:
    """
    Extracts the JSON substring from the LLM response.
    """
    match = re.search(r'(\{.*\})', response, re.DOTALL)
    if match:
        return match.group(1)
    return response


In [87]:
basics_json_parsed = json.loads(extract_json_object(basics_json_response))
basics_json_parsed


{'name': '',
 'label': '',
 'image': '',
 'email': '',
 'phone': '',
 'url': '',
 'summary': '',
 'location': {'address': '',
  'postalCode': '',
  'city': '',
  'countryCode': '',
  'region': ''},
 'profiles': [{'network': '', 'username': '', 'url': ''}]}

In [88]:
%%time

# if the extracted text startsWith "No.*section found" is in the extracted text, just return the schema
# else run the conversion chain

if re.match(r"^No\s+\w+\s+section\s+found\.?", work_extracted.strip(), re.IGNORECASE) is not None:
    work_json_response = work_schema
else:
    work_json_response = work_conversion_chain.invoke({"text": work_extracted, "schema": work_schema})

print(work_json_response)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


 

```json
[
  {
    "name": "Information Technology Supervisor",
    "location": "City, State",
    "description": "",
    "position": "Supervise up to 10 personnel at one time, delegating tasks, conducting performance evaluations and providing corrective counseling as necessary. Train personnel in the set-up and proper use of IT related equipment while adhering to all policies and procedures. Responsible for the inventory of over $1 million worth of network communications equipment. Tasked by President of the United States to act as supervisor and maintain signal communications for Fort Bragg army base.",
    "url": "",
    "startDate": "01/2011",
    "endDate": "05/2014",
    "summary": "",
    "highlights": []
  },
  {
    "name": "Information Technology Technician",
    "location": "City, State",
    "description": "Maintained communications equipment in order to effectively relay confidential and secret information. Utilized electronic test equipment to troubleshoot malfunctionin

In [89]:
def extract_first_json_array(response: str) -> str:
    """
    Extracts the JSON array substring from the LLM response.
    Due to the 256 max token limit, just return the first object in the array and add a closing bracket.
    """
    match = re.search(r'(\[.*?\})', response, re.DOTALL)
    if match:
        return match.group(1) + ']'
    return response
    

In [90]:
work_json_parsed = json.loads(extract_first_json_array(work_json_response))
work_json_parsed


[{'name': 'Information Technology Supervisor',
  'location': 'City, State',
  'description': '',
  'position': 'Supervise up to 10 personnel at one time, delegating tasks, conducting performance evaluations and providing corrective counseling as necessary. Train personnel in the set-up and proper use of IT related equipment while adhering to all policies and procedures. Responsible for the inventory of over $1 million worth of network communications equipment. Tasked by President of the United States to act as supervisor and maintain signal communications for Fort Bragg army base.',
  'url': '',
  'startDate': '01/2011',
  'endDate': '05/2014',
  'summary': '',
  'highlights': []}]

In [103]:
%%time

# if the extracted text startsWith "No.*section found" is in the extracted text, just return the schema
# else run the conversion chain

if re.match(r"^No\s+\w+\s+section\s+found\.?", education_extracted.strip(), re.IGNORECASE) is not None:
    education_json_response = education_schema
else:
    education_json_response = education_conversion_chain.invoke({"text": education_extracted, "schema": education_schema})

print(education_json_response)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


 Do not include the course numbers.

```json
[
  {
    "institution": "Microsoft",
    "url": "",
    "area": "",
    "studyType": "",
    "startDate": "",
    "endDate": "",
    "score": "",
    "courses": [
      "Information Technology (Network Communications)",
      "IT Network and Cisco Routing"
    ]
  },
  {
    "institution": "Comptia",
    "url": "",
    "area": "",
    "studyType": "",
    "startDate": "",
    "endDate": "",
    "score": "",
    "courses": [
      "Security"
    ]
  },
  {
    "institution": "U.S. Army",
    "url": "",
    "area": "",
    "studyType": "",
    "startDate": "",
    "endDate": "",
    "score": "",
    "courses": [
      "Information Technology (Network Communications)"
    ]
  },
  {
    "institution": "Northwest Florida State College",
    "url": "",
    "area": "",
    "studyType": "",
    "startDate": "",
    "endDate": "",
    "score": "",
    "courses": [
      "Radiography"
    ]
  },
  {
    "institution": "
CPU times: user 18.2 s, sys: 

In [104]:
education_json_parsed = json.loads(extract_first_json_array(education_json_response))
education_json_parsed


[{'institution': 'Microsoft',
  'url': '',
  'area': '',
  'studyType': '',
  'startDate': '',
  'endDate': '',
  'score': '',
  'courses': ['Information Technology (Network Communications)',
   'IT Network and Cisco Routing']}]

In [105]:
%%time

# if the extracted text startsWith "No.*section found" is in the extracted text, just return the schema
# else run the conversion chain

if re.match(r"^No\s+\w+\s+section\s+found\.?", skills_extracted.strip(), re.IGNORECASE) is not None:
    skills_json_response = skills_schema
else:
    skills_json_response = skills_conversion_chain.invoke({"text": skills_extracted, "schema": skills_schema})

print(skills_json_response)

[
    {
        "name": "",
        "level": "",
        "keywords": []
    }
]
CPU times: user 91 µs, sys: 122 µs, total: 213 µs
Wall time: 428 µs


In [106]:
skills_json_parsed = json.loads(extract_first_json_array(skills_json_response))
skills_json_parsed


[{'name': '', 'level': '', 'keywords': []}]

In [107]:
%%time

# if the extracted text startsWith "No.*section found" is in the extracted text, just return the schema
# else run the conversion chain

if re.match(r"^No\s+\w+\s+section\s+found\.?", projects_extracted.strip(), re.IGNORECASE) is not None:
    projects_json_response = projects_schema
else:
    projects_json_response = projects_conversion_chain.invoke({"text": projects_extracted, "schema": projects_schema})

print(projects_json_response)

[
    {
        "name": "",
        "description": "",
        "highlights": [],
        "keywords": [],
        "startDate": "",
        "endDate": "",
        "url": "",
        "roles": [],
        "entity": "",
        "type": ""
    }
]
CPU times: user 269 µs, sys: 187 µs, total: 456 µs
Wall time: 634 µs


In [108]:
projects_json_parsed = json.loads(extract_first_json_array(projects_json_response))
projects_json_parsed


[{'name': '',
  'description': '',
  'highlights': [],
  'keywords': [],
  'startDate': '',
  'endDate': '',
  'url': '',
  'roles': [],
  'entity': '',
  'type': ''}]

## Put together the final JSON

In [109]:
# put it all together

resume_json = {
    "$schema": "https://raw.githubusercontent.com/jsonresume/resume-schema/v1.0.0/schema.json",
    "basics": basics_json_parsed,
    "work": work_json_parsed,
    "education": education_json_parsed,
    "skills": skills_json_parsed,
    "projects": projects_json_parsed,
    "meta": {
        "canonical": "https://raw.githubusercontent.com/jsonresume/resume-schema/v1.0.0/sample.resume.json",
        "version": "v1.0.0",
        "lastModified": "2017-12-24T15:53:00"
    }
}

resume_json

{'$schema': 'https://raw.githubusercontent.com/jsonresume/resume-schema/v1.0.0/schema.json',
 'basics': {'name': '',
  'label': '',
  'image': '',
  'email': '',
  'phone': '',
  'url': '',
  'summary': '',
  'location': {'address': '',
   'postalCode': '',
   'city': '',
   'countryCode': '',
   'region': ''},
  'profiles': [{'network': '', 'username': '', 'url': ''}]},
 'work': [{'name': 'Information Technology Supervisor',
   'location': 'City, State',
   'description': '',
   'position': 'Supervise up to 10 personnel at one time, delegating tasks, conducting performance evaluations and providing corrective counseling as necessary. Train personnel in the set-up and proper use of IT related equipment while adhering to all policies and procedures. Responsible for the inventory of over $1 million worth of network communications equipment. Tasked by President of the United States to act as supervisor and maintain signal communications for Fort Bragg army base.',
   'url': '',
   'startD

In [111]:
output_file = '../data/INFORMATION-TECHNOLOGY-JSON/33241454.json'
with open(output_file, 'w') as f:
    json.dump(resume_json, f, indent=4)
    print(f"Resume JSON saved to {output_file}")

Resume JSON saved to ../data/INFORMATION-TECHNOLOGY-JSON/33241454.json


## Cleanup



In [None]:
# !pip install "huggingface_hub[cli]"

In [None]:
# cleanup huggingface LLMs

# run this command in the terminal
# huggingface-cli delete-cache