In [None]:
import json
from pathlib import Path

# import typing as t
import typing_extensions as typing
from IPython.display import Markdown

from kedro.config import OmegaConfigLoader
from kedro.framework.project import settings
from google import genai

In [None]:
# Locate the project's configuration directory relative to the parent of the
# current working directory, then read the Gemini API key from the credentials.
conf_path = str(Path("..").joinpath(settings.CONF_SOURCE))
conf_loader = OmegaConfigLoader(conf_source=conf_path)
google_api_credentials = conf_loader["credentials"]["google_api_credentials"]
GOOGLE_API_KEY = google_api_credentials["key"]

### Purpose of this Notebook

The purpose of this notebook is to convert a resume from PDF format to JSON format, adhering to the [JSON Resume schema](https://jsonresume.org/schema).

Load the PDF resume.

In [None]:
# Load the resume text from the data catalog. `catalog` is not defined in this
# notebook (hence the F821 suppression) — presumably injected by Kedro's
# IPython/Jupyter integration; confirm the kernel was started through Kedro.
pdf_cv: str = catalog.load("pdf_resume")  # noqa: F821
# Fail fast if the dataset does not yield a string; a later cell embeds it in a prompt.
assert isinstance(pdf_cv, str), "pdf_cv is not a string"

# Render the resume text as Markdown for a quick visual check.
Markdown(pdf_cv)

Create the Gemini client.

In [None]:
# Gemini API client, authenticated with the key read from the project credentials.
client = genai.Client(api_key=GOOGLE_API_KEY)

Define the response schema for the resume, based on the JSON Resume Schema available at [https://jsonresume.org/schema](https://jsonresume.org/schema).

In [None]:
# Define TypedDicts for each section of the resume
class Location(typing.TypedDict, total=False):
    # Value type of `Basics.location`.
    # `total=False` makes every key optional; key names are camelCase to follow
    # the JSON Resume schema this notebook targets.
    address: str
    postalCode: str
    city: str
    countryCode: str
    region: str


class Profile(typing.TypedDict, total=False):
    # Entry type of the `Basics.profiles` list; every key is optional (total=False).
    network: str
    username: str
    url: str


class Basics(typing.TypedDict, total=False):
    # Value type of `Curriculum.basics`; every key is optional (total=False).
    name: str
    label: str
    image: str
    email: str
    phone: str
    url: str
    summary: str
    # Nested structures: a single Location and a list of Profile entries.
    location: Location
    profiles: list[Profile]


class WorkItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.work` list; every key is optional (total=False).
    name: str
    position: str
    url: str
    # Dates are plain strings; no date format is enforced by this type.
    startDate: str
    endDate: str
    summary: str
    highlights: list[str]


class VolunteerItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.volunteer` list; every key is optional (total=False).
    organization: str
    position: str
    url: str
    # Dates are plain strings; no date format is enforced by this type.
    startDate: str
    endDate: str
    summary: str
    highlights: list[str]


class EducationItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.education` list; every key is optional (total=False).
    institution: str
    url: str
    area: str
    studyType: str
    # Dates are plain strings; no date format is enforced by this type.
    startDate: str
    endDate: str
    # The score is typed as a string, not a number.
    score: str
    courses: list[str]


class AwardItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.awards` list; every key is optional (total=False).
    title: str
    date: str
    awarder: str
    summary: str


class CertificateItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.certificates` list; every key is optional (total=False).
    name: str
    date: str
    issuer: str
    url: str


class PublicationItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.publications` list; every key is optional (total=False).
    name: str
    publisher: str
    releaseDate: str
    url: str
    summary: str


class SkillItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.skills` list; every key is optional (total=False).
    name: str
    level: str
    keywords: list[str]


class LanguageItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.languages` list; every key is optional (total=False).
    language: str
    fluency: str


class InterestItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.interests` list; every key is optional (total=False).
    name: str
    keywords: list[str]


class ReferenceItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.references` list; every key is optional (total=False).
    name: str
    reference: str


class ProjectItem(typing.TypedDict, total=False):
    # Entry type of the `Curriculum.projects` list; every key is optional (total=False).
    name: str
    # Dates are plain strings; no date format is enforced by this type.
    startDate: str
    endDate: str
    description: str
    highlights: list[str]
    url: str


# One TypedDict representing the full CV schema
class Curriculum(typing.TypedDict, total=False):
    # Top-level resume structure; this class is what the generation cell passes
    # as `response_schema`. Every section is optional (total=False).
    # NOTE(review): documented with `#` comments rather than a docstring because
    # a class docstring may be surfaced as a schema description by the schema
    # generator, which would change the request — TODO confirm.
    basics: Basics
    work: list[WorkItem]
    volunteer: list[VolunteerItem]
    education: list[EducationItem]
    awards: list[AwardItem]
    certificates: list[CertificateItem]
    publications: list[PublicationItem]
    skills: list[SkillItem]
    languages: list[LanguageItem]
    interests: list[InterestItem]
    references: list[ReferenceItem]
    projects: list[ProjectItem]

Use the LLM client to transform the extracted CV text into JSON that follows the structured schema defined above.

In [None]:
# Instruction describing what the model must produce and in which form.
system_instruction = (
    "Extract information from the provided CV text and format it according to the JSON"
    " Resume schema. Return ONLY a valid JSON object without any markdown formatting,"
    " explanations, or additional text."
)

# Task prompt with the COMPLETE resume text embedded. Do not slice `pdf_cv`
# here: sending only a prefix (e.g. the first 500 characters) hides most of the
# resume from the model, so later sections can never be extracted.
prompt_message = f"""
Given a CV text, extract all relevant information into the JSON Resume schema.
Fill in as many fields as possible based on the available information.
If information for a field is not available, use empty strings, arrays, or null values as appropriate.

CV Text:
{pdf_cv}
"""

# The instruction and the task prompt are sent together as one text prompt.
full_prompt = system_instruction + "\n\n" + prompt_message

# Request a JSON reply constrained to the `Curriculum` schema; a low
# temperature is used to reduce sampling randomness in the extraction.
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=full_prompt,
    config={
        "temperature": 0.1,
        "response_mime_type": "application/json",
        "response_schema": Curriculum,
    },
)

# Display the raw JSON text returned by the model.
response.text

Parse the model's JSON response and save it to the data catalog.

In [None]:
# Parse the model's JSON reply into a Python dictionary.
# `response.text` can be None or empty when the reply carries no text part;
# fail with an explicit message instead of the opaque TypeError that
# `json.loads(None)` would raise.
if not response.text:
    raise ValueError("Gemini returned no text; cannot build the JSON resume")
json_dict = json.loads(response.text)

# Persist the parsed resume through the data catalog. `catalog` is not defined
# in this notebook (hence the F821 suppression) — presumably injected by
# Kedro's IPython/Jupyter integration.
catalog.save("json_resume_from_pdf", json_dict)  # noqa: F821