In [26]:
import os
from enum import auto, Enum, StrEnum

from google import genai
from google.genai import types
from pydantic import BaseModel

from deep_statutes.config import STATUTES_DATA_DIR
from deep_statutes.states import ABBREV_TO_US_STATE

In [34]:
# Two-letter abbreviation of the state whose statutes are processed below.
state = "WY"

# API key comes from the environment; this is None when GEMINI_API_KEY is unset.
api_key = os.getenv("GEMINI_API_KEY")

# Source PDF, located in the lower-cased state's "pdf" folder of the data directory.
pdf_path = STATUTES_DATA_DIR.joinpath(state.lower(), "pdf", "title21.pdf")

In [None]:
# Gemini model identifiers used by the requests in this notebook.
model_flash = "gemini-2.5-flash-preview-04-17"
model_pro = "gemini-2.5-pro-exp-03-25"

# Thinking configuration with a zero thinking budget.
# This was previously a chained assignment (`no_thinking = thinking_config = ...`)
# that also bound a stray `thinking_config` global; only `no_thinking` is intended.
no_thinking = types.ThinkingConfig(thinking_budget=0)

In [None]:
# System prompt stored in the cache next to the document, so requests that
# reference the cache do not have to resend it.
system_instruction = """
You are a legal assistant. You will be provided with a legal document.

You will be asked questions about the structure of the document.
How is the document structured hierarchically? Example header names for levels in this hierarchy are: "Title", "Chapter", "Section", "Article".
What are header names within the document?

Note that often sections are not labeled with the word "Section" but can be identified by their numbering, e.g. "21-1-102."
"""

client = genai.Client(api_key=api_key)

# Upload the statute PDF; the returned file handle becomes the cached content.
gem_doc = client.files.upload(display_name=pdf_path.name, file=pdf_path)

# Create the cache (document + system prompt) for the flash model.
cache = client.caches.create(
    config=types.CreateCachedContentConfig(
        contents=[gem_doc],
        system_instruction=system_instruction,
    ),
    model=model_flash,
)

In [5]:
# Display the cached-content object returned by client.caches.create.
cache

CachedContent(name='cachedContents/av3jvs7ze9re', display_name='', model='models/gemini-2.5-flash-preview-04-17', create_time=datetime.datetime(2025, 4, 25, 15, 33, 4, 51801, tzinfo=TzInfo(UTC)), update_time=datetime.datetime(2025, 4, 25, 15, 33, 4, 51801, tzinfo=TzInfo(UTC)), expire_time=datetime.datetime(2025, 4, 25, 16, 31, 56, 22155, tzinfo=TzInfo(UTC)), usage_metadata=CachedContentUsageMetadata(audio_duration_seconds=None, image_count=None, text_count=None, total_token_count=146358, video_duration_seconds=None))

In [8]:
# Request settings: answer against the cached document and constrain the
# reply to a JSON list of strings.
config = types.GenerateContentConfig(
    response_schema=list[str],
    response_mime_type="application/json",
    temperature=0.2,
    cached_content=cache.name,
)

# Ask for the hierarchy level names, ordered from highest to lowest.
prompt = """
Please provide a list of the header types in the document.

This list should be like:
['Title', 'Chapter', 'Article']
or
['Title', 'Article', 'Part']
where the first element is the highest level of the hierarchy and the last element is the lowest level.

Use this schema for the response:
Return: list[str]
"""

response = client.models.generate_content(
    contents=prompt,
    config=config,
    model=model_flash,
)

print(response.text)

[
  "Title",
  "Chapter",
  "Article",
  "Section"
]


In [21]:
# Take the parsed response and capitalize every level name
# (str.capitalize also lower-cases the remaining characters).
header_types = [level_name.capitalize() for level_name in response.parsed]

header_types

['Title', 'Chapter', 'Article', 'Section']

In [22]:
# Enum of header levels, built at runtime from every detected level except the
# last (lowest) one. With the StrEnum functional API each member's value is
# its lower-cased name, e.g. "Title" -> "title".
HeaderType = StrEnum("HeaderType", header_types[:-1])

# One table-of-contents entry; passed as list[Header] for the response schema
# of the table-of-contents request.
class Header(BaseModel):
    # Level of this header within the hierarchy.
    type: HeaderType
    # Header label, e.g. "TITLE 1" in the prompt example.
    text: str
    # Descriptive text accompanying the label, e.g. "GENERAL PROVISIONS";
    # the prompt example uses an empty string when there is none.
    sub_text: str 
    # Page number on which the header appears.
    page: int

In [35]:
# Full state name looked up from the two-letter abbreviation.
state_name = ABBREV_TO_US_STATE[state]

# The prompt is an f-string preamble followed by a plain-string JSON example;
# the example is kept out of the f-string because its braces are literal.
prompt = (
    f"""
This document is a legal document containing a portion of laws of {state_name}.

It is structured hierarchically with header types: {header_types}.

Please make a table of contents for the document including every header and its page number.
"""
    + """The output should be a JSON array of objects with schema like:
[
    {
        "type": "title",
        "text": "TITLE 1",
        "sub_text": "GENERAL PROVISIONS",
        "page": 1
    },
    {
        "type": "article",
        "text": "ARTICLE 1",
        "sub_text": "Application of procedures act",
        "page": 2
    },
    {
        "type": "part",
        "text": "PART 1",
        "sub_text": "",
        "page": 3
    }
]
"""
)

# Same cached document as before, but the reply is constrained to a JSON
# array of Header objects.
config = types.GenerateContentConfig(
    response_schema=list[Header],
    response_mime_type="application/json",
    temperature=0.2,
    cached_content=cache.name,
)

response = client.models.generate_content(
    contents=prompt,
    config=config,
    model=model_flash,
)

print(response.text)

[
  {
    "type": "title",
    "text": "TITLE 21",
    "sub_text": "EDUCATION",
    "page": 1
  },
  {
    "type": "chapter",
    "text": "CHAPTER 1",
    "sub_text": "GENERAL PROVISIONS AND DEFINITIONS",
    "page": 1
  },
  {
    "type": "article",
    "text": "ARTICLE 1",
    "sub_text": "GENERAL PROVISIONS",
    "page": 1
  },
  {
    "type": "chapter",
    "text": "CHAPTER 2",
    "sub_text": "THE ADMINISTRATION OF THE STATE SYSTEM OF EDUCATION AT THE STATE LEVEL",
    "page": 1
  },
  {
    "type": "article",
    "text": "ARTICLE 1",
    "sub_text": "GENERAL PROVISIONS",
    "page": 1
  },
  {
    "type": "article",
    "text": "ARTICLE 2",
    "sub_text": "SUPERINTENDENT OF PUBLIC INSTRUCTION AND DEPARTMENT OF EDUCATION",
    "page": 2
  },
  {
    "type": "article",
    "text": "ARTICLE 3",
    "sub_text": "STATE BOARD OF EDUCATION",
    "page": 28
  },
  {
    "type": "article",
    "text": "ARTICLE 4",
    "sub_text": "PRIVATE SCHOOL LICENSING",
    "page": 37
  },
  {
    "t