# Chapter extraction

In [None]:
import re
import json
import fitz  
# Lower-cased TOC titles that are front matter and never count as chapters.
FRONT_MATTER_STOPWORDS = {
    "contents", "table of contents", "copyright", "title page",
    "about the author", "dedication", "preface", "foreword", "acknowledgements",
}

def is_probable_chapter(title: str) -> bool:
    """Heuristically decide whether a TOC title looks like a chapter heading.

    The title is stripped and lower-cased, then checked in this order:

    1. An exact match in ``FRONT_MATTER_STOPWORDS`` -> ``False``.
    2. A "chapter <number>" prefix or a Roman-numeral prefix ending in a
       period ("iv.") -> ``True``.
    3. A leading section number ("1", "1.2", "12.3.4") -> ``True``.
    4. Otherwise the title counts only if it has at least 6 characters.
    """
    t = title.strip().lower()
    if t in FRONT_MATTER_STOPWORDS:
        return False

    # The word boundary applies to the "chapter N" form only. After a period
    # it would demand a following word character, so "iv. title" (period
    # followed by a space) could never match.
    if re.match(r"^(?:chapter\s+\d+\b|[ivxlcdm]+\.)", t):
        return True
    if re.match(r"^\d+(\.\d+)*\b", t):  # "1", "1.2", "12.3.4"
        return True

    # Fallback: any reasonably long title is treated as a chapter.
    return len(t) >= 6

def load_toc(doc: fitz.Document):
    """Return the document outline as a list of ``(level, title, page)`` tuples.

    Rows whose page is ``None`` or below 1 (missing/external links) are
    dropped, and every title is stripped of surrounding whitespace. Page
    numbers stay 1-based, exactly as ``get_toc`` reports them.
    """
    # get_toc(simple=True) yields [level, title, page] rows.
    return [
        (depth, heading.strip(), page_no)
        for depth, heading, page_no in doc.get_toc(simple=True)
        if page_no is not None and page_no >= 1
    ]

def derive_chapter_ranges(doc: fitz.Document):
    """
    From full TOC -> list of canonical chapter entries with [start,end] page indices (0-based).
    Rule: chapter ends at the page before the next TOC item whose level <= this level.

    The chapter level is the TOC level that holds the most chapter-like
    titles (per ``is_probable_chapter``); ties go to the shallower level, and
    level 1 is used when no title looks like a chapter.

    Returns a list of dicts with keys ``level``, ``title``, ``start_page`` and
    ``end_page`` (both 0-based, inclusive). The list is empty when the
    document has no usable TOC.
    """
    toc = load_toc(doc)
    if not toc:
        return []

    level_counts = {}
    for level, title, _ in toc:
        if is_probable_chapter(title):
            level_counts[level] = level_counts.get(level, 0) + 1
    chapter_level = min(level_counts, key=lambda k: (-level_counts[k], k)) if level_counts else 1

    last_page = doc.page_count - 1
    chapters = []
    for i, (lvl, title, page1_based) in enumerate(toc):
        if lvl != chapter_level or not is_probable_chapter(title):
            continue

        start = page1_based - 1  # load_toc only keeps pages >= 1
        if start > last_page:
            continue  # TOC entry points past the end of the document

        # Scan the *full* TOC, not a level-filtered copy, so that a shallower
        # entry (e.g. "Part II" or "Index") also terminates the chapter, as
        # the rule in the docstring requires.
        next_page_1b = next(
            (nxt_page for nxt_lvl, _, nxt_page in toc[i + 1:] if nxt_lvl <= chapter_level),
            None,
        )
        if next_page_1b is None:
            end_0based = last_page  # last chapter runs to the end of the document
        else:
            # max(start, ...) keeps a chapter that shares its first page with
            # the next entry (it would otherwise get end < start and vanish).
            # min(last_page, ...) keeps the range inside the document.
            end_0based = min(last_page, max(start, next_page_1b - 2))

        chapters.append({
            "level": lvl,
            "title": title,
            "start_page": start,
            "end_page": end_0based
        })

    return chapters

def extract_text_range(doc: fitz.Document, start_page: int, end_page: int) -> str:
    """Return the text of pages ``start_page``..``end_page`` (0-based, inclusive).

    Each page is extracted with ``get_text("text", sort=True)``. The page
    texts are joined with newlines and the combined string is stripped. An
    empty range (``end_page < start_page``) yields an empty string.
    """
    page_numbers = range(start_page, end_page + 1)
    combined = "\n".join(doc[number].get_text("text", sort=True) for number in page_numbers)
    return combined.strip()

# # ---- Example usage ----
# if __name__ == "__main__":
#     path = r"C:\Users\abiju\Desktop\Project-Velcro\REference_textbook\Charlie and the Chocolate Factory (Roald Dahl).pdf"
#     doc = fitz.open(path)
#     chapters = derive_chapter_ranges(doc)

#     for idx, ch in enumerate(chapters[:5]):
#         print(f"[{idx}] {ch['title']}  -> pages {ch['start_page']+1}–{ch['end_page']+1}")


# First pass JSON structure

```jsonc
{
  "chapter": [{
    "chapter_id": 0,                // Chapter number
    "title": "",                    // Chapter title
    "pages": [0, 0],                // [start_page, end_page]
    "summary_local": "",            // <= 160 words
    "characters": [
      {
        "name": "",                 // Character name
        "aliases": [],
        "status": "",               // Character status
        "chapter_role": "",         // Role in this chapter
        "character_actions": "",
        "relationships": [
          {
            "with_name": "",        // Other character's name
            "type": "",             // Relationship type
            "justification": ""     // <= 30 words
          }
        ]
      }
    ]
  } 
  ]
}
```


In [24]:
def make_chapter_skeletons(chapters):
    """
    Build the empty first-pass JSON structure, one entry per chapter.

    chapters: list of dicts with keys title, start_page, end_page

    Returns ``{"chapter": [...]}``. Chapter ids are assigned 1, 2, 3, ... in
    input order, and every entry starts with a blank summary and a single
    blank character placeholder.
    """
    def _blank_character():
        # Built fresh on every call so chapters never share nested lists/dicts.
        return {
            "name": "",
            "aliases": [],
            "status": "",
            "chapter_role": "",
            "character_actions": "",
            "relationships": [
                {"with_name": "", "type": "", "justification": ""}
            ],
        }

    skeletons = [
        {
            "chapter_id": number,
            "title": entry["title"],
            "pages": [entry["start_page"], entry["end_page"]],
            "summary_local": "",
            "characters": [_blank_character()],
        }
        for number, entry in enumerate(chapters, start=1)
    ]
    return {"chapter": skeletons}

# skeleton = make_chapter_skeletons(chapters)
# print(json.dumps(skeleton, indent=2))


In [None]:
from typing import List
from pydantic import BaseModel, Field
from openai import OpenAI
from openai import AsyncOpenAI, RateLimitError, APIError, APIConnectionError, InternalServerError
import asyncio

# Module-level async OpenAI client.
# NOTE(review): process_all_chapters_async() and main() each construct their
# own AsyncOpenAI instance, so this global looks unused by the active code in
# this notebook - confirm before relying on it.
client = AsyncOpenAI()

# One relationship of a character, as extracted from a single chapter
# (first pass). Nested inside Character.relationships.
# Documented with comments rather than a class docstring because pydantic
# copies a model docstring into the JSON schema description.
class Relationship(BaseModel):
    with_name: str = Field(..., description="Other character's name")
    type: str = Field(..., description="Relationship type (e.g., ally | mentor | antagonist | family | rival | colleague | unknown)")
    justification: str = Field(..., description="<= 50 words")

# One character as seen within a single chapter (first pass).
# Every field is required; the Field descriptions are part of the schema
# passed as text_format to the model call, so treat them as prompt text.
class Character(BaseModel):
    name: str = Field(..., description="Name of the character")
    aliases: List[str] = Field(..., description="List of aliases for the character")
    status: str = Field(..., description="e.g: active | missing | dead | resolved | tentative")
    chapter_role: str = Field(..., description="e.g: POV | supporting | antagonist | cameo | unknown")
    character_actions: str = Field(..., description="Key actions or events involving the character in this chapter. Be descriptive and use short hand (<= 100 words)")
    relationships: List[Relationship]

# Structured output of the first pass for one chapter: a local summary plus
# the characters found in that chapter. Used as text_format by
# fill_chapter_with_model_async().
class ChapterFill(BaseModel):
    summary_local: str = Field(..., description="summary of the chapter in <= 160 words")
    characters: List[Character]

# System prompt for the first-pass, per-chapter extraction
# (see fill_chapter_with_model_async). Every fragment ends with "\n" so the
# numbered criteria start on their own lines; the first fragment used to run
# straight into "1." without a separator.
chapter_fill_prompt = (
    "You are a library assistant who is skilled at extracting structured information from story book chapters. You are given the text of a chapter and must fill in the structured data fields, such that it meets the following criteria:\n"
    "1. The summary_local field must contain a concise summary of the chapter in the context of the data provided. Even if you are aware about the story you are dealing with, do not add additional information that can potentially spoil the future chapters, limited to 160 words.\n"
    "2. For the characters entry, it is a list of Character objects, each with the following fields:\n"
    "   - name: Name of the character\n"
    "   - aliases: List of aliases for the character. Do not include generic pronouns such as 'he', 'she', or 'they' or role words such as teacher, guard , protagonist , antagonist etc\n"
    "   - status: e.g: active | missing | dead | resolved | tentative\n"
    "   - chapter_role: e.g: POV | supporting | antagonist | cameo | unknown\n"
    "   - character_actions: Key actions or events involving the character in this chapter (<= 100 words)\n"
    "   - relationships: List of Relationship objects\n"
    "3. For the relationships entry, it is a list of Relationship objects, each with the following fields:\n"
    "   - with_name: Other character's name\n"
    "   - type: Relationship type (e.g., ally | mentor | antagonist | family | rival | colleague | unknown)\n"
    "   - justification: an explanation in the context of the chapter why the relationship exists ( <= 50 words)\n"
)

# def fill_chapter_with_model(chapter_text: str) -> ChapterFill:
#     system = chapter_fill_prompt
#     resp = client.responses.parse(
#         model="gpt-5-mini",
#         input=[
#             {"role": "system", "content": system},
#             {"role": "user", "content": chapter_text},
#         ],
#         text_format=ChapterFill,
#     )
#     return resp.output_parsed

async def fill_chapter_with_model_async(client: AsyncOpenAI, chapter_text: str, *, semaphore: asyncio.Semaphore,
                                        max_retries: int = 5) -> ChapterFill:
    """Extract a ``ChapterFill`` from one chapter's text, retrying on API errors.

    Args:
        client: Async OpenAI client used for the request.
        chapter_text: Full text of the chapter, sent as the user message.
        semaphore: Limits how many requests are in flight at once.
        max_retries: Total number of attempts; values below 1 are treated
            as 1 so the function never falls through returning ``None``.

    Returns:
        The parsed model output (``resp.output_parsed``).

    Raises:
        RateLimitError, APIError, APIConnectionError, InternalServerError:
            re-raised unchanged once the final attempt has failed.
    """
    # Local import: nothing else in the notebook imports `random`, so the
    # jitter below used to raise NameError on the first retry.
    import random

    attempts = max(1, max_retries)
    backoff = 1
    for attempt in range(attempts):
        try:
            # Hold the semaphore only for the request itself, so a sleeping
            # retry does not occupy a concurrency slot.
            async with semaphore:
                resp = await client.responses.parse(
                    model="gpt-5-mini",
                    input=[
                        {"role": "system", "content": chapter_fill_prompt},
                        {"role": "user", "content": chapter_text},
                    ],
                    text_format=ChapterFill,
                )
            return resp.output_parsed
        except (RateLimitError, APIError, APIConnectionError, InternalServerError):
            if attempt == attempts - 1:
                raise
            # Exponential backoff (capped at 30s) plus up to 1s of jitter.
            await asyncio.sleep(backoff + random.random())
            backoff = min(backoff * 2, 30)


In [35]:
chapters

[{'level': 2,
  'title': '1 Here Comes Charlie',
  'start_page': 11,
  'end_page': 17},
 {'level': 2,
  'title': "2 Mr Willy Wonka's Factory",
  'start_page': 18,
  'end_page': 21},
 {'level': 2,
  'title': '3 Mr Wonka and the Indian Prince',
  'start_page': 22,
  'end_page': 24},
 {'level': 2,
  'title': '4 The Secret Workers',
  'start_page': 25,
  'end_page': 28},
 {'level': 2,
  'title': '5 The Golden Tickets',
  'start_page': 29,
  'end_page': 30},
 {'level': 2,
  'title': '6 The First Two Finders',
  'start_page': 31,
  'end_page': 35},
 {'level': 2,
  'title': "7 Charlie's Birthday",
  'start_page': 36,
  'end_page': 38},
 {'level': 2,
  'title': '8 Two More Golden Tickets Found',
  'start_page': 39,
  'end_page': 42},
 {'level': 2,
  'title': '9 Grandpa Joe Takes a Gamble',
  'start_page': 43,
  'end_page': 44},
 {'level': 2,
  'title': '10 The Family Begins to Starve',
  'start_page': 45,
  'end_page': 50},
 {'level': 2, 'title': '11 The Miracle', 'start_page': 51, 'end_page':

In [None]:
# def process_all_chapters(doc: fitz.Document, chapters: list) -> dict:
#     """
#     Processes all chapters, fills them with AI-generated data, and returns the final JSON object.
#     """
#     final_data = make_chapter_skeletons(chapters)
    
#     for i, chapter_info in enumerate(final_data["chapter"]):
#         print(f"Processing Chapter {chapter_info['chapter_id']}: '{chapter_info['title']}'...")
        
#         start_page, end_page = chapter_info["pages"]
#         chapter_text = extract_text_range(doc, start_page, end_page)
        
#         if not chapter_text.strip():
#             print(f"  -> No text found for chapter {chapter_info['chapter_id']}. Skipping.")
#             continue

#         # Get structured data from the AI model
#         filled_data = fill_chapter_with_model(chapter_text)
        
#         # Update the final JSON structure
#         final_data["chapter"][i]["summary_local"] = filled_data.summary_local
#         # Convert Pydantic character models to dictionaries for JSON serialization
#         final_data["chapter"][i]["characters"] = [char.model_dump() for char in filled_data.characters]

#     return final_data


async def process_all_chapters_async(doc: fitz.Document, chapters: list, max_concurrency: int = 4) -> dict:
    """Run the first pass over every chapter concurrently.

    Args:
        doc: Open PDF document the chapter page ranges refer to.
        chapters: Chapter dicts with ``title``, ``start_page`` and
            ``end_page`` keys (as produced by ``derive_chapter_ranges``).
        max_concurrency: Maximum number of model requests in flight.

    Returns:
        The skeleton from ``make_chapter_skeletons`` with ``summary_local``
        and ``characters`` filled in for every chapter that succeeded.
        Chapters with no extractable text, or whose request failed, keep
        their skeleton defaults.
    """
    final_data = make_chapter_skeletons(chapters)
    skeletons = final_data["chapter"]  # skeletons[i] corresponds to chapters[i]

    aclient = AsyncOpenAI()
    sem = asyncio.Semaphore(max_concurrency)

    tasks = []
    index_map = []  # index_map[k] is the chapter index that tasks[k] belongs to
    for i, ch in enumerate(chapters):
        text = extract_text_range(doc, ch["start_page"], ch["end_page"])
        if not text.strip():
            continue
        # The id lives on the skeleton; the raw chapter dicts carry no
        # "chapter_id" key, so reading it from them raised KeyError.
        print(f"Processing Chapter {skeletons[i]['chapter_id']}...")
        tasks.append(asyncio.create_task(
            fill_chapter_with_model_async(aclient, text, semaphore=sem)
        ))
        index_map.append(i)

    # return_exceptions=True: one failed chapter must not cancel the rest.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    for i, result in zip(index_map, results):
        if isinstance(result, Exception):
            # Leave defaults for this chapter (or you could add an "error" field)
            print(f"Chapter {skeletons[i]['chapter_id']} failed: {result}")
            continue

        skeletons[i]["summary_local"] = result.summary_local
        skeletons[i]["characters"] = [c.model_dump() for c in result.characters]

    return final_data

#### Look into parallel processing for chapter extraction; it was taking too long. ✅ (Done: runtime reduced from 20 minutes to 4 minutes)

In [22]:
# pdf_path = r"C:\\Users\\abiju\\Desktop\\Project-Velcro\\REference_textbook\\Charlie and the Chocolate Factory (Roald Dahl).pdf"
# doc = fitz.open(pdf_path)

# chapters = derive_chapter_ranges(doc)
# if not chapters:
#     print("No chapters were derived from the PDF's table of contents.")
# else:
#     print("\n--- PROCESSING ALL CHAPTERS ---")
#     # final_output = process_all_chapters(doc, chapters)
#     final_output = asyncio.run(process_all_chapters_async(doc, chapters, max_concurrency=4))
#     output_filename = "final_story_output.json"
#     with open(output_filename, "w") as f:
#         json.dump(final_output, f, indent=2)
        
#     print(f"\nProcessing complete. The final JSON object has been saved to '{output_filename}'")


--- PROCESSING ALL CHAPTERS ---


RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
pdf_path = r"C:\Users\abiju\Desktop\Project-Velcro\REference_textbook\Charlie and the Chocolate Factory (Roald Dahl).pdf"
doc = fitz.open(pdf_path)

chapters = derive_chapter_ranges(doc)
if not chapters:
    print("No chapters were derived from the PDF's table of contents.")
else:
    print("\n--- PROCESSING ALL CHAPTERS ---")
    final_output = await process_all_chapters_async(doc, chapters, max_concurrency=4)

    output_filename = "final_story_output.json"
    with open(output_filename, "w") as f:
        json.dump(final_output, f, indent=2)

    print(f"\nProcessing complete. The final JSON object has been saved to '{output_filename}'")

doc.close()



--- PROCESSING ALL CHAPTERS ---

Processing complete. The final JSON object has been saved to 'final_story_output.json'


# Second pass: now we connect the chapters

```jsonc
{
  "chapter": [{
    "chapter_id": 0,                // Chapter number
    "title": "",                    // Chapter title
    "pages": [0, 0],                // [start_page, end_page]
    "summary_local": "",            // <= 160 words 
    "summary_global": "",           // concatenated summary across chapters ≤ 250 words
    "characters": [
      {
        "name": "",                 // Character name
        "aliases": [],              // Concatenated aliases across all the chapters
        "status": "",               // Update the character status as the story and relationships evolve
        "chapter_role": "",         // Current role in the chapter in the context of the ongoing story 
        "character_actions": "",  // Key actions or events involving the character as the story is progressing (<= 150 words)
        "relationships": [
          {
            "with_name": "",        // Other character's name
            "type": "",             // Updated Relationship type as the story progresses and the relationships evolve
            "justification": "",    // Updated in the context of the story <= 100 words
            "importance": 0         // Updated importance level (0-5)
          }
        ]
      }
    ]
  } 
  ]
}
```


In [39]:
from typing import List, Dict, Any

# Second-pass relationship: same fields as the first-pass Relationship plus
# an importance score. _normalize_and_validate_global() later clamps
# importance to 0..5 and truncates justification to 100 words.
class RelationshipGlobal(BaseModel):
    with_name: str
    type: str
    justification: str = Field(..., description="<= 100 words")
    importance: int = Field(..., description="0-5 scale")

# Second-pass character record describing the character in the context of
# the story so far.
# NOTE(review): the description below allows <= 200 words, while the
# second-pass JSON sketch in this notebook says <= 150 - confirm which limit
# is intended.
class CharacterGlobal(BaseModel):
    name: str
    aliases: List[str]
    status: str
    chapter_role: str
    character_actions: str = Field(..., description="Key actions or events involving the character in the ongoing story. Be descriptive and use short hand <= 200 words")
    relationships: List[RelationshipGlobal]

# Structured output of the second pass for one chapter: the running story
# summary plus the character list. Used as text_format by
# create_global_view().
class ChapterGlobal(BaseModel):
    summary_global: str = Field(..., description="summary of the ongoing story <= 250 words")
    characters: List[CharacterGlobal]

In [40]:
# def merge_characters_locally(existing_chars: Dict[str, dict], new_chars: List[dict]) -> Dict[str, dict]:

#     for char in new_chars:
#         name = char['name']
        
#         matched_name = None
#         for alias in char.get('aliases', []):
#             for existing_name in existing_chars:
#                 if alias == existing_name or alias in existing_chars[existing_name].get('aliases', []):
#                     matched_name = existing_name
#                     break
#             if matched_name:
#                 break
        
#         key = matched_name or name
        
#         if key in existing_chars:
#             existing_aliases = set(existing_chars[key].get('aliases', []))
#             new_aliases = set(char.get('aliases', []))
#             existing_chars[key]['aliases'] = list(existing_aliases | new_aliases)
            
#             existing_chars[key]['status'] = char['status']
#             existing_chars[key]['chapter_role'] = char['chapter_role']
#             existing_chars[key]['character_actions'] = char['character_actions']
#             existing_chars[key]['relationships'] = char['relationships']
#         else:
#             # Add new character
#             existing_chars[key] = char.copy()
    
#     return existing_chars

In [41]:
# System prompt for the second pass; sent by create_global_view() together
# with a JSON context as the user message.
# NOTE(review): the prompt says the model is "given the text of a chapter",
# but create_global_view() sends summaries and character lists rather than
# raw chapter text - confirm whether the wording should be updated.
chapter_fill_global = """
You are a library assistant who is skilled at extracting structured information from story book chapters. You are given the text of a chapter and must fill in the structured data fields, such that it meets the following criteria:
1. The summary_global field must contain a concise summary of the ongoing story in the context of the previous chapter summary and current chapter summary provided. Even if you are aware about the story you are dealing with, do not add additional information that can potentially spoil the future chapters, limited to 250 words.
2. The characters field must include all relevant characters introduced or developed in the chapter, along with their updated attributes. This includes :
   - Name: Name of the character
   - Aliases: List of character aliases
   - Status: Current status of the character in the context of the ongoing story with reference to previous chapters and current chapter
   - Chapter role: Role of the character in the chapter in the context of the ongoing story
   - Character actions: Key actions or events involving the character in the ongoing story
   - Relationships: List of relationships with other characters
3. For the relationships field, it is a list of Relationship objects, each with the following fields:
    - with_name: Other character's name
    - type: Relationship type (e.g., ally | mentor | antagonist | family | rival | colleague | unknown)
    - justification: an explanation in the context of the chapter why the relationship exists ( <= 100 words)
    - importance: Importance of the relationship on a scale of 0-5, where 0 is negligible and 5 is critical to the story. Update the importance level as the story progresses.
"""


In [42]:
async def create_global_view(
    client: AsyncOpenAI,
    previous_summary: str,
    previous_characters: dict,   # dict[str, dict] — your cumulative state
    current_chapter: dict        # from first pass
) -> ChapterGlobal:
    """
    Ask the LLM for a merged, story-so-far view of one chapter.

    The running summary, every character known so far, and the current
    chapter's first-pass title, summary and characters are serialized to JSON
    and sent as the user message. The reply is parsed into a ChapterGlobal
    through the 'responses.parse' API, like the first pass.
    """
    known_characters = list(previous_characters.values())
    payload = json.dumps(
        {
            "previous_story_summary": previous_summary,
            "all_known_characters_so_far": known_characters,
            "current_chapter_title": current_chapter["title"],
            "current_chapter_summary": current_chapter["summary_local"],
            "characters_in_current_chapter": current_chapter["characters"],
        },
        indent=2,
    )
    messages = [
        {"role": "system", "content": chapter_fill_global},
        {"role": "user", "content": payload},
    ]

    parsed_response = await client.responses.parse(
        model="gpt-5-mini",
        input=messages,
        text_format=ChapterGlobal,
    )
    # output_parsed is the ChapterGlobal instance built from the reply.
    return parsed_response.output_parsed

In [None]:
def _normalize_and_validate_global(ch: ChapterGlobal) -> ChapterGlobal:
    """
    Guardrail applied to the LLM's second-pass output, mutating it in place.

    For every character: aliases are stripped, blanks removed and duplicates
    dropped case-insensitively (first spelling wins). For every relationship:
    importance is clamped to 0..5 and a justification longer than 100 words
    is cut to its first 100 words followed by "...".
    Returns the same object; a falsy input or one without characters is
    returned untouched.
    """
    if ch and ch.characters:
        for character in ch.characters:
            # A dict keyed by the lower-cased alias keeps the first spelling
            # and preserves insertion order.
            first_spelling = {}
            for raw_alias in character.aliases or []:
                cleaned = (raw_alias or "").strip()
                if cleaned:
                    first_spelling.setdefault(cleaned.lower(), cleaned)
            character.aliases = list(first_spelling.values())

            for rel in character.relationships or []:  # Fixing edge case hallucination
                rel.importance = min(5, max(0, int(rel.importance or 0)))
                words = rel.justification.split() if rel.justification else []
                if len(words) > 100:
                    rel.justification = " ".join(words[:100]) + "..."
    return ch


In [None]:
async def second_pass_processing(client: AsyncOpenAI, first_pass_data: dict) -> dict:
    """Walk the chapters in order and build the story-so-far view for each.

    Args:
        client: Async OpenAI client passed through to ``create_global_view``.
        first_pass_data: First-pass output, ``{"chapter": [...]}``.

    Returns:
        ``{"chapter": [...]}`` where every entry repeats the first-pass
        ``chapter_id``, ``title``, ``pages`` and ``summary_local`` and adds
        ``summary_global`` plus the character list as of that chapter.

    Chapters are processed sequentially because each call consumes the
    summary and characters produced by the previous one.
    """
    chapters = first_pass_data["chapter"]
    final_output = {"chapter": []}

    # Cumulative state carried from one chapter to the next.
    cumulative_summary = ""
    cumulative_characters = {}   # {canonical_name: character_dict}

    for i, chapter_data in enumerate(chapters, start=1):
        print(f"Second Pass - Processing chapter {i}/{len(chapters)}: {chapter_data['title']}")

        # 1. Get the intelligently merged view from the LLM
        global_view = await create_global_view(
            client=client,
            previous_summary=cumulative_summary,
            previous_characters=cumulative_characters,
            current_chapter=chapter_data
        )

        # 2. Apply guardrails: normalize and validate the LLM's output
        validated_global_view = _normalize_and_validate_global(global_view)

        # 3. Update the cumulative state using the *validated* data
        cumulative_summary = validated_global_view.summary_global

        # The character state is rebuilt from scratch from the model's reply.
        # (An alias index used to be built here as well, but nothing ever
        # read it, so it has been removed.)
        # NOTE(review): a character the model leaves out of its reply is
        # dropped from the state passed to later chapters - confirm that this
        # reset, rather than a merge, is intended.
        cumulative_characters = {
            char_model.name: char_model.model_dump()
            for char_model in validated_global_view.characters
        }

        # 4. Append the state *as of this chapter* to the final output
        final_output["chapter"].append({
            "chapter_id": chapter_data["chapter_id"],
            "title": chapter_data["title"],
            "pages": chapter_data["pages"],
            "summary_local": chapter_data["summary_local"],
            "summary_global": cumulative_summary,
            "characters": list(cumulative_characters.values()),
        })

    return final_output

In [46]:
async def main():
    """Run the second pass: load first-pass JSON, process it, save the result.

    Reads 'final_story_output.json', runs second_pass_processing() on it with
    a fresh AsyncOpenAI client, and writes 'story_global_view.json'. A missing
    input file and any other exception are reported with print(); nothing is
    re-raised.
    """
    try:
        with open("final_story_output.json", "r") as source_file:
            first_pass = json.load(source_file)

        openai_client = AsyncOpenAI()

        print("\n--- STARTING SECOND PASS PROCESSING ---")
        global_view = await second_pass_processing(openai_client, first_pass)

        output_filename = "story_global_view.json"
        with open(output_filename, "w") as target_file:
            json.dump(global_view, target_file, indent=2)

        print(f"\nSecond pass complete! Saved to '{output_filename}'")

    except FileNotFoundError:
        print("Error: 'final_story_output.json' not found. Please run the first pass script first.")
    except Exception as exc:
        print(f"An error occurred: {exc}")

# Top-level `await` works only where an event loop is already running (a
# notebook cell); run as a plain script this line is a syntax error and
# would need asyncio.run(main()) instead.
if __name__ == "__main__":
    await main()


--- STARTING SECOND PASS PROCESSING ---
Second Pass - Processing chapter 1/30: 1 Here Comes Charlie
Second Pass - Processing chapter 2/30: 2 Mr Willy Wonka's Factory
Second Pass - Processing chapter 3/30: 3 Mr Wonka and the Indian Prince
Second Pass - Processing chapter 4/30: 4 The Secret Workers
Second Pass - Processing chapter 5/30: 5 The Golden Tickets
Second Pass - Processing chapter 6/30: 6 The First Two Finders
Second Pass - Processing chapter 7/30: 7 Charlie's Birthday
Second Pass - Processing chapter 8/30: 8 Two More Golden Tickets Found
Second Pass - Processing chapter 9/30: 9 Grandpa Joe Takes a Gamble
Second Pass - Processing chapter 10/30: 10 The Family Begins to Starve
Second Pass - Processing chapter 11/30: 11 The Miracle
Second Pass - Processing chapter 12/30: 12 What It Said on the Golden Ticket
Second Pass - Processing chapter 13/30: 13 The Big Day Arrives
Second Pass - Processing chapter 14/30: 14 Mr Willy Wonka
Second Pass - Processing chapter 15/30: 15 The Chocolat

## TODO: Make a more efficient second pass: Fool around with sliding window

## Building RAG agent which will combine the JSON output we generated and use semantic search

#### This is going to be a chat interface between the user and the RAG agent. The RAG agent will take user queries, along with metadata about the page / chapter the user is currently on, and use that information to generate queries against the underlying document store. It should be capable of generating sub-queries to get more granular information from the document store and then stitching together an appropriate response.

In [52]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
import getpass
import os

# Load the book PDF into LangChain documents for the RAG index.
# NOTE(review): this points at Chocolate.pdf, not the PDF used for the
# chapter-extraction passes above - confirm both are the same book/edition so
# page numbers line up.
file_path = r"C:\Users\abiju\Desktop\Project-Velcro\REference_textbook\Chocolate.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()

# Number of loaded documents (presumably one per PDF page - confirm).
print(len(docs))

56


In [53]:
# Split the loaded documents into overlapping chunks for embedding:
# chunk_size=1000 with chunk_overlap=200; add_start_index=True is requested
# so each chunk records where it starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

# Notebook display of the resulting chunk count.
len(all_splits)

245

In [54]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorize both the chunks and the search queries.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [56]:
# In-memory vector store: the index lives only in this process and must be
# rebuilt (re-embedding every chunk) after a kernel restart.
vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(documents=all_splits)

In [57]:
# Smoke test of retrieval: run one similarity search and print every hit
# with its rank, separated by a row of asterisks.
results = vector_store.similarity_search(
    "What did the Oompa Loompas sing when Gloop fell into the river?"
)

# enumerate() replaces the hand-maintained counter.
for i, r in enumerate(results):
    print(i)
    print(r)
    print("*******************************************************************")

0
page_content='I'm so sorry. Good-bye, Mrs Gloop! And Mr Gloop! Good-bye! I'll see you later . . .' 
  As Mr and Mrs Gloop and their tiny escort  hurried away, the five Oompa-Loompas on the 
far side of the river suddenly began hopping and dancing about and beating wildly upon a 
number of very small drums. 'Augustus Gloop!' they chanted. 'Augustus Gloop! Augustus Gloop! 
Augustus Gloop!' 
'Grandpa!' cried Charlie. 'Listen to them, Grandpa! What are they doing?' 
  'Ssshh!' whispered Grandpa Joe. 'I th ink they're going to sing us a song!' 
  
'Augustus Gloop!' chanted the Oompa-Loompas. 
'Augustus Gloop! Augustus Gloop! 
The great big greedy nincompoop! 
How long could we allow this beast 
To gorge and guzzle, feed and feast 
On everything he wanted to? 
Great Scott! It simply wouldn't do! 
However long this pig might live, 
We're positive he'd never give 
Even the smallest bit of fun 
Or happiness to anyone. 
So what we do in cases such 
As this, we use the gentle touch,' metadata={

## Building the Agent


In [None]:
import os, getpass
from langchain_openai import ChatOpenAI
# Chat model used by the agent.
# NOTE(review): confirm that this model accepts an explicit temperature=0.
llm = ChatOpenAI(model="gpt-5-mini", temperature=0)
from typing import List
from typing_extensions import TypedDict
from pydantic import BaseModel, Field
from langgraph.graph import MessagesState

# Graph state for the RAG agent: extends MessagesState with the reader's
# position and the story context object.
# NOTE(review): Dict and Any are not imported in this cell; they come from the
# earlier `from typing import List, Dict, Any` cell, which must run first.
class UserSearch(MessagesState):
    user_page_number: int # The current page the user is on
    context_object : Dict[str, Any] # The context object which contains the chapter information and relationships

def get_context(state: UserSearch) -> Dict[str, Any]:
    """Return the story context object stored in the agent state.

    LangGraph's ``MessagesState`` is a ``TypedDict``, so the state is a plain
    dict at runtime and must be read by key; attribute access
    (``state.context_object``) raises ``AttributeError``.

    Raises:
        KeyError: if the state has no ``"context_object"`` entry.
    """
    return state["context_object"]
