In [2]:
### code from openAI notebook with different test and use case
import requests
from io import BytesIO
from pypdf import PdfReader
import re
import tiktoken
from nltk.tokenize import sent_tokenize
import nltk
from typing import List, Dict, Any

# Download nltk data if not already present
nltk.download('punkt_tab')

def load_document(url: str) -> str:
    """Load a document from a URL and return its text content."""
    print(f"Downloading document from {url}...")
    response = requests.get(url)
    response.raise_for_status()
    pdf_bytes = BytesIO(response.content)
    pdf_reader = PdfReader(pdf_bytes)
    
    full_text = ""
    

    max_page = 920  # Page cutoff before section 1000 (Interferences)
    for i, page in enumerate(pdf_reader.pages):
        if i >= max_page:
            break
        full_text += page.extract_text() + "\n"
    
    # Count words and tokens
    word_count = len(re.findall(r'\b\w+\b', full_text))
    
    tokenizer = tiktoken.get_encoding("o200k_base")
    token_count = len(tokenizer.encode(full_text))
    
    print(f"Document loaded: {len(pdf_reader.pages)} pages, {word_count} words, {token_count} tokens")
    return full_text



def read_document(url: str) -> str:
    """Load a document from a URL and return its text content."""
    print(f"read document from {url}...")
    #response = requests.get(url)
    #response.raise_for_status()
    #pdf_bytes = BytesIO(response.content)
    pdf_reader = PdfReader(url)
    
    full_text = ""
    

    max_page = 920  # Page cutoff before section 1000 (Interferences)
    for i, page in enumerate(pdf_reader.pages):
        if i >= max_page:
            break
        full_text += page.extract_text() + "\n"
    
    # Count words and tokens
    word_count = len(re.findall(r'\b\w+\b', full_text))
    
    tokenizer = tiktoken.get_encoding("o200k_base")
    token_count = len(tokenizer.encode(full_text))
    
    print(f"Document loaded: {len(pdf_reader.pages)} pages, {word_count} words, {token_count} tokens")
    return full_text



# Load the document
doc_dir= "10.36401_jipo-24-30.pdf"
#tbmp_url="hhtps://.pdf"
#document_text = load_document(tbmp_url)
document_text=read_document(doc_dir)
# Show the first 500 characters
print("\nDocument preview (first 500 chars):")
print("-" * 50)
print(document_text[:500])
print("-" * 50)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\difen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


read document from 10.36401_jipo-24-30.pdf...
Document loaded: 11 pages, 9020 words, 15725 tokens

Document preview (first 500 chars):
--------------------------------------------------
Review Article
4-1BB Antibodies in Oncology Clinical Trials:
AR e v i e w
Robin Schwartz,1 Keerti Vajrala,2 Gerald S. Falchook 1
1Sarah Cannon Research Institute at HealthONE, Denver, CO, USA
2Kansas City University College of Osteopathic Medicine, Kansas City, MO, USA
Address correspondence to Gerald Falchook (gerald.falchook@sarahcannon.com)
Sources of support: None. Conflicts of interest: Gerald S. Falchook reports research funding from Abbisko, ABL Bio, ADC
Therapeutics, Accutar, Agenus, Ail
--------------------------------------------------


In [20]:

def read_document(path: str, max_page: int = 920) -> str:
    """Load a document from a file path and return its text content up to max_page."""
    print(f"Reading document from {path}...")
    pdf_reader = PdfReader(path)

    full_text = ""
    for i, page in enumerate(pdf_reader.pages):
        if i >= max_page:
            break
        text = page.extract_text()
        if text:
            full_text += text + "\n"

    # Count words and tokens
    word_count = len(re.findall(r'\b\w+\b', full_text))
    tokenizer = tiktoken.get_encoding("o200k_base")
    token_count = len(tokenizer.encode(full_text))

    print(f"✅ {os.path.basename(path)}: {len(pdf_reader.pages)} pages, {word_count} words, {token_count} tokens")
    return full_text

def read_all_pdfs_in_folder(folder_path: str, max_page_per_doc: int = 920) -> str:
    """Read and combine all PDFs in a folder."""
    combined_text = ""
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".pdf"):
            full_path = os.path.join(folder_path, filename)
            text = read_document(full_path, max_page=max_page_per_doc)
            combined_text += f"\n--- Document: {filename} ---\n" + text
    return combined_text

# Usage
folder = "input_kg"
document_text = read_all_pdfs_in_folder(folder)

Reading document from input_kg\10.36401_jipo-24-30.pdf...
✅ 10.36401_jipo-24-30.pdf: 11 pages, 9020 words, 15725 tokens
Reading document from input_kg\KMAB_15_2167189.pdf...
✅ KMAB_15_2167189.pdf: 22 pages, 18508 words, 34903 tokens
Reading document from input_kg\PIIS2405803324001420.pdf...
✅ PIIS2405803324001420.pdf: 27 pages, 19759 words, 34448 tokens
Reading document from input_kg\s12276-023-01136-4.pdf...
✅ s12276-023-01136-4.pdf: 8 pages, 9038 words, 16070 tokens


In [21]:
# Global tokenizer name to use consistently throughout the code
TOKENIZER_NAME = "o200k_base"

def split_into_20_chunks(text: str, min_tokens: int = 500) -> List[Dict[str, Any]]:
    """
    Split text into up to 20 chunks, respecting sentence boundaries and ensuring
    each chunk has at least min_tokens (unless it's the last chunk).
    
    Args:
        text: The text to split
        min_tokens: The minimum number of tokens per chunk (default: 500)
    
    Returns:
        A list of dictionaries where each dictionary has:
        - id: The chunk ID (0-19)
        - text: The chunk text content
    """
    # First, split the text into sentences
    sentences = sent_tokenize(text)
    
    # Get tokenizer for counting tokens
    tokenizer = tiktoken.get_encoding(TOKENIZER_NAME)
    
    # Create chunks that respect sentence boundaries and minimum token count
    chunks = []
    current_chunk_sentences = []
    current_chunk_tokens = 0
    
    for sentence in sentences:
        # Count tokens in this sentence
        sentence_tokens = len(tokenizer.encode(sentence))
        
        # If adding this sentence would make the chunk too large AND we already have the minimum tokens,
        # finalize the current chunk and start a new one
        if (current_chunk_tokens + sentence_tokens > min_tokens * 2) and current_chunk_tokens >= min_tokens:
            chunk_text = " ".join(current_chunk_sentences)
            chunks.append({
                "id": len(chunks),  # Integer ID instead of string
                "text": chunk_text
            })
            current_chunk_sentences = [sentence]
            current_chunk_tokens = sentence_tokens
        else:
            # Add this sentence to the current chunk
            current_chunk_sentences.append(sentence)
            current_chunk_tokens += sentence_tokens
    
    # Add the last chunk if there's anything left
    if current_chunk_sentences:
        chunk_text = " ".join(current_chunk_sentences)
        chunks.append({
            "id": len(chunks),  # Integer ID instead of string
            "text": chunk_text
        })
    
    # If we have more than 20 chunks, consolidate them
    if len(chunks) > 20:
        # Recombine all text
        all_text = " ".join(chunk["text"] for chunk in chunks)
        # Re-split into exactly 20 chunks, without minimum token requirement
        sentences = sent_tokenize(all_text)
        sentences_per_chunk = len(sentences) // 20 + (1 if len(sentences) % 20 > 0 else 0)
        
        chunks = []
        for i in range(0, len(sentences), sentences_per_chunk):
            # Get the sentences for this chunk
            chunk_sentences = sentences[i:i+sentences_per_chunk]
            # Join the sentences into a single text
            chunk_text = " ".join(chunk_sentences)
            # Create a chunk object with ID and text
            chunks.append({
                "id": len(chunks),  # Integer ID instead of string
                "text": chunk_text
            })
    
    # Print chunk statistics
    print(f"Split document into {len(chunks)} chunks")
    for i, chunk in enumerate(chunks):
        token_count = len(tokenizer.encode(chunk["text"]))
        print(f"Chunk {i}: {token_count} tokens")
    
    return chunks

# Split the document into 20 chunks with minimum token size
document_chunks = split_into_20_chunks(document_text, min_tokens=500)

Split document into 20 chunks
Chunk 0: 9091 tokens
Chunk 1: 5314 tokens
Chunk 2: 10063 tokens
Chunk 3: 8784 tokens
Chunk 4: 4679 tokens
Chunk 5: 3441 tokens
Chunk 6: 3515 tokens
Chunk 7: 3092 tokens
Chunk 8: 3253 tokens
Chunk 9: 7089 tokens
Chunk 10: 8981 tokens
Chunk 11: 5546 tokens
Chunk 12: 2869 tokens
Chunk 13: 3750 tokens
Chunk 14: 3576 tokens
Chunk 15: 3694 tokens
Chunk 16: 7372 tokens
Chunk 17: 2851 tokens
Chunk 18: 2097 tokens
Chunk 19: 2611 tokens


In [22]:
from openai import OpenAI
import os
import json
from typing import List, Dict, Any
API_KEY = ''
os.environ['OPENAI_API_KEY']=API_KEY

# Initialize OpenAI client
client = OpenAI()

def route_chunks(question: str, chunks: List[Dict[str, Any]], 
                depth: int, scratchpad: str = "") -> Dict[str, Any]:
    """
    Ask the model which chunks contain information relevant to the question.
    Maintains a scratchpad for the model's reasoning.
    Uses structured output for chunk selection and required tool calls for scratchpad.
    
    Args:
        question: The user's question
        chunks: List of chunks to evaluate
        depth: Current depth in the navigation hierarchy
        scratchpad: Current scratchpad content
    
    Returns:
        Dictionary with selected IDs and updated scratchpad
    """
    print(f"\n==== ROUTING AT DEPTH {depth} ====")
    print(f"Evaluating {len(chunks)} chunks for relevance")
    
    # Build system message
    system_message = """You are an expert document navigator. Your task is to:
1. Identify which text chunks might contain information to answer the user's question
2. Record your reasoning in a scratchpad for later reference
3. Choose chunks that are most likely relevant. Be selective, but thorough. Choose as many chunks as you need to answer the question, but avoid selecting too many.

First think carefully about what information would help answer the question, then evaluate each chunk.
"""

    # Build user message with chunks and current scratchpad
    user_message = f"QUESTION: {question}\n\n"
    
    if scratchpad:
        user_message += f"CURRENT SCRATCHPAD:\n{scratchpad}\n\n"
    
    user_message += "TEXT CHUNKS:\n\n"
    
    # Add each chunk to the message
    for chunk in chunks:
        user_message += f"CHUNK {chunk['id']}:\n{chunk['text']}\n\n"
    
    # Define function schema for scratchpad tool calling
    tools = [
        {
            "type": "function",
            "name": "update_scratchpad",
            "description": "Record your reasoning about why certain chunks were selected",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "Your reasoning about the chunk(s) selection"
                    }
                },
                "required": ["text"],
                "additionalProperties": False
            }
        }
    ]
    
    # Define JSON schema for structured output (selected chunks)
    text_format = {
        "format": {
            "type": "json_schema",
            "name": "selected_chunks",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "chunk_ids": {
                        "type": "array",
                        "items": {"type": "integer"},
                        "description": "IDs of the selected chunks that contain information to answer the question"
                    }
                },
                "required": [
                    "chunk_ids"
                ],
                "additionalProperties": False
            }
        }
    }
    
    # First pass: Call the model to update scratchpad (required tool call)
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message + "\n\nFirst, you must use the update_scratchpad function to record your reasoning."}
    ]
    
    response = client.responses.create(
        model="gpt-4.1-mini",
        input=messages,
        tools=tools,
        tool_choice="required"
    )
    
    # Process the scratchpad tool call
    new_scratchpad = scratchpad
    
    for tool_call in response.output:
        if tool_call.type == "function_call" and tool_call.name == "update_scratchpad":
            args = json.loads(tool_call.arguments)
            scratchpad_entry = f"DEPTH {depth} REASONING:\n{args.get('text', '')}"
            if new_scratchpad:
                new_scratchpad += "\n\n" + scratchpad_entry
            else:
                new_scratchpad = scratchpad_entry
            
            # Add function call and result to messages
            messages.append(tool_call)
            messages.append({
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": "Scratchpad updated successfully."
            })
    
    # Second pass: Get structured output for chunk selection
    messages.append({"role": "user", "content": "Now, select the chunks that could contain information to answer the question. Return a JSON object with the list of chunk IDs."})
    
    response_chunks = client.responses.create(
        model="gpt-4.1-mini",
        input=messages,
        text=text_format
    )
    
    # Extract selected chunk IDs from structured output
    selected_ids = []
    if response_chunks.output_text:
        try:
            # The output_text should already be in JSON format due to the schema
            chunk_data = json.loads(response_chunks.output_text)
            selected_ids = chunk_data.get("chunk_ids", [])
        except json.JSONDecodeError:
            print("Warning: Could not parse structured output as JSON")
    
    # Display results
    print(f"Selected chunks: {', '.join(str(id) for id in selected_ids)}")
    print(f"Updated scratchpad:\n{new_scratchpad}")
    
    return {
        "selected_ids": selected_ids,
        "scratchpad": new_scratchpad
    }

In [23]:
def navigate_to_paragraphs(document_text: str, question: str, max_depth: int = 1) -> Dict[str, Any]:
    """
    Navigate through the document hierarchy to find relevant paragraphs.
    
    Args:
        document_text: The full document text
        question: The user's question
        max_depth: Maximum depth to navigate before returning paragraphs (default: 1)
    
    Returns:
        Dictionary with selected paragraphs and final scratchpad
    """
    scratchpad = ""
    
    # Get initial chunks with min 500 tokens
    chunks = split_into_20_chunks(document_text, min_tokens=500)
    
    # Navigator state - track chunk paths to maintain hierarchy
    chunk_paths = {}  # Maps numeric IDs to path strings for display
    for chunk in chunks:
        chunk_paths[chunk["id"]] = str(chunk["id"])
    
    # Navigate through levels until max_depth or until no chunks remain
    for current_depth in range(max_depth + 1):
        # Call router to get relevant chunks
        result = route_chunks(question, chunks, current_depth, scratchpad)
        
        # Update scratchpad
        scratchpad = result["scratchpad"]
        
        # Get selected chunks
        selected_ids = result["selected_ids"]
        selected_chunks = [c for c in chunks if c["id"] in selected_ids]
        
        # If no chunks were selected, return empty result
        if not selected_chunks:
            print("\nNo relevant chunks found.")
            return {"paragraphs": [], "scratchpad": scratchpad}
        
        # If we've reached max_depth, return the selected chunks
        if current_depth == max_depth:
            print(f"\nReturning {len(selected_chunks)} relevant chunks at depth {current_depth}")
            
            # Update display IDs to show hierarchy
            for chunk in selected_chunks:
                chunk["display_id"] = chunk_paths[chunk["id"]]
                
            return {"paragraphs": selected_chunks, "scratchpad": scratchpad}
        
        # Prepare next level by splitting selected chunks further
        next_level_chunks = []
        next_chunk_id = 0  # Counter for new chunks
        
        for chunk in selected_chunks:
            # Split this chunk into smaller pieces
            sub_chunks = split_into_20_chunks(chunk["text"], min_tokens=200)
            
            # Update IDs and maintain path mapping
            for sub_chunk in sub_chunks:
                path = f"{chunk_paths[chunk['id']]}.{sub_chunk['id']}"
                sub_chunk["id"] = next_chunk_id
                chunk_paths[next_chunk_id] = path
                next_level_chunks.append(sub_chunk)
                next_chunk_id += 1
        
        # Update chunks for next iteration
        chunks = next_level_chunks

In [24]:
# Run the navigation for a sample question
question = '''What is the issue for 41bb agonist as a drug for cancer immunotherpay and how to overcome it from the antibody 
epitope design perspective and bispecific design and format, list actionable suggestions'''

navigation_result = navigate_to_paragraphs(document_text, question, max_depth=2)

# Sample retrieved paragraph
print("\n==== FIRST 3 RETRIEVED PARAGRAPHS ====")
for i, paragraph in enumerate(navigation_result["paragraphs"][:3]):
    display_id = paragraph.get("display_id", str(paragraph["id"]))
    print(f"\nPARAGRAPH {i+1} (ID: {display_id}):")
    print("-" * 40)
    print(paragraph["text"])
    print("-" * 40)

Split document into 20 chunks
Chunk 0: 9091 tokens
Chunk 1: 5314 tokens
Chunk 2: 10063 tokens
Chunk 3: 8784 tokens
Chunk 4: 4679 tokens
Chunk 5: 3441 tokens
Chunk 6: 3515 tokens
Chunk 7: 3092 tokens
Chunk 8: 3253 tokens
Chunk 9: 7089 tokens
Chunk 10: 8981 tokens
Chunk 11: 5546 tokens
Chunk 12: 2869 tokens
Chunk 13: 3750 tokens
Chunk 14: 3576 tokens
Chunk 15: 3694 tokens
Chunk 16: 7372 tokens
Chunk 17: 2851 tokens
Chunk 18: 2097 tokens
Chunk 19: 2611 tokens

==== ROUTING AT DEPTH 0 ====
Evaluating 20 chunks for relevance
Selected chunks: 0, 1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
Updated scratchpad:
DEPTH 0 REASONING:
The user asks about the issues with 4-1BB agonists as drugs for cancer immunotherapy, and how to overcome those issues from antibody epitope design, bispecific design, and format perspectives, including actionable suggestions.

Chunks 0 and 1 contain detailed clinical trial data on 4-1BB antibodies like urelumab and utomilumab, noting major issues like hepatotoxicit

In [25]:
from typing import List, Dict, Any
from pydantic import BaseModel, field_validator

class LegalAnswer(BaseModel):
    """Structured response format for legal questions"""
    answer: str
    citations: List[str]
    
    @field_validator('citations')
    def validate_citations(cls, citations, info):
        # Access valid_citations from the model_config
        valid_citations = info.data.get('_valid_citations', [])
        if valid_citations:
            for citation in citations:
                if citation not in valid_citations:
                    raise ValueError(f"Invalid citation: {citation}. Must be one of: {valid_citations}")
        return citations

def generate_answer(question: str, paragraphs: List[Dict[str, Any]], 
                   scratchpad: str) -> LegalAnswer:
    """Generate an answer from the retrieved paragraphs."""
    print("\n==== GENERATING ANSWER ====")
    
    # Extract valid citation IDs
    valid_citations = [str(p.get("display_id", str(p["id"]))) for p in paragraphs]
    
    if not paragraphs:
        return LegalAnswer(
            answer="I couldn't find relevant information to answer this question in the document.",
            citations=[],
            _valid_citations=[]
        )
    
    # Prepare context for the model
    context = ""
    for paragraph in paragraphs:
        display_id = paragraph.get("display_id", str(paragraph["id"]))
        context += f"PARAGRAPH {display_id}:\n{paragraph['text']}\n\n"
    
    system_prompt = """You are a research scientist answering questions about the how to design better immune therapy for cancer patietns.

Answer questions based ONLY on the provided paragraphs. Do not rely on any foundation knowledge or external information or extrapolate from the paragraphs.
Cite phrases of the paragraphs that are relevant to the answer. This will help you be more specific and accurate.
Include citations to paragraph IDs for every statement in your answer. Valid citation IDs are: {valid_citations_str}
Keep your answer clear, precise, and professional.
"""
    valid_citations_str = ", ".join(valid_citations)
    
    # Call the model using structured output
    response = client.responses.parse(
        model="gpt-4.1",
        input=[
            {"role": "system", "content": system_prompt.format(valid_citations_str=valid_citations_str)},
            {"role": "user", "content": f"QUESTION: {question}\n\nSCRATCHPAD (Navigation reasoning):\n{scratchpad}\n\nPARAGRAPHS:\n{context}"}
        ],
        text_format=LegalAnswer,
        temperature=0.3
    )
    
    # Add validation information after parsing
    response.output_parsed._valid_citations = valid_citations
    
    print(f"\nAnswer: {response.output_parsed.answer}")
    print(f"Citations: {response.output_parsed.citations}")

    return response.output_parsed

# Generate an answer
answer = generate_answer(question, navigation_result["paragraphs"], 
                       navigation_result["scratchpad"])


==== GENERATING ANSWER ====

Answer: The principal issue with 4-1BB agonists as cancer immunotherapy drugs is dose-limiting hepatotoxicity, especially with first-generation antibodies like urelumab, which caused fatal hepatitis, and limited efficacy at safe doses, as seen with utomilumab. This is due to systemic immune activation driven by both the antibody's epitope (urelumab binds membrane-distal CRD1, leading to strong systemic activation) and FcγR crosslinking, particularly in the liver. Utomilumab, which binds CRD3/4 and overlaps the ligand site, blocks 4-1BBL and fails to induce sufficient receptor clustering, resulting in poor efficacy at tolerated doses (0.0.2.7.0, 0.0.2.9.0, 0.0.2.10.0, 0.0.16.15.0).

To overcome these issues, several antibody epitope and bispecific design strategies have been developed:

**Epitope Design Suggestions:**
- Avoid targeting the membrane-distal CRD1 region (as urelumab does), which is associated with superagonistic activity and hepatotoxicity. In

In [26]:
cited_paragraphs = []
for paragraph in navigation_result["paragraphs"]:
    para_id = str(paragraph.get("display_id", str(paragraph["id"])))
    if para_id in answer.citations:
        cited_paragraphs.append(paragraph)
    

# Display the cited paragraphs for the audience
print("\n==== CITED PARAGRAPHS ====")
for i, paragraph in enumerate(cited_paragraphs):
    display_id = paragraph.get("display_id", str(paragraph["id"]))
    print(f"\nPARAGRAPH {i+1} (ID: {display_id}):")
    print("-" * 40)
    print(paragraph["text"])
    print("-" * 40)


==== CITED PARAGRAPHS ====

PARAGRAPH 1 (ID: 0.0.2.7.0):
----------------------------------------
The first generation of 4–1BB agonistic antibodies entering the clinic, urelumab 
(BMS-663513) and utomilumab (PF-05082566), failed due to (liver) toxicity or lack of efficacy, respectively. The two antibodies display differences in the affinity and the 4–1BB receptor epitope recognition, as well 
as the isotype, which determines the Fc-gamma-receptor (FcγR) crosslinking activity. Based on this 
experience a very diverse landscape of second-generation 4–1BB agonists addressing the liabilities of 
first-generation agonists has recently been developed, with many entering clinical Phase 1 and 2 studies. This review provides an overview focusing on differences and their scientific rationale, as well as 
challenges foreseen during the clinical development of these molecules. ARTICLE HISTORY 
Received 21 July 2022  
Revised 05 January 2023  
Accepted 06 January 2023 
KEYWORDS 
CD137; 4-1BB; TNF

In [27]:
from typing import List, Dict, Any, Literal
from pydantic import BaseModel

class VerificationResult(BaseModel):
    """Verification result format"""
    is_accurate: bool
    explanation: str
    confidence: Literal["high", "medium", "low"]

def verify_answer(question: str, answer: LegalAnswer, 
                 cited_paragraphs: List[Dict[str, Any]]) -> VerificationResult:
    """
    Verify if the answer is grounded in the cited paragraphs.
    
    Args:
        question: The user's question
        answer: The generated answer
        cited_paragraphs: Paragraphs cited in the answer
        
    Returns:
        Verification result with accuracy assessment, explanation, and confidence level
    """
    print("\n==== VERIFYING ANSWER ====")
    
    # Prepare context with the cited paragraphs
    context = ""
    for paragraph in cited_paragraphs:
        display_id = paragraph.get("display_id", str(paragraph["id"]))
        context += f"PARAGRAPH {display_id}:\n{paragraph['text']}\n\n"
    
    # Prepare system prompt
    system_prompt = """You are a fact-checker for legal information.
Your job is to verify if the provided answer:
1. Is factually accurate according to the source paragraphs
2. Uses citations correctly

Be critical and look for any factual errors or unsupported claims.
Assign a confidence level based on how directly the paragraphs answer the question:
- high: The answer is comprehensive, accurate, and directly supported by the paragraphs
- medium: The answer is mostly accurate but may be incomplete or have minor issues
- low: The answer has significant gaps, inaccuracies, or is poorly supported by the paragraphs
"""
    
    response = client.responses.parse(
        model="o4-mini",
        input=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"""
QUESTION: {question}

ANSWER TO VERIFY:
{answer.answer}

CITATIONS USED: {', '.join(answer.citations)}

SOURCE PARAGRAPHS:
{context}

Is this answer accurate and properly supported by the source paragraphs?
Assign a confidence level (high, medium, or low) based on completeness and accuracy.
            """}
        ],
        text_format=VerificationResult
    )
    
    # Log and return the verification result
    print(f"\nAccuracy verification: {'PASSED' if response.output_parsed.is_accurate else 'FAILED'}")
    print(f"Confidence: {response.output_parsed.confidence}")
    print(f"Explanation: {response.output_parsed.explanation}")
    
    return response.output_parsed

# Verify the answer using only the cited paragraphs
verification = verify_answer(question, answer, cited_paragraphs)

# Display final result with verification
print("\n==== FINAL VERIFIED ANSWER ====")
print(f"Verification: {'PASSED' if verification.is_accurate else 'FAILED'} | Confidence: {verification.confidence}")
print("\nAnswer:")
print(answer.answer)
print("\nCitations:")
for citation in answer.citations:
    print(f"- {citation}")


==== VERIFYING ANSWER ====

Accuracy verification: PASSED
Confidence: medium
Explanation: The answer correctly identifies the hepatotoxicity and limited efficacy issues of first-generation 4-1BB agonists (urelumab and utomilumab) and links these to FcγR-mediated crosslinking and epitope differences, all of which are directly supported by the cited paragraphs (0.0.2.7.0, 0.0.2.9.0, 0.0.2.10.0, 0.0.2.10.1, 0.0.16.15.0). The proposed epitope designs (targeting membrane-proximal CRDs, tuning affinity) and Fc-engineering strategies (Fc-silent or enhanced FcγRIIB binding) are likewise grounded in the sources (0.0.16.16.0, 0.0.16.17.0, 0.0.16.18.0, 0.0.17.0.0). Bispecific and multispecific formats, site-restricted activation, binding-site ratios, and format engineering recommendations are all supported by the cited paragraphs (0.0.2.12.0; 0.0.3.3.0; 0.0.3.6.0; 0.0.9.6.0; 0.0.9.8.0; 0.0.9.9.0; 0.0.9.12.0; 0.0.9.14.0; 0.0.9.15.0). One minor overextension is the mention of protease-cleavable ma