# Document Parse as a Tool w/ LLMs and Vector Database

In [1]:
import os
import json
import inspect
import requests
from io import BytesIO

import PyPDF2
from openai import OpenAI
from pydantic import BaseModel

from dotenv import load_dotenv

In [None]:
# Run this cell only if you are on macOS: it prepends /opt/homebrew/bin/ to
# PATH so executables in that directory are found first.

import os
os.environ["PATH"] = "/opt/homebrew/bin/:" + os.environ["PATH"]

## Setting up the API Keys

Once `load_dotenv()` is called successfully, it returns `True`, which is printed below. At this point, all the variables from the `.env` file are loaded as environment variables, so you can access them with the `os.getenv()` function.

In [2]:
# Load the variables defined in the local .env file into the environment.
load_dotenv()

True

Basically, we are going to need the following two API keys:
- `SERPER_API_KEY`: API key for Google Search API service from [Serper.dev](https://serper.dev/)
- `UPSTAGE_API_KEY`: API key for accessing [Upstage](https://www.upstage.ai/)'s Solar models.

In [3]:
# Read the API keys from the environment; os.getenv() yields None for a
# variable that is not set.
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
UPSTAGE_API_KEY = os.getenv("UPSTAGE_API_KEY")

## Defining Helper Functions

Here, we are defining two helper functions necessary for tool(function) calling:
- `function_to_schema()`: Converts a Python function into a JSON schema format that can be used for function calling with LLMs. Think of it as a structured string that is injected into the LLM's context so that the LLM knows which functions are available to call
- `execute_tool_call()`: Executes the function call based on the LLM's response and returns the result

In [4]:
def function_to_schema(func) -> dict:
    """
    Converts a Python function into a JSON schema format for LLM function calling.

    Args:
        func: The Python function to convert to schema

    Returns:
        dict: JSON schema describing the function's interface

    Raises:
        ValueError: If ``inspect.signature`` cannot produce a signature for ``func``

    The schema includes:
    - Function name
    - Description from docstring
    - Parameters with their types (missing or unmapped annotations become "string")
    - Required parameters list (parameters that have no default value)
    """
    # Map Python types to JSON schema types
    type_map = {
        str: "string",
        int: "integer",
        float: "number",
        bool: "boolean",
        list: "array",
        dict: "object",
        type(None): "null",
    }

    try:
        # Get function signature using inspect
        signature = inspect.signature(func)
    except ValueError as e:
        raise ValueError(
            f"Failed to get signature for function {func.__name__}: {str(e)}"
        ) from e

    # Build the parameters dictionary and the required list in a single pass
    parameters = {}
    required = []
    for param in signature.parameters.values():
        try:
            # Default to "string" when the annotation is absent or not in the map
            param_type = type_map.get(param.annotation, "string")
        except TypeError:
            # An unhashable annotation object cannot be used as a dict key;
            # treat it like any other unmapped annotation.
            param_type = "string"
        parameters[param.name] = {"type": param_type}

        # Identity check against the public sentinel: "==" would invoke the
        # default value's own __eq__, which may return anything.
        if param.default is inspect.Parameter.empty:
            required.append(param.name)

    # Return complete schema
    return {
        "type": "function",
        "function": {
            "name": func.__name__,
            "description": (func.__doc__ or "").strip(),
            "parameters": {
                "type": "object",
                "properties": parameters,
                "required": required,
            },
        },
    }
    
def execute_tool_call(tool_call, tools, agent_name):
    """
    Executes a function call based on the LLM's response.

    Args:
        tool_call: Object containing function call details from LLM
            (reads ``tool_call.function.name`` and ``tool_call.function.arguments``)
        tools: Dictionary mapping function names to actual functions
        agent_name: Name of the agent making the call, for logging

    Returns:
        The result of executing the specified function with given arguments

    Raises:
        KeyError: If the requested function name is not present in ``tools``
        json.JSONDecodeError: If the argument payload is not valid JSON

    This function:
    1. Extracts function name and arguments from tool_call
    2. Logs the function call
    3. Executes the function with provided arguments
    """
    # Extract function name and parse arguments from JSON
    name = tool_call.function.name
    raw_args = tool_call.function.arguments
    # An empty or missing payload means "no arguments"; json.loads("") would raise
    args = json.loads(raw_args) if raw_args else {}

    # Log the function call
    print(f"{agent_name}:", f"{name}({args})")

    try:
        tool = tools[name]
    except KeyError:
        # Same exception type as a plain lookup, but say what was available
        raise KeyError(
            f"Unknown tool {name!r}; available tools: {sorted(tools)}"
        ) from None

    # Execute the function with unpacked arguments
    return tool(**args)

In [None]:
def truncate_tokens_if_needed(tokenizer, messages, content, max_token_limit=32000, system_prompt=None):
    """
    Truncate the content if prompt tokens plus content tokens exceed the limit.

    Args:
        tokenizer: The tokenizer to use for encoding/decoding; must provide
            ``apply_chat_template``, ``encode`` and ``decode``
        messages: List of message dictionaries for the conversation
        content: The markdown content to potentially truncate
        max_token_limit: Maximum token limit (default: 32000)
        system_prompt: System message text counted as part of the base
            conversation. When None, ``supervisor_agent.instructions`` from the
            enclosing notebook scope is used.

    Returns:
        truncated_content: The potentially truncated content
        base_token_numbers: Number of tokens in the base conversation
        paper_token_numbers: Number of tokens in the paper after potential truncation
    """
    if system_prompt is None:
        # Backward-compatible default: fall back to the notebook-level agent
        system_prompt = supervisor_agent.instructions

    # len() of the chat-template result is used as the prompt's token count
    inputs = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": system_prompt}
        ] + messages
    )
    base_token_numbers = len(inputs)
    encoded_content = tokenizer.encode(content)
    paper_token_numbers = len(encoded_content)
    total_token_numbers = base_token_numbers + paper_token_numbers

    print(f"Base token numbers: {base_token_numbers}")
    print(f"Paper token numbers: {paper_token_numbers}")
    print(f"Total token numbers: {total_token_numbers}")

    if total_token_numbers > max_token_limit:
        # Budget left for the paper. Clamp at zero: a negative slice bound
        # would drop tokens from the END and keep far more than allowed.
        tokens_to_keep = max(0, max_token_limit - base_token_numbers)
        # Truncate the encoded content
        encoded_content = encoded_content[:tokens_to_keep]
        # Update the paper token count
        paper_token_numbers = len(encoded_content)
        # Rebuild the text by decoding the truncated tokens
        truncated_content = tokenizer.decode(encoded_content, skip_special_tokens=True)
        print(f"Truncated paper tokens to: {paper_token_numbers}")
    else:
        print("No truncation needed")
        truncated_content = content

    # Reported again because the paper token count may have changed above
    print(f"Total token numbers: {base_token_numbers + paper_token_numbers}")
    return truncated_content, base_token_numbers, paper_token_numbers

## Filesystem based Caching

In [37]:
import json
import shutil
import requests
from PyPDF2 import PdfReader, PdfWriter

from transformers import AutoTokenizer

# Tokenizer handed to truncate_tokens_if_needed() to count and trim tokens.
tokenizer = AutoTokenizer.from_pretrained("upstage/solar-pro-preview-instruct")
# Stand-in user message passed to truncate_tokens_if_needed() so that the
# prompt part of the conversation is included in the token count.
message_template = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper below, \n"
    },
]

def to_paper_search_agent(paper_title: str):
    """Use this to search for paper URL on arXiv only when paper URL is not found yet."""
    # NOTE: the docstring above is sent to the LLM as the tool description
    # (see function_to_schema), so it is runtime text and is left unchanged.
    #
    # Sends a "<title> on arXiv" query to the Serper search endpoint and turns
    # the first organic hit into an arXiv PDF link. Returns a human-readable
    # string in every case; raises requests.HTTPError on a non-2xx response.
    not_found_message = "Could not find the URL to download the paper"
    url = "https://google.serper.dev/search"

    payload = json.dumps({"q": f"{paper_title} on arXiv"})
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    # A timeout keeps the agent loop from hanging on an unresponsive service
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    # Surface auth/quota problems instead of reporting them as "not found"
    response.raise_for_status()
    search_results = response.json().get('organic', [])

    if not search_results:
        return not_found_message

    link = search_results[0].get('link', '')
    # The trailing slash prevents look-alike hosts such as arxiv.org.example.com
    if not link.startswith("https://arxiv.org/"):
        return not_found_message

    # Rewrite only the "/abs/" path segment; a bare replace('abs', 'pdf')
    # would also alter any other "abs" occurring in the link.
    pdf_link = link.replace('/abs/', '/pdf/', 1)
    return f"URL to download '{paper_title}': {pdf_link}"

def split_pdf_by_pages(input_pdf_path, root_path, pages_per_pdf=10):
    """
    Split one PDF into consecutive smaller PDFs written under ``root_path``.

    Args:
        input_pdf_path: Path of the PDF to split
        root_path: Directory that receives the pieces, named ``1.pdf``, ``2.pdf``, ...
        pages_per_pdf: Maximum number of pages per piece (default: 10)

    Returns:
        list: Paths of the written pieces, in page order
    """
    reader = PdfReader(input_pdf_path)
    page_count = len(reader.pages)

    # Ceiling division: the last piece may hold fewer pages than the others
    piece_count = (page_count + pages_per_pdf - 1) // pages_per_pdf

    written_paths = []
    for piece_index in range(piece_count):
        # Half-open page range [first_page, stop_page) covered by this piece
        first_page = piece_index * pages_per_pdf
        stop_page = min(first_page + pages_per_pdf, page_count)

        piece_writer = PdfWriter()
        for page_index in range(first_page, stop_page):
            piece_writer.add_page(reader.pages[page_index])

        # Pieces are numbered starting from 1
        piece_path = f"{root_path}/{piece_index+1}.pdf"
        with open(piece_path, "wb") as piece_file:
            piece_writer.write(piece_file)
        written_paths.append(piece_path)

    return written_paths

def get_document_parse_response(filename, api_key):
    """
    Upload one file to the Upstage Document Parse endpoint and return its reply.

    Args:
        filename: Path of the document to upload
        api_key: API key sent as a Bearer token

    Returns:
        The JSON-decoded response body
    """
    url = "https://api.upstage.ai/v1/document-ai/document-parse"

    headers = {"Authorization": f"Bearer {api_key}"}
    data = {"output_formats": "['markdown']"}

    # The context manager closes the file handle once the upload is done;
    # previously the handle was opened inline and never closed.
    with open(filename, "rb") as document:
        response = requests.post(
            url, headers=headers, files={"document": document}, data=data
        )

    upstage_response = json.loads(response.text)
    return upstage_response

def get_md_with_document_parse(root_path, paper_url):
    """
    Download a paper, parse it page by page, and return the combined markdown.

    Each page's raw response is also written to ``response_<n>.json`` under
    ``root_path`` so that it can be reloaded later without calling the API.

    Args:
        root_path: Existing directory used for the PDF, its pieces and the JSON files
        paper_url: URL of the PDF to download

    Returns:
        str: Markdown of all pages that returned markdown, in page order

    Raises:
        requests.HTTPError: If the download returns a non-2xx status
    """
    response = requests.get(paper_url)
    # Fail here rather than saving an HTTP error page as "paper.pdf"
    response.raise_for_status()

    pdf_path = f"{root_path}/paper.pdf"
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    # One page per piece, i.e. one Document Parse request per page
    split_factor = 1
    split_pdfs = split_pdf_by_pages(pdf_path, root_path, split_factor)

    markdown_parts = []
    for i, split_pdf in enumerate(split_pdfs):
        upstage_response = get_document_parse_response(split_pdf, UPSTAGE_API_KEY)

        # Write the response to a JSON file for persistence
        json_output_path = f"{root_path}/response_{i+1}.json"
        with open(json_output_path, "w") as json_file:
            json.dump(upstage_response, json_file, indent=2)

        try:
            markdown_parts.append(upstage_response['content']['markdown'])
        except KeyError:
            # Still best-effort, but no longer silent about the missing page
            print(f"Warning: no markdown in the response for {split_pdf}; skipping it")

    return "".join(markdown_parts)

def get_md_from_fs(root_path):
    """
    Rebuild a paper's markdown from the JSON responses stored in ``root_path``.

    Files named ``<anything>_<number>.json`` are read in ascending numeric
    order, so ``response_2.json`` comes before ``response_10.json``. Any other
    ``.json`` file is read afterwards, in name order.

    Args:
        root_path: Directory containing the stored JSON responses

    Returns:
        str: Concatenated markdown; files without ``content.markdown`` are skipped
    """
    def page_order(file_name):
        # Sort key: numbered files first by their number, the rest by name
        stem = file_name[:-len(".json")]
        number = stem.rpartition("_")[2]
        if number.isdecimal():
            return (0, int(number), file_name)
        return (1, 0, file_name)

    # os.listdir() gives no ordering guarantee, so order the pages explicitly
    json_files = sorted(
        (name for name in os.listdir(root_path) if name.endswith(".json")),
        key=page_order,
    )

    markdown_parts = []
    for file_name in json_files:
        with open(os.path.join(root_path, file_name), "r") as f:
            upstage_response = json.load(f)
        try:
            markdown_parts.append(upstage_response['content']['markdown'])
        except (KeyError, TypeError):
            # A stored response without markdown (e.g. an error payload) is
            # skipped instead of aborting the whole read.
            continue
    return "".join(markdown_parts)

def to_download_and_parse_paper_agent(paper_url: str):
    """Use this to download and parse paper only when paper URL is found."""
    # NOTE: the docstring above is sent to the LLM as the tool description
    # (see function_to_schema), so it is runtime text and is left unchanged.
    #
    # Returns the paper's markdown, prefixed with a header and truncated to the
    # token budget. The last URL segment is used both as the paper id and as
    # the name of the cache directory (relative to the working directory).
    paper_id = paper_url.rstrip("/").split("/")[-1]  # tolerate a trailing slash
    root_path = paper_id

    markdown = ""
    if os.path.exists(root_path):
        markdown = get_md_from_fs(root_path)

    if markdown:
        print(f"Found cached markdown for {paper_id}")
    else:
        # Also taken when the directory exists but holds no markdown, e.g.
        # after an earlier attempt failed part-way; otherwise an empty
        # directory would be served as a "cached" empty paper forever.
        print(f"No cached markdown found for {paper_id}, parsing from URL")
        os.makedirs(root_path, exist_ok=True)
        markdown = get_md_with_document_parse(root_path, paper_url)

    markdown = "Retrieved Paper Content\n-----------------------------------\n" + markdown
    markdown, _, _ = truncate_tokens_if_needed(tokenizer, message_template, markdown)
    return markdown

In [38]:
class Agent(BaseModel):
    """Settings for one LLM agent: display name, model id, system prompt and tools."""
    # Label printed in front of each logged tool call
    name: str = "Agent"
    # Model identifier passed to the chat-completions request
    model: str = "solar-pro"
    # Text sent as the system message of every request
    instructions: str = "You are a helpful Agent"
    # Python callables offered to the model as tools
    tools: list = []

# OpenAI client pointed at the Upstage endpoint and authenticated with the
# Upstage key.
client = OpenAI(
    base_url="https://api.upstage.ai/v1",
    api_key=UPSTAGE_API_KEY
)

# Supervisor for the filesystem-cache flow: it can search for a paper URL and
# download/parse the paper. The adjacent string literals below are joined into
# a single system prompt.
supervisor_agent = Agent(
    name="Supervisor Agent",
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent]
)

In [39]:
def run(client, messages, supervisor_agent):
    """
    Drive the tool-calling loop until the model answers without a tool call.

    Args:
        client: Chat-completions client (``client.chat.completions.create`` is called)
        messages: Conversation so far; assistant tool-call messages and tool
            results are appended to this list in place
        supervisor_agent: Agent supplying the model name, system prompt and tools

    Returns:
        None. The final answer is printed between separator lines.
    """
    # Tool metadata does not change between turns, so build it once
    tool_schemas = [function_to_schema(tool) for tool in supervisor_agent.tools]
    tools = {tool.__name__: tool for tool in supervisor_agent.tools}
    system_message = {"role": "system", "content": supervisor_agent.instructions}

    # Send the tool parameters only when there are tools: the OpenAI
    # chat-completions API rejects tool_choice without tools.
    tool_kwargs = {"tools": tool_schemas, "tool_choice": "auto"} if tool_schemas else {}

    # Loop through the conversation steps
    while True:
        # Get model response
        response = client.chat.completions.create(
            model=supervisor_agent.model,
            messages=[system_message] + messages,
            **tool_kwargs,
        )
        message = response.choices[0].message

        if not message.tool_calls:
            print("--------------------------------")
            print(message.content)
            print("--------------------------------")
            break  # escape the loop when there is no need for tool(function) call anymore

        print(message.tool_calls)

        # Add model response to messages
        messages.append(message)

        # Add one tool response per requested call, linked by its id
        for tool_call in message.tool_calls:
            tool_response = execute_tool_call(tool_call, tools, supervisor_agent.name)

            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": tool_response
            })

In [42]:
# Fresh conversation containing a single user request that names the paper.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]    

# Start the tool-calling loop; run() appends the assistant tool-call messages
# and the tool results to `messages` in place.
run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='10c7b285-2f29-41a1-9714-b7ab7f9ca486', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='1cc34b5e-0822-4e7d-8991-cf0fba0aa44e', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})
No cached markdown found for 2502.00299, parsing from URL


Token indices sequence length is longer than the specified maximum sequence length for this model (57243 > 4096). Running this sequence through the model will result in indexing errors


Base token numbers: 115
Paper token numbers: 57243
Total token numbers: 57358
Truncated paper tokens to: 31885
Total token numbers: 32000
--------------------------------
The assistant should now reach the point in the conversation where it can provide a final answer based on the generated output of the function call.
--------------------------------


In [46]:
# Same request with a fresh conversation; in the output recorded below, the
# paper is found in the cache directory this time.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]    

# run() appends the assistant tool-call messages and tool results in place.
run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='3f6464ec-120f-41a0-8822-1a6db6c7ae82', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='a17098c5-3cbb-4393-88f0-a7908540bbb6', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})
Found cached markdown for 2502.00299
Base token numbers: 115
Paper token numbers: 57243
Total token numbers: 57358
Truncated paper tokens to: 31885
Total token numbers: 32000
--------------------------------
ChunkKV is a simple yet effective KV cache compression met

In [47]:
# Display the conversation list as left by the previous run() call.
messages

[{'role': 'user',
  'content': "Provide a comprehensive summary of the paper, 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference' on arXiv. "},
 ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='3f6464ec-120f-41a0-8822-1a6db6c7ae82', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]),
 {'role': 'tool',
  'tool_call_id': '3f6464ec-120f-41a0-8822-1a6db6c7ae82',
  'content': "URL to download 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference': https://arxiv.org/pdf/2502.00299"},
 ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='a17098c5-3cbb-4393-88f0-a7908540bbb6', function=Func

## Vector Database based Caching

In [None]:
!pip install chromadb

In [None]:
import json
import requests
import chromadb
import numpy as np
from PyPDF2 import PdfReader, PdfWriter
from chromadb import Documents, EmbeddingFunction, Embeddings

# Chroma client configured with the on-disk path ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# Chunk size, in characters, used to slice the markdown before it is embedded
embedding_context_length = 4000

class UpstageEmbeddingFunction(EmbeddingFunction[Documents]):
    """Chroma embedding function that delegates to a client's embeddings endpoint."""

    def __init__(
        self,
        client,
        model_name: str = "embedding-query",
    ):
        # client: object exposing ``embeddings.create(model=..., input=...)``
        # model_name: embedding model requested on every call
        self.model_name = model_name
        self.client = client

    def __call__(self, input: Documents) -> Embeddings:
        # Only plain strings are accepted; reject the batch on the first non-string
        for document in input:
            if not isinstance(document, str):
                raise ValueError("Solar embedding only supports text documents, not images")

        # One request for the whole batch
        response = self.client.embeddings.create(model=self.model_name, input=input)
        vectors = []
        for record in response.data:
            vectors.append(record.embedding)
        return np.array(vectors, dtype=np.float32)

# Embedding function shared by collection creation and lookup; it reuses the
# chat client and the class's default model name.
embedding_fn = UpstageEmbeddingFunction(client)

def get_md_with_document_parse(root_path, paper_url, paper_id):
    """
    Download a paper, parse it page by page, and store its text in Chroma.

    Each page's raw response is written to ``response_<n>.json`` under
    ``root_path``. The combined markdown is cut into chunks of
    ``embedding_context_length`` characters and stored in a collection named
    after ``paper_id``.

    Args:
        root_path: Existing directory used for the PDF, its pieces and the JSON files
        paper_url: URL of the PDF to download
        paper_id: Collection name and prefix of the chunk ids

    Returns:
        The Chroma collection holding the chunks (not the markdown itself)

    Raises:
        requests.HTTPError: If the download returns a non-2xx status
    """
    response = requests.get(paper_url)
    # Fail here rather than saving an HTTP error page as "paper.pdf"
    response.raise_for_status()

    pdf_path = f"{root_path}/paper.pdf"
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    # One page per piece, i.e. one Document Parse request per page
    split_factor = 1
    split_pdfs = split_pdf_by_pages(pdf_path, root_path, split_factor)

    markdown_parts = []
    for i, split_pdf in enumerate(split_pdfs):
        upstage_response = get_document_parse_response(split_pdf, UPSTAGE_API_KEY)

        # Write the response to a JSON file for persistence
        json_output_path = f"{root_path}/response_{i+1}.json"
        with open(json_output_path, "w") as json_file:
            json.dump(upstage_response, json_file, indent=2)

        try:
            markdown_parts.append(upstage_response['content']['markdown'])
        except KeyError:
            # Still best-effort, but no longer silent about the missing page
            print(f"Warning: no markdown in the response for {split_pdf}; skipping it")

    markdown = "".join(markdown_parts)

    # get_or_create + upsert: parsing the same paper again must not fail
    # because the collection or its chunk ids already exist.
    collection = chroma_client.get_or_create_collection(
        name=paper_id, embedding_function=embedding_fn
    )

    # Fixed-size character chunks; an empty markdown still yields one
    # (empty) document, as before.
    chunks = [
        markdown[start:start + embedding_context_length]
        for start in range(0, len(markdown), embedding_context_length)
    ] or [markdown]
    ids = [f"{paper_id}_{i}" for i in range(len(chunks))]

    collection.upsert(documents=chunks, ids=ids)
    return collection

def to_download_and_parse_paper_agent(paper_url: str):
    """Use this to download and parse paper only when paper URL is found."""
    # NOTE: the docstring above is sent to the LLM as the tool description
    # (see function_to_schema), so it is runtime text and is left unchanged.
    #
    # Makes sure the paper is stored in the vector database and returns a
    # status sentence containing the id under which it can be queried.
    paper_id = paper_url.rstrip("/").split("/")[-1]  # tolerate a trailing slash
    root_path = paper_id

    # Decide "cached" from the vector store itself: the directory may exist
    # (e.g. left by the filesystem-cache flow) without any collection behind it.
    # list_collections() yields Collection objects or plain names depending on
    # the chromadb version; getattr covers both.
    existing_collections = {
        getattr(entry, "name", entry) for entry in chroma_client.list_collections()
    }

    if paper_id in existing_collections:
        print(f"Found cached markdown for {paper_id}")
        return f"we already have the paper content stored in our database in the id of {paper_id}"

    print(f"No cached markdown found for {paper_id}, parsing from URL")
    os.makedirs(root_path, exist_ok=True)
    get_md_with_document_parse(root_path, paper_url, paper_id)
    return f"we have parsed the paper content and stored in our database in the id of {paper_id}"
    
def to_retrive_paper_content_to_answer_question_agent(question: str, paper_id: str):
    """Use this to answer question about the paper."""
    # NOTE: the docstring above is sent to the LLM as the tool description
    # (see function_to_schema), so it is runtime text and is left unchanged.
    #
    # Looks up the chunks most similar to `question` in the collection named
    # `paper_id` and returns them as numbered lines under a header.
    collection = chroma_client.get_collection(name=paper_id, embedding_function=embedding_fn)
    results = collection.query(query_texts=[question], n_results=10)

    # query() returns one list of documents per query text. Exactly one query
    # was sent, so the retrieved chunks are the first (only) inner list.
    documents_per_query = results['documents']
    matched_chunks = documents_per_query[0] if documents_per_query else []

    lines = ["Retrieved Paper Content\n-----------------------------------\n"]
    lines.extend(f"{i}: {chunk}" for i, chunk in enumerate(matched_chunks))
    return "\n".join(lines)


In [89]:
# Supervisor for the vector-database flow: same prompt as before, plus a third
# tool that retrieves stored paper content for a question.
supervisor_agent = Agent(
    name="Supervisor Agent",
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent, to_retrive_paper_content_to_answer_question_agent]
)

In [94]:
# Fresh conversation for the vector-database flow with the same user request.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]    

# run() appends the assistant tool-call messages and tool results in place.
run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='4b81dad7-1328-4d99-9505-e5e40a64bb77', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='3a37b7a0-4a91-4f09-833a-61e30f68fd67', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})
Found cached markdown for 2502.00299
[ChatCompletionMessageToolCall(id='5703eec3-13bc-48fc-88db-e88418a7e166', function=Function(arguments='{"paper_id":"2502.00299","question":"Provide a comprehensive summary of the paper."}', name='to_retrive_paper_content_to_answe