In [45]:
import os
import json
import inspect
import requests
from io import BytesIO

import PyPDF2
from openai import OpenAI
from pydantic import BaseModel

from dotenv import load_dotenv

## Setting up the API Keys

In [46]:
# Load variables from a local .env file (if present) into the process
# environment so the os.getenv(...) lookups in the next cell can see them.
load_dotenv()

True

In [47]:
# Read the three service credentials from the environment in one pass.
# Each constant is None when its variable is not set.
SERPER_API_KEY, OPENAI_API_KEY, UPSTAGE_API_KEY = (
    os.getenv(env_name)
    for env_name in ("SERPER_API_KEY", "OPENAI_API_KEY", "UPSTAGE_API_KEY")
)

## Defining Helper Functions

In [48]:
def function_to_schema(func) -> dict:
    """Build an OpenAI-style tool ("function") schema from a Python callable.

    Args:
        func: The callable to describe. Its ``__name__`` becomes the tool
            name and its stripped docstring becomes the tool description.

    Returns:
        A dict of the form ``{"type": "function", "function": {...}}`` whose
        ``parameters`` entry lists one property per parameter of ``func``.
        Annotations found in the type map are translated to JSON-schema type
        names; missing or unrecognised annotations fall back to ``"string"``.
        Parameters without a default value are listed under ``required``.

    Raises:
        ValueError: If ``inspect.signature`` cannot produce a signature.
    """
    type_map = {
        str: "string",
        int: "integer",
        float: "number",
        bool: "boolean",
        list: "array",
        dict: "object",
        type(None): "null",
    }

    try:
        signature = inspect.signature(func)
    except ValueError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise ValueError(
            f"Failed to get signature for function {func.__name__}: {str(e)}"
        ) from e

    parameters = {}
    required = []
    for param in signature.parameters.values():
        # dict.get never raises KeyError, so no exception handling is needed.
        parameters[param.name] = {"type": type_map.get(param.annotation, "string")}
        # Identity check against the public sentinel: "==" would invoke the
        # default value's own __eq__, which can misreport or even raise.
        if param.default is inspect.Parameter.empty:
            required.append(param.name)

    return {
        "type": "function",
        "function": {
            "name": func.__name__,
            "description": (func.__doc__ or "").strip(),
            "parameters": {
                "type": "object",
                "properties": parameters,
                "required": required,
            },
        },
    }
    
def execute_tool_call(tool_call, tools, agent_name):
    """Run the Python function a model-issued tool call refers to.

    Args:
        tool_call: Object exposing ``function.name`` and ``function.arguments``
            (a JSON-encoded string of keyword arguments).
        tools: Mapping from tool name to the callable implementing it.
        agent_name: Label printed in front of the call for logging.

    Returns:
        Whatever the selected tool returns.

    Raises:
        KeyError: If ``tools`` has no entry for the requested name.
        json.JSONDecodeError: If the arguments are non-empty but not valid JSON.
    """
    name = tool_call.function.name

    # A zero-argument call may arrive with empty/None arguments or a JSON
    # null instead of "{}"; treat all of those as "no keyword arguments".
    raw_args = tool_call.function.arguments
    args = (json.loads(raw_args) if raw_args and raw_args.strip() else None) or {}

    print(f"{agent_name}:", f"{name}({args})")

    return tools[name](**args)  # call corresponding function with provided arguments

## Testing the Tool Calling

In [49]:
# Minimal description of an agent: which model to call, the system prompt to
# send, and the Python callables offered to the model as tools.
class Agent(BaseModel):
    name: str = "Agent"  # label used when logging tool calls
    model: str = "gpt-4o"  # model id passed to chat.completions.create
    instructions: str = "You are a helpful Agent"  # sent as the system message
    tools: list = []  # callables converted to tool schemas via function_to_schema
    
# OpenAI client used for the chat-completion calls in the cells below.
client = OpenAI(api_key=OPENAI_API_KEY)

# Conversation history, seeded with the user's request. Later cells append
# the model's replies and the tool results to this list.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]    

In [50]:
# Placeholder tool: takes no arguments and returns an empty string.
# function_to_schema sends the docstring to the model as the tool
# description, so the docstring text is part of the program's behaviour.
def to_paper_search_agent() -> str:
    """Use this to search for paper URL on arXiv only when paper URL is not found yet."""
    return ""

# Placeholder tool: takes no arguments and returns an empty string.
# function_to_schema sends the docstring to the model as the tool
# description, so the docstring text is part of the program's behaviour.
def to_download_and_parse_paper_agent() -> str:
    """Use this to download and parse paper only when paper URL is found."""
    return ""

# Placeholder tool: takes no arguments and returns an empty string.
# The docstring doubles as the model-facing tool description (it is read via
# __doc__ by function_to_schema), so the "contnet" typo in it is corrected.
def to_paper_analysis_agent():
    """Use this to analyze only when the content of paper(text) is found.
Paper content is indicated by "Retrieved Paper Content"
"""
    return ""

# Supervisor agent for the dry run. The adjacent string literals below are
# concatenated into one system prompt. Only the search and download tools are
# registered; to_paper_analysis_agent is defined above but not included.
supervisor_agent = Agent(
    name="Supervisor Agent",
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent]
)

# JSON schemas advertised to the model, one per registered tool.
tool_schemas = [function_to_schema(tool) for tool in supervisor_agent.tools]
# Name -> callable lookup for dispatching a tool call back to Python.
tools = {tool.__name__: tool for tool in supervisor_agent.tools}

In [51]:
# Initial trial
# Send the system prompt plus the user request together with the tool
# schemas; tool_choice="auto" leaves it to the model whether to answer or to
# request a tool call.
response = client.chat.completions.create(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
# Print both the text reply and the requested tool calls.
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

None
[ChatCompletionMessageToolCall(id='call_DBx9Z4YlhZMaSDHEdB0JB5b9', function=Function(arguments='{}', name='to_paper_search_agent'), type='function')]


In [52]:
# Subsequent trial (Second)
# Record the assistant turn that requested the tool call, then answer it with
# a hand-written tool result (a hard-coded paper URL) tied to the call's id.
messages.append(response.choices[0].message)
messages.append(
    {
        "role": "tool", 
        "tool_call_id": response.choices[0].message.tool_calls[0].id,
        "content": "Paper URL: https://arxiv.org/abs/2502.00299"
    }
)

# Ask the model again with the extended history.
response = client.chat.completions.create(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

None
[ChatCompletionMessageToolCall(id='call_QN28xcbOxhNPwACXdMsUb06B', function=Function(arguments='{}', name='to_download_and_parse_paper_agent'), type='function')]


In [53]:
# Subsequent trial (Third)
# Record the assistant turn that requested the second tool call, then answer
# it with a hand-written tool result: a "Retrieved Paper Content" header
# followed by the paper's title and abstract text.
messages.append(response.choices[0].message)
messages.append(
    {
        "role": "tool", 
        "tool_call_id": response.choices[0].message.tool_calls[0].id, 
        "content": "Retrieved Paper Content\n"
        "--------------------------------\n"
        "Title: ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference\n"
        "To reduce memory costs in long-context inference with Large Language Models (LLMs), many recent works focus on compressing the key-value (KV) cache of different tokens. However, we identify that the previous KV cache compression methods measure token importance individually, neglecting the dependency between different tokens in the real-world language characterics. In light of this, we introduce ChunkKV, grouping the tokens in a chunk as a basic compressing unit, and retaining the most informative semantic chunks while discarding the less important ones. Furthermore, observing that ChunkKV exhibits higher similarity in the preserved indices across different layers, we propose layer-wise index reuse to further reduce computational overhead. We evaluated ChunkKV on cutting-edge long-context benchmarks including LongBench and Needle-In-A-HayStack, as well as the GSM8K and JailbreakV in-context learning benchmark. Our experiments with instruction tuning and multi-step reasoning (O1 and R1) LLMs, achieve up to 10\% performance improvement under aggressive compression ratios compared to existing methods."
    }
)

# Ask the model again with the extended history.
response = client.chat.completions.create(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

The paper "ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference" introduces a novel method, ChunkKV, to reduce memory costs associated with long-context inference in Large Language Models (LLMs). Previous methods focused on compressing key-value (KV) caches based on individual token importance, but they often overlooked the interaction and dependency among different tokens in natural language. 

ChunkKV addresses this by grouping tokens into chunks, treating each chunk as a unit for compression. This approach allows ChunkKV to preserve the most informative semantic chunks while discarding less important ones. The method also introduces layer-wise index reuse, leveraging the observed similarity in preserved indices across different layers to further decrease computational costs.

The efficacy of ChunkKV was tested on several benchmarks, including LongBench, Needle-In-A-HayStack, GSM8K, and JailbreakV in-context learning benchmarks. The results demon

## Filling in the Dummy Tools (Functions)

In [54]:
# NOTE(review): this repeats, field for field, the Agent model already
# defined in the "Testing the Tool Calling" section; the later definition
# rebinds the name.
class Agent(BaseModel):
    name: str = "Agent"  # label used when logging tool calls
    model: str = "gpt-4o"  # model id passed to chat.completions.create
    instructions: str = "You are a helpful Agent"  # sent as the system message
    tools: list = []  # callables converted to tool schemas via function_to_schema
    
# Recreate the OpenAI client for this section.
client = OpenAI(api_key=OPENAI_API_KEY)

# Start from a fresh history that holds only the user's request.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]    

In [55]:
# Tool: look a paper up through the Serper search API and hand back an arXiv
# PDF link. The docstring is the model-facing tool description (read via
# __doc__ by function_to_schema), so implementation notes live in comments.
#   paper_title: title to search for; " on arXiv" is appended to the query.
#   returns: "URL to download '<title>': <pdf link>" on success, otherwise a
#            fixed "Could not find ..." message.
#   raises: requests.HTTPError on a non-2xx response, requests.Timeout if the
#           search service does not answer in time.
def to_paper_search_agent(paper_title: str):
    """Use this to search for paper URL on arXiv only when paper URL is not found yet."""
    not_found = "Could not find the URL to download the paper"
    url = "https://google.serper.dev/search"

    payload = json.dumps({"q": f"{paper_title} on arXiv"})
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    # Bound the wait and fail loudly on HTTP errors instead of tripping over
    # an unexpected error payload further down.
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    response.raise_for_status()

    # Treat a missing "organic" key the same as an empty result list.
    search_results = response.json().get('organic') or []
    if not search_results:
        return not_found

    # Only the top hit is considered, and it must live on arxiv.org itself.
    # The trailing slash rejects look-alike hosts such as arxiv.org.example.com.
    link = search_results[0].get('link', '')
    if not link.startswith("https://arxiv.org/"):
        return not_found

    # Swap only the "/abs/" path segment; a bare replace('abs', 'pdf') would
    # also rewrite any other "abs" substring in the URL.
    pdf_link = link.replace('/abs/', '/pdf/', 1)
    return f"URL to download '{paper_title}': {pdf_link}"

# Tool: download a PDF and return its extracted text, capped at 10000
# characters. The docstring is the model-facing tool description, so notes
# live in comments. The cap stays a literal rather than a parameter because
# every parameter of a tool function is exposed to the model in its schema.
#   paper_url: direct URL of the PDF to fetch.
#   returns: "Retrieved Paper Content" header plus the page texts, stripped
#            and truncated to 10000 characters.
#   raises: requests.HTTPError on a non-2xx response, requests.Timeout if the
#           download does not complete in time.
def to_download_and_parse_paper_agent(paper_url: str):
    """Use this to download and parse paper only when paper URL is found."""
    response = requests.get(paper_url, timeout=60)
    # Do not hand an HTML error page to the PDF parser.
    response.raise_for_status()

    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    header = "Retrieved Paper Content\n-----------------------------------\n"
    # Guard against extract_text() yielding None for a page without text.
    body = "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)
    return (header + body).strip()[:10000]

# Supervisor agent wired to the real search and download tools defined in
# this section. The adjacent string literals are concatenated into one
# system prompt.
supervisor_agent = Agent(
    name="Supervisor Agent",
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent]  # to_paper_analysis_agent is not registered
)


In [56]:
def run(client, messages, supervisor_agent):
    """Drive the tool-calling loop until the model replies without a tool call.

    Each turn sends the agent's instructions (as the system message) plus the
    running history to the model. If the reply requests tools, every call is
    executed and its result appended to the history before the next turn.

    Args:
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client in this notebook).
        messages: Conversation history. Mutated in place: every assistant
            reply, including the final answer, and every tool result is
            appended.
        supervisor_agent: Object with ``name``, ``model``, ``instructions``
            and ``tools`` attributes.

    Returns:
        None. The final answer is printed between two separator lines and is
        also the last entry of ``messages``.
    """
    # Loop through the conversation steps
    while True:
        # Prepare tools for the current step
        tool_schemas = [function_to_schema(tool) for tool in supervisor_agent.tools]
        tools = {tool.__name__: tool for tool in supervisor_agent.tools}

        # Send tools/tool_choice only when the agent has tools, so that
        # tool_choice is never sent without an accompanying tool list.
        tool_kwargs = {"tools": tool_schemas, "tool_choice": "auto"} if tool_schemas else {}

        # Get model response
        response = client.chat.completions.create(
            model=supervisor_agent.model,
            messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
            **tool_kwargs,
        )
        message = response.choices[0].message

        # Add model response to messages. This happens before the exit check
        # so the history also contains the final answer.
        messages.append(message)

        if not message.tool_calls:
            print("--------------------------------")
            print(message.content)
            print("--------------------------------")
            break

        print(message.tool_calls)

        # Add tool response to messages
        for tool_call in message.tool_calls:
            tool_response = execute_tool_call(tool_call, tools, supervisor_agent.name)

            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                # Coerce non-string tool results so the content is always text.
                "content": tool_response if isinstance(tool_response, str) else str(tool_response),
            })

In [57]:
# Run the full loop with the real tools; the tool calls and the final answer
# are printed by run() itself.
run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='call_td4d3gSzmIjXLckS2b5m0D8z', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='call_zg8EX15mTG0LBnGf4Op3EcrA', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})
--------------------------------
The paper titled "ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference" presents a new method for reducing memory costs in long-context inference with Large Language Models (LLMs). Here’s a comprehensive summa

## Try the same with Upstage's Solar-Pro 

In [59]:
# Switch the existing agent object to the "solar-pro" model (in-place change).
supervisor_agent.model = "solar-pro"

# Same OpenAI client class, pointed at Upstage's endpoint with the Upstage key.
client = OpenAI(
    base_url="https://api.upstage.ai/v1",
    api_key=UPSTAGE_API_KEY
)

# Fresh history that holds only the user's request.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]

run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='3e45e34a-a053-4b8a-9425-2fb99aa4ff95', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='82640c4a-0263-4b0f-b646-2d52a50080ec', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})
--------------------------------
The paper 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference' presents a novel approach to KV cache compression that aims to preserve semantic information in the cache. The authors propose a 

## Using Document Parse as a Tool

In [60]:
import os
# Put the Homebrew binary directory at the front of PATH, but only once:
# re-running this cell must not keep stacking the same entry, and an unset
# PATH must not raise KeyError.
# NOTE(review): "/opt/homebrew/bin/" is a machine-specific location - confirm
# which external binary this is needed for.
_homebrew_bin = "/opt/homebrew/bin/"
if _homebrew_bin not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = _homebrew_bin + os.pathsep + os.environ.get("PATH", "")

In [61]:
def truncate_tokens_if_needed(tokenizer, messages, content, max_token_limit=32000, system_prompt=None):
    """
    Truncate the content if prompt tokens plus content tokens exceed the limit.

    Args:
        tokenizer: Object providing ``apply_chat_template``, ``encode`` and
            ``decode``.
        messages: List of message dictionaries for the conversation; they are
            measured together with the system prompt.
        content: The text (paper markdown) to potentially truncate.
        max_token_limit: Maximum combined token count (default: 32000).
        system_prompt: System prompt counted as part of the base conversation.
            When None, ``supervisor_agent.instructions`` from the notebook
            namespace is used, as before.

    Returns:
        truncated_content: The content, cut to fit when needed.
        base_token_numbers: Number of tokens in the base conversation.
        paper_token_numbers: Number of content tokens after truncation.
    """
    if system_prompt is None:
        # Backward-compatible default: the notebook-level supervisor agent.
        system_prompt = supervisor_agent.instructions

    inputs = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": system_prompt}
        ] + messages
    )
    base_token_numbers = len(inputs)
    # NOTE(review): if encode() adds special tokens they are counted here and
    # decoded back into the text below - confirm against the tokenizer used.
    encoded_content = tokenizer.encode(content)
    paper_token_numbers = len(encoded_content)

    print(f"Base token numbers: {base_token_numbers}")
    print(f"Paper token numbers: {paper_token_numbers}")
    print(f"Total token numbers: {base_token_numbers + paper_token_numbers}")

    total_token_numbers = base_token_numbers + paper_token_numbers

    if total_token_numbers > max_token_limit:
        # Clamp at zero: when the base conversation alone exceeds the limit, a
        # negative bound would slice from the END of the list and keep most of
        # the content instead of none of it.
        tokens_to_keep = max(0, max_token_limit - base_token_numbers)
        encoded_content = encoded_content[:tokens_to_keep]
        paper_token_numbers = len(encoded_content)
        # Turn the surviving tokens back into text.
        truncated_content = tokenizer.decode(encoded_content)
        print(f"Truncated paper tokens to: {paper_token_numbers}")
    else:
        print("No truncation needed")
        truncated_content = content

    print(f"Total token numbers: {base_token_numbers + paper_token_numbers}")
    return truncated_content, base_token_numbers, paper_token_numbers

In [65]:
import json
import shutil
import requests
from PyPDF2 import PdfReader, PdfWriter

from transformers import AutoTokenizer

# Tokenizer used only for counting and truncating tokens in
# truncate_tokens_if_needed; it is not used for generation in this notebook.
tokenizer = AutoTokenizer.from_pretrained("upstage/solar-pro-preview-instruct")
# Stand-in conversation passed to truncate_tokens_if_needed to measure the
# size of the base prompt.
message_template = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper below, \n"
    },
]

def split_pdf_by_pages(input_pdf_path, root_path, pages_per_pdf=10):
    """Split a PDF into consecutive chunks of at most ``pages_per_pdf`` pages.

    Args:
        input_pdf_path: Path of the PDF to split.
        root_path: Directory receiving the chunks; created if missing.
        pages_per_pdf: Maximum number of pages per output file (default: 10).

    Returns:
        List of written file paths, ``"<root_path>/1.pdf"``,
        ``"<root_path>/2.pdf"``, ... in page order. Empty when the input has
        no pages.

    Raises:
        ValueError: If ``pages_per_pdf`` is smaller than 1.
    """
    if pages_per_pdf < 1:
        raise ValueError(f"pages_per_pdf must be >= 1, got {pages_per_pdf}")

    # Open the PDF
    pdf = PdfReader(input_pdf_path)
    total_pages = len(pdf.pages)

    # Ensure the directory that is actually written to exists. The previous
    # version created an unused directory named after the input file instead.
    os.makedirs(root_path, exist_ok=True)

    output_paths = []

    # Walk the document in strides of pages_per_pdf; chunk files are 1-based.
    for chunk_number, start_page in enumerate(range(0, total_pages, pages_per_pdf), start=1):
        writer = PdfWriter()
        end_page = min(start_page + pages_per_pdf, total_pages)

        # Add pages to writer
        for page_num in range(start_page, end_page):
            writer.add_page(pdf.pages[page_num])

        # Save the split PDF
        output_path = f"{root_path}/{chunk_number}.pdf"
        with open(output_path, "wb") as output_file:
            writer.write(output_file)
        output_paths.append(output_path)

    return output_paths

def get_document_parse_response(filename, api_key):
    """Upload one file to Upstage's document-parse endpoint and return the reply.

    Args:
        filename: Path of the file to upload as the ``document`` form field.
        api_key: Key sent as a bearer token in the Authorization header.

    Returns:
        The response body decoded from JSON. Markdown output is requested via
        the ``output_formats`` form field.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = "https://api.upstage.ai/v1/document-ai/document-parse"

    headers = {"Authorization": f"Bearer {api_key}"}
    data = {"output_formats": "['markdown']"}

    # Open the upload inside a context manager so the handle is closed even
    # when the request raises. It was previously left open on every call.
    with open(filename, "rb") as document:
        response = requests.post(url, headers=headers, files={"document": document}, data=data)

    upstage_response = json.loads(response.text)
    return upstage_response

# Tool: download a PDF, run every page through Upstage Document Parse and
# return the combined markdown, truncated to the token budget. The docstring
# is the model-facing tool description, so notes live in comments; no extra
# parameters are added because each one would be exposed in the tool schema.
#   paper_url: direct URL of the PDF to fetch.
#   returns: "Retrieved Paper Content" header plus the parsed markdown, after
#            truncate_tokens_if_needed.
#   raises: requests.HTTPError on a non-2xx download, requests.Timeout if the
#           download does not complete in time.
#   side effects: writes temp_paper.pdf, and recreates ./tmp with the split
#                 PDFs and one response_<n>.json per chunk.
def to_download_and_parse_paper_agent(paper_url: str):
    """Use this to download and parse paper only when paper URL is found."""
    response = requests.get(paper_url, timeout=60)
    # Do not save and parse an HTML error page as if it were the paper.
    response.raise_for_status()

    # Save the PDF to a temporary file
    root_path = "tmp"
    temp_pdf_path = "temp_paper.pdf"
    with open(temp_pdf_path, "wb") as f:
        f.write(response.content)

    # Start from an empty working directory on every call.
    shutil.rmtree(root_path, ignore_errors=True)
    os.makedirs(root_path, exist_ok=True)

    split_factor = 1  # pages per Document Parse request
    split_pdfs = split_pdf_by_pages(temp_pdf_path, root_path, split_factor)

    markdown_parts = []
    for chunk_number, split_pdf in enumerate(split_pdfs, start=1):
        upstage_response = get_document_parse_response(split_pdf, UPSTAGE_API_KEY)

        # Also write the response to a JSON file for persistence
        json_output_path = f"{root_path}/response_{chunk_number}.json"
        with open(json_output_path, "w") as json_file:
            json.dump(upstage_response, json_file, indent=2)

        try:
            markdown_parts.append(upstage_response['content']['markdown'])
        except (KeyError, TypeError):
            # Still best-effort, but no longer silent: say which chunk is
            # missing so a truncated summary can be traced back.
            print(
                f"Warning: no markdown in Document Parse response for {split_pdf}; "
                f"see {json_output_path}"
            )

    markdown = "Retrieved Paper Content\n-----------------------------------\n" + "".join(markdown_parts)
    markdown, _, _ = truncate_tokens_if_needed(tokenizer, message_template, markdown)
    return markdown

In [66]:
# Rebuild the supervisor agent so its tool list picks up the Document Parse
# version of to_download_and_parse_paper_agent defined in the previous cell.
# The adjacent string literals are concatenated into one system prompt.
supervisor_agent = Agent(
    name="Supervisor Agent",
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent]  # to_paper_analysis_agent is not registered
)

In [67]:
# Switch the freshly built agent to the "solar-pro" model (in-place change).
supervisor_agent.model = "solar-pro"

# OpenAI client class pointed at Upstage's endpoint with the Upstage key.
client = OpenAI(
    base_url="https://api.upstage.ai/v1",
    api_key=UPSTAGE_API_KEY
)

# Fresh history that holds only the user's request.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]

run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='9521a151-6c08-463b-a377-2e2c0df1563d', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='f85864f5-f429-46dd-a900-cc97f7773ed8', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})


Token indices sequence length is longer than the specified maximum sequence length for this model (53672 > 4096). Running this sequence through the model will result in indexing errors


Base token numbers: 115
Paper token numbers: 53672
Total token numbers: 53787
Truncated paper tokens to: 31885
Total token numbers: 32000
--------------------------------
The assistant summarizes the key findings from the experiments and analysis conducted on the ChunkKV method, which is a novel KV cache compression technique for efficient long-context inference in large language models (LLMs). The method retains the most informative semantic chunks from the original KV cache, leading to improved performance compared to existing methods. The experiments were conducted on various LLMs and benchmarks, demonstrating the effectiveness of ChunkKV in preserving essential contextual information for complex reasoning tasks, long-context understanding, and safety evaluations. The method’s chunk-based approach maintains crucial contextual information, leading to superior performance in challenging scenarios and benchmarks. The proposed layer-wise index reuse technique provides significant comput