In [6]:
import os
import json
import inspect
import requests
from io import BytesIO

import PyPDF2
from openai import OpenAI
from pydantic import BaseModel

from dotenv import load_dotenv

## Setting up the API Keys

In [24]:
# Load variables from a local .env file (if present) into the process
# environment so the os.getenv lookups in the next cell can see them.
load_dotenv()

True

In [32]:
# API credentials are read from the environment rather than hard-coded.
# os.getenv returns None when a variable is unset, so a missing key is not
# reported here.
# NOTE(review): a missing key presumably only surfaces later, when the
# corresponding service is first called -- confirm and consider failing fast.
SERPER_API_KEY = os.getenv("SERPER_API_KEY")  # Serper web-search API (paper lookup)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # OpenAI chat completions
UPSTAGE_API_KEY = os.getenv("UPSTAGE_API_KEY")  # Upstage (solar-pro) endpoint

## Defining Helper Functions

In [15]:
def function_to_schema(func) -> dict:
    """Build a chat-completions tool ("function") schema from a Python callable.

    The schema name is ``func.__name__``, the description is the stripped
    docstring (empty string when there is none), and the parameters come from
    the signature: each annotation is mapped to a JSON Schema type name, and
    every parameter without a default value is listed as required.

    Args:
        func: The callable to describe.

    Returns:
        A dict shaped like ``{"type": "function", "function": {...}}``.

    Raises:
        ValueError: If ``inspect.signature`` cannot produce a signature for
            ``func``.
    """
    type_map = {
        str: "string",
        int: "integer",
        float: "number",
        bool: "boolean",
        list: "array",
        dict: "object",
        type(None): "null",
    }

    try:
        signature = inspect.signature(func)
    except ValueError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise ValueError(
            f"Failed to get signature for function {func.__name__}: {str(e)}"
        ) from e

    parameters = {}
    required = []
    for param in signature.parameters.values():
        # Missing or unmapped annotations fall back to "string"; dict.get never
        # raises KeyError, so no exception handling is needed around it.
        parameters[param.name] = {"type": type_map.get(param.annotation, "string")}

        # Compare by identity against the public sentinel. An equality check
        # would invoke the default value's own __eq__, which can misreport a
        # parameter as required (or fail outright for array-like defaults).
        if param.default is inspect.Parameter.empty:
            required.append(param.name)

    return {
        "type": "function",
        "function": {
            "name": func.__name__,
            "description": (func.__doc__ or "").strip(),
            "parameters": {
                "type": "object",
                "properties": parameters,
                "required": required,
            },
        },
    }
    
def execute_tool_call(tool_call, tools, agent_name):
    """Run the Python function requested by a single model tool call.

    Args:
        tool_call: Object exposing ``function.name`` and ``function.arguments``
            (a JSON-encoded object of keyword arguments).
        tools: Mapping of tool name to the callable implementing it.
        agent_name: Label printed in front of the call trace.

    Returns:
        Whatever the selected tool returns.

    Raises:
        KeyError: If the requested name is not present in ``tools``.
        json.JSONDecodeError: If ``function.arguments`` is not valid JSON.
    """
    name = tool_call.function.name
    # A zero-argument call may arrive with an empty (or None) payload instead
    # of "{}"; treat that as "no arguments" rather than failing to parse it.
    args = json.loads(tool_call.function.arguments or "{}")

    print(f"{agent_name}:", f"{name}({args})")

    if name not in tools:
        # Same exception type as a plain dict lookup, but says what was
        # requested and what is actually registered.
        raise KeyError(f"Unknown tool {name!r}; available tools: {sorted(tools)}")

    return tools[name](**args)  # call corresponding function with provided arguments

## Testing the Tool Calling

In [16]:
# Minimal description of an agent: a display name, the chat model to call,
# the system prompt, and the Python callables exposed to the model as tools.
class Agent(BaseModel):
    name: str = "Agent"  # label printed in front of tool-call traces
    model: str = "gpt-4o"  # model identifier passed to chat.completions.create
    instructions: str = "You are a helpful Agent"  # sent as the system message
    tools: list = []  # callables; turned into schemas with function_to_schema
    
# Chat client authenticated with the OpenAI key read from the environment.
client = OpenAI(api_key=OPENAI_API_KEY)

# Conversation history. It starts with the user's request only; the trial
# cells below append the assistant's tool requests and the tool results to it.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]

In [17]:
# Placeholder tool used to test routing: takes no arguments and returns an
# empty string. The docstring is what function_to_schema puts in the tool's
# "description", so its wording is part of what the model sees.
def to_paper_search_agent() -> str:
    """Use this to search for paper URL on arXiv only when paper URL is not found yet."""
    return ""

# Placeholder tool used to test routing: takes no arguments and returns an
# empty string. The docstring is what function_to_schema puts in the tool's
# "description", so its wording is part of what the model sees.
def to_download_and_parse_paper_agent() -> str:
    """Use this to download and parse paper only when paper URL is found."""
    return ""

# Placeholder tool: takes no arguments and returns an empty string. The
# docstring doubles as the tool description (function_to_schema reads
# func.__doc__), so the misspelling "contnet" is corrected to "content" to
# keep the text sent to the model unambiguous.
def to_paper_analysis_agent():
    """Use this to analyze only when the content of paper(text) is found.
Paper content is indicated by "Retrieved Paper Content"
"""
    return ""

# Supervisor configuration for the routing test. Only the search and
# download/parse tools are registered; to_paper_analysis_agent is defined
# above but not included in `tools`.
supervisor_agent = Agent(
    name="Supervisor Agent",
    # Adjacent string literals are concatenated into one system prompt.
    # NOTE(review): the second fragment has no trailing space, so it runs
    # straight into "- Hence, ..." in the final prompt; the prompt also contains
    # a few typos ("a academic", "Basiclly"). Left unchanged because the
    # recorded outputs below were produced with this exact text.
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent]
)

# One schema per registered tool; this list is passed as `tools=` in the
# chat-completion requests below.
tool_schemas = [function_to_schema(tool) for tool in supervisor_agent.tools]
# Name -> callable lookup for the same tools.
tools = {tool.__name__: tool for tool in supervisor_agent.tools}

In [None]:
# Initial trial
response = client.chat.completions.create(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

None
[ChatCompletionMessageToolCall(id='call_PoWiNSPjypnstlOPWwElOPyj', function=Function(arguments='{}', name='to_paper_search_agent'), type='function')]


In [19]:
# Subsequent trial (Second)
messages.append(response.choices[0].message)
messages.append(
    {
        "role": "tool", 
        "tool_call_id": response.choices[0].message.tool_calls[0].id,
        "content": "Paper URL: https://arxiv.org/abs/2502.00299"
    }
)

response = client.chat.completions.create(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

None
[ChatCompletionMessageToolCall(id='call_doRYRQFjppzZzQ5FMKbeKeJV', function=Function(arguments='{}', name='to_download_and_parse_paper_agent'), type='function')]


In [20]:
# Subsequent trial (Third)
messages.append(response.choices[0].message)
messages.append(
    {
        "role": "tool", 
        "tool_call_id": response.choices[0].message.tool_calls[0].id, 
        "content": "Retrieved Paper Content\n"
        "--------------------------------\n"
        "Title: ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference\n"
        "To reduce memory costs in long-context inference with Large Language Models (LLMs), many recent works focus on compressing the key-value (KV) cache of different tokens. However, we identify that the previous KV cache compression methods measure token importance individually, neglecting the dependency between different tokens in the real-world language characterics. In light of this, we introduce ChunkKV, grouping the tokens in a chunk as a basic compressing unit, and retaining the most informative semantic chunks while discarding the less important ones. Furthermore, observing that ChunkKV exhibits higher similarity in the preserved indices across different layers, we propose layer-wise index reuse to further reduce computational overhead. We evaluated ChunkKV on cutting-edge long-context benchmarks including LongBench and Needle-In-A-HayStack, as well as the GSM8K and JailbreakV in-context learning benchmark. Our experiments with instruction tuning and multi-step reasoning (O1 and R1) LLMs, achieve up to 10\% performance improvement under aggressive compression ratios compared to existing methods."
    }
)

# Ask again now that the history contains the paper text. In the recorded
# output the model answers directly instead of requesting another tool.
response = client.chat.completions.create(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

The paper "ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference" addresses the challenge of reducing memory costs during long-context inference with large language models (LLMs). Traditional methods for compressing the key-value (KV) cache often evaluate token importance in isolation, which overlooks the interdependencies inherent in natural language. The authors introduce ChunkKV, a novel approach that clusters tokens into chunks to use as compression units, prioritizing the retention of semantically informative chunks while discarding less crucial ones.

The ChunkKV method capitalizes on the observed similarity in the indices preserved across different layers, allowing for layer-wise index reuse to minimize computational demands. The researchers evaluated ChunkKV using state-of-the-art long-context benchmarks such as LongBench and Needle-In-A-Haystack, along with in-context learning benchmarks like GSM8K and JailbreakV. The conducted experiments,

## Filling in the Dummy Tools (Functions)

In [38]:
# NOTE(review): this repeats the Agent definition from the "Testing the Tool
# Calling" section field for field; the later definition shadows the earlier
# one. Consider keeping a single definition.
#
# Minimal description of an agent: a display name, the chat model to call,
# the system prompt, and the Python callables exposed to the model as tools.
class Agent(BaseModel):
    name: str = "Agent"  # label printed in front of tool-call traces
    model: str = "gpt-4o"  # model identifier passed to chat.completions.create
    instructions: str = "You are a helpful Agent"  # sent as the system message
    tools: list = []  # callables; turned into schemas with function_to_schema
    
# Fresh OpenAI client and a fresh conversation containing only the user's
# request, so the automated run below does not reuse the history built up by
# the manual trials above.
client = OpenAI(api_key=OPENAI_API_KEY)

messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]

In [39]:
# Tool: look a paper up through the Serper search API and return its arXiv
# PDF URL.
#
# The docstring is kept to the single original sentence because
# function_to_schema sends it to the model as the tool description.
#
# paper_title: title to search for; the query sent is "<paper_title> on arXiv".
# Returns a sentence containing the PDF URL when the top organic result is an
# arxiv.org link, otherwise a "could not find" message.
# Raises requests.HTTPError for a non-2xx response and requests.Timeout when
# the service does not answer in time.
def to_paper_search_agent(paper_title: str):
    """Use this to search for paper URL on arXiv only when paper URL is not found yet."""
    # One shared message so both "not found" paths read the same.
    not_found = "Could not find the URL to download the paper"
    url = "https://google.serper.dev/search"

    payload = json.dumps({"q": f"{paper_title} on arXiv"})
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    # Without a timeout the call can block indefinitely; without a status
    # check an error payload would be parsed as if it were search results.
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    response.raise_for_status()
    search_results = response.json().get('organic') or []

    if not search_results:
        return not_found

    # Only the top hit is considered.
    link = search_results[0].get('link', '')
    if not link.startswith("https://arxiv.org/"):
        return not_found

    # Swap only the "/abs/" path segment; a blanket replace of "abs" would
    # also rewrite that substring anywhere else in the URL.
    pdf_link = link.replace('/abs/', '/pdf/', 1)
    return f"URL to download '{paper_title}': {pdf_link}"

# Tool: download a PDF and return its text, prefixed with the
# "Retrieved Paper Content" marker and truncated to a fixed length.
#
# The docstring is kept to the single original sentence because
# function_to_schema sends it to the model as the tool description.
#
# paper_url: direct URL of the PDF to fetch.
# Returns the marker header followed by the text of every page, stripped and
# cut to at most 10000 characters.
# Raises requests.HTTPError for a non-2xx response and requests.Timeout when
# the download does not complete in time.
def to_download_and_parse_paper_agent(paper_url: str):
    """Use this to download and parse paper only when paper URL is found."""
    # Cap on the returned text (presumably to bound the tool-message size).
    max_chars = 10000
    header = "Retrieved Paper Content\n-----------------------------------\n"

    # Fail on HTTP errors up front instead of handing an error page to the
    # PDF parser; the timeout keeps a stalled download from hanging the run.
    response = requests.get(paper_url, timeout=60)
    response.raise_for_status()
    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))

    # Guard against a page yielding no text (None) -- assumption about the
    # PDF library's behaviour on image-only pages; harmless otherwise.
    page_texts = [(page.extract_text() or "") + "\n" for page in pdf_reader.pages]
    return (header + "".join(page_texts)).strip()[:max_chars]

# Supervisor configuration for the real tools defined in this cell. It uses
# the same prompt as the routing test; only the tool implementations differ.
supervisor_agent = Agent(
    name="Supervisor Agent",
    # Adjacent string literals are concatenated into one system prompt.
    # NOTE(review): the second fragment has no trailing space, so it runs
    # straight into "- Hence, ..." in the final prompt.
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent]  # to_paper_analysis_agent is not registered here
)


In [40]:
def run(client, messages, supervisor_agent, max_turns=None):
    """Drive the agent until the model answers without requesting a tool.

    Each turn sends the system prompt plus ``messages`` to the model. If the
    reply contains tool calls, they are printed, executed through
    ``execute_tool_call``, and both the assistant message and the tool results
    are appended to ``messages``. Otherwise the reply text is printed between
    separator lines and the loop ends.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        messages: Conversation history; mutated in place.
        supervisor_agent: Object with ``name``, ``model``, ``instructions`` and
            ``tools`` (a list of callables).
        max_turns: Optional upper bound on model calls. ``None`` (the default)
            keeps the original unbounded behaviour.

    Returns:
        None. Results are printed and accumulated in ``messages``.
    """
    # The tool list is read once: schemas and the name lookup do not change
    # between turns, so there is no need to rebuild them per request.
    tool_schemas = [function_to_schema(tool) for tool in supervisor_agent.tools]
    tools = {tool.__name__: tool for tool in supervisor_agent.tools}

    # Only send tools/tool_choice when there is at least one tool; a
    # tool_choice without tools is rejected by the chat-completions endpoint.
    tool_kwargs = {}
    if tool_schemas:
        tool_kwargs = {"tools": tool_schemas, "tool_choice": "auto"}

    turn = 0
    while max_turns is None or turn < max_turns:
        turn += 1

        # Get model response
        response = client.chat.completions.create(
            model=supervisor_agent.model,
            messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
            **tool_kwargs,
        )
        message = response.choices[0].message

        if not message.tool_calls:
            # Final answer: show it and stop.
            print("--------------------------------")
            print(message.content)
            print("--------------------------------")
            return

        print(message.tool_calls)

        # The assistant message must precede the tool results that answer it.
        messages.append(message)

        for tool_call in message.tool_calls:
            tool_response = execute_tool_call(tool_call, tools, supervisor_agent.name)

            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                # Tool message content is text; coerce anything else a tool
                # might return so the next request stays well-formed.
                "content": tool_response if isinstance(tool_response, str) else str(tool_response),
            })

    print(f"Stopped after {max_turns} turn(s) without a final answer.")

In [41]:
# Run the agent loop with the OpenAI client; tool calls and the final answer
# are printed, and `messages` is extended in place.
run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='call_a7iOBDwap3BKsI42WVqwlSBm', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='call_cVrZIdsNWb1LQwLyoqa5vxoE', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})
--------------------------------
The paper "ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference" presents a novel method to address the memory constraints in Large Language Models (LLMs) when inferring long contexts. The work primarily focus

## Try the same with Upstage's Solar-Pro 

In [43]:
# Repeat the same run against Upstage's endpoint, reusing the OpenAI client
# class with a different base_url and API key.
# NOTE(review): this changes the shared supervisor_agent in place and rebinds
# `client`, so re-running earlier cells after this one would target
# Upstage/solar-pro instead of the OpenAI model.
supervisor_agent.model = "solar-pro"

client = OpenAI(
    base_url="https://api.upstage.ai/v1",
    api_key=UPSTAGE_API_KEY
)

# Start from a fresh conversation so the previous run's history is not reused.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]

run(client, messages, supervisor_agent)

[ChatCompletionMessageToolCall(id='5b2c1208-db8d-4410-b7b8-6be146b00008', function=Function(arguments='{"paper_title":"ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}', name='to_paper_search_agent'), type='function')]
Supervisor Agent: to_paper_search_agent({'paper_title': 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'})
[ChatCompletionMessageToolCall(id='bab6f011-f4af-4d00-bd40-fa9e0f46f737', function=Function(arguments='{"paper_url":"https://arxiv.org/pdf/2502.00299"}', name='to_download_and_parse_paper_agent'), type='function')]
Supervisor Agent: to_download_and_parse_paper_agent({'paper_url': 'https://arxiv.org/pdf/2502.00299'})
--------------------------------
The paper titled 'ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference' is a comprehensive study on the optimization of Large Language Models (LLMs) for long-context inference. The authors propo