In [6]:
import os
import json
import inspect
import requests
from io import BytesIO

import PyPDF2
from mistralai import Mistral
from pydantic import BaseModel

from dotenv import load_dotenv

## Setting up the API Keys

In [7]:
# Load variables from a local .env file (if present) so that the os.getenv
# lookups for the API keys in the next cell can find them.
load_dotenv()

True

In [8]:
# Read the API keys from the environment. os.getenv returns None when a
# variable is unset, so a missing key is not reported at this point.
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

## Defining Helper Functions

In [9]:
def function_to_schema(func) -> dict:
    """Describe a Python callable as a function-calling tool schema.

    Every parameter of ``func`` becomes a property whose ``"type"`` is looked
    up from its annotation in a fixed Python-type -> JSON-schema-type table.
    A missing or unmapped annotation is described as ``"string"``. Parameters
    that have no default value are listed under ``"required"``. The stripped
    docstring of ``func`` is used as the tool description.

    Args:
        func: The callable to describe. It must expose ``__name__``.

    Returns:
        A dict shaped as
        ``{"type": "function", "function": {"name", "description", "parameters"}}``.

    Raises:
        ValueError: If ``inspect.signature`` cannot provide a signature.
    """
    type_map = {
        str: "string",
        int: "integer",
        float: "number",
        bool: "boolean",
        list: "array",
        dict: "object",
        type(None): "null",
    }

    try:
        signature = inspect.signature(func)
    except ValueError as e:
        # Fall back to repr() so that building the message cannot itself fail
        # on objects without __name__; keep the original error as the cause.
        func_name = getattr(func, "__name__", repr(func))
        raise ValueError(
            f"Failed to get signature for function {func_name}: {str(e)}"
        ) from e

    parameters = {}
    required = []
    for param in signature.parameters.values():
        # dict.get never raises KeyError, so unknown annotations simply fall
        # back to "string".
        parameters[param.name] = {"type": type_map.get(param.annotation, "string")}

        # Identity check against the public sentinel: `==` would call the
        # default value's own __eq__, which may return a truthy or
        # non-boolean result for arbitrary objects.
        if param.default is inspect.Parameter.empty:
            required.append(param.name)

    return {
        "type": "function",
        "function": {
            "name": func.__name__,
            "description": (func.__doc__ or "").strip(),
            "parameters": {
                "type": "object",
                "properties": parameters,
                "required": required,
            },
        },
    }
    
def execute_tool_call(tool_call_str, tools, agent_name):
    """Run the tool requested by the model and return the tool's result.

    This function was previously defined twice in a row, so the second
    definition silently replaced the first. Both behaviours are merged here
    and the request may arrive in any of these forms:

    * a structured tool-call object exposing ``.function.name`` and
      ``.function.arguments`` (arguments as a JSON string or a dict);
    * a list of ``{"name": ..., "arguments": {...}}`` dicts, as obtained by
      parsing a message ``content`` that carries the call as JSON text;
    * a single such dict, or the raw JSON text of either of the two.

    Only the first entry of a list is executed.

    Args:
        tool_call_str: The tool request in one of the forms above. The
            parameter name is kept from the last (live) definition.
        tools: Mapping from tool name to the Python callable to invoke.
        agent_name: Label printed in front of the call for tracing.

    Returns:
        Whatever the selected tool callable returns.

    Raises:
        KeyError: If the requested tool name is not in ``tools``.
        IndexError: If an empty list is passed.
    """
    call = tool_call_str

    # Raw JSON text -> parsed list/dict.
    if isinstance(call, str):
        call = json.loads(call)

    # List form: execute the first requested call only.
    if isinstance(call, (list, tuple)):
        call = call[0]

    if isinstance(call, dict):
        name = call["name"]
        args = call["arguments"]
    else:
        name = call.function.name
        args = call.function.arguments

    # Arguments may still be serialized as a JSON string.
    if isinstance(args, str):
        args = json.loads(args)

    print(f"{agent_name}:", f"{name}({args})")

    return tools[name](**args)  # call corresponding function with provided arguments

## Testing the Tool Calling

In [46]:
# Configuration record for one agent: a display name, the model to call,
# the system prompt, and the Python callables exposed to the model as tools.
class Agent(BaseModel):
    name: str = "Agent"  # label for the agent
    model: str = "mistral-large-latest"  # passed as `model=` to client.chat.complete
    instructions: str = "You are a helpful Agent"  # sent as the system message
    tools: list = []  # callables; converted to schemas with function_to_schema
    
# Client used for the chat-completion requests in the cells below.
client = Mistral(api_key=MISTRAL_API_KEY)

# Conversation seed: the single user request the supervisor has to satisfy.
# NOTE(review): the same list is rebuilt verbatim in a later cell right
# before the first request is sent.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]    

In [64]:
# Hand-off stub: it only returns an empty string. Its name, signature and
# docstring are what matter, because function_to_schema turns them into the
# tool schema (the docstring becomes the tool description), so the docstring
# text must be treated as part of the prompt.
def to_paper_search_agent(paper_title: str) -> str:
    """Use this to search for paper URL on arXiv only when paper URL is not found yet."""
    return ""

# Hand-off stub: it only returns an empty string. Its name, signature and
# docstring are what matter, because function_to_schema turns them into the
# tool schema (the docstring becomes the tool description), so the docstring
# text must be treated as part of the prompt.
def to_download_and_parse_paper_agent(paper_url: str) -> str:
    """Use this to download and parse paper only when paper URL is found."""
    return ""

# Supervisor agent: given the user request and earlier tool results, it picks
# which hand-off tool to call next, or answers directly.
supervisor_agent = Agent(
    name="Supervisor Agent",
    # The adjacent string literals below are concatenated into ONE line of
    # text; there are no newlines between the "- " items.
    # NOTE(review): the prompt contains typos ("a academic", "Basiclly",
    # "say directly answer") and the first "- " item has no trailing space, so
    # it runs into the next one. Fixing these changes the prompt sent to the
    # model, so it is left for a deliberate prompt revision.
    instructions=(
        "You are a academic paper analyzer. "
        "- Basiclly, you don't have knowledge of the requested paper."
        "- Hence, you need to use the provided tools to get the paper information from the internet. "
        "- Your job is to find appropriate tool to transfer to based on the user's request and results of tool calls. "
        "- If enough information is collected to complete the user request, you should say directly answer to the user request. "
    ),
    tools=[to_paper_search_agent, to_download_and_parse_paper_agent]
)

# Schemas handed to the API as `tools=` in the requests below.
tool_schemas = [function_to_schema(tool) for tool in supervisor_agent.tools]
# Name -> callable lookup used to dispatch a tool call returned by the model.
tools = {tool.__name__: tool for tool in supervisor_agent.tools}

In [65]:
# Start the conversation from scratch with the single user request, so this
# cell does not depend on what earlier runs appended to `messages`.
messages = [
    {
        "role": "user",
        "content": "Provide a comprehensive summary of the paper, "
                   "'ChunkKV - Semantic-Preserving KV Cache Compression "
                   "for Efficient Long-Context LLM Inference' on arXiv. "
    },
]    

# Initial trial
# The system prompt is prepended per request instead of being stored in
# `messages`. tool_choice="any" is presumably used to force a tool call on
# this first turn - confirm against the Mistral function-calling docs.
response = client.chat.complete(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas,
    tool_choice="any",
)
# In the recorded run the text content is empty and the request for
# to_paper_search_agent arrives in tool_calls.
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)


[ToolCall(function=FunctionCall(name='to_paper_search_agent', arguments='{"paper_title": "ChunkKV - Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference"}'), id='JAUrxeRpg', type=None, index=0)]


In [66]:
# Subsequent trial (Second)
messages.append(response.choices[0].message)
messages.append(
    {
        "role": "tool", 
        "name": response.choices[0].message.tool_calls[0].function.name,
        "tool_call_id": response.choices[0].message.tool_calls[0].id,
        "content": "Paper URL: https://arxiv.org/abs/2502.00299"
    }
)

response = client.chat.complete(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

[{"name": "to_download_and_parse_paper_agent", "arguments": {"paper_url": "https://arxiv.org/abs/2502.00299"}}]
None


In [72]:
# The recorded response carried the tool call as JSON text in `content`
# (tool_calls was None), so parse that text into a list of call dicts.
# This raises if `content` is not valid JSON.
alt_fn_call = json.loads(response.choices[0].message.content)

In [73]:
# Show the parsed fallback call.
alt_fn_call

[{'name': 'to_download_and_parse_paper_agent',
  'arguments': {'paper_url': 'https://arxiv.org/abs/2502.00299'}}]

In [77]:
# Manual walk-through of dispatching the parsed fallback call: take the first
# entry of the list, look the tool up by name and call it with the parsed
# arguments. These are the same steps the list-of-dicts form of
# execute_tool_call performs.
name = alt_fn_call[0]["name"]
print(name)
args = alt_fn_call[0]["arguments"]
print(args)

# The hand-off stubs return an empty string, so nothing useful comes back.
tools[name](**args)

to_download_and_parse_paper_agent
{'paper_url': 'https://arxiv.org/abs/2502.00299'}


In [58]:
# Subsequent trial (Third)
# Answer the pending tool call with hand-written paper content and ask the
# supervisor for its final response.
assistant_message = response.choices[0].message

# The tool message below needs the name and id of a structured tool call.
# The previous response may carry the call only as JSON text in `content`
# (tool_calls is None in the recorded run of the second trial). Check before
# touching `messages`, so a failed run leaves the history unchanged and the
# error explains the cause instead of "'NoneType' object is not subscriptable".
if not assistant_message.tool_calls:
    raise RuntimeError(
        "The previous response has no structured tool call (message.tool_calls is empty), "
        "so there is no tool_call_id to answer. Re-run the previous request until it "
        "returns tool_calls, or handle the JSON-in-content fallback separately."
    )

pending_call = assistant_message.tool_calls[0]

messages.append(assistant_message)
messages.append(
    {
        "role": "tool",
        "name": pending_call.function.name,
        "tool_call_id": pending_call.id,
        # "\\%" keeps the literal backslash that the original "\%" produced,
        # without relying on an invalid escape sequence.
        "content": "Retrieved Paper Content\n"
        "--------------------------------\n"
        "Title: ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference\n"
        "To reduce memory costs in long-context inference with Large Language Models (LLMs), many recent works focus on compressing the key-value (KV) cache of different tokens. However, we identify that the previous KV cache compression methods measure token importance individually, neglecting the dependency between different tokens in the real-world language characterics. In light of this, we introduce ChunkKV, grouping the tokens in a chunk as a basic compressing unit, and retaining the most informative semantic chunks while discarding the less important ones. Furthermore, observing that ChunkKV exhibits higher similarity in the preserved indices across different layers, we propose layer-wise index reuse to further reduce computational overhead. We evaluated ChunkKV on cutting-edge long-context benchmarks including LongBench and Needle-In-A-HayStack, as well as the GSM8K and JailbreakV in-context learning benchmark. Our experiments with instruction tuning and multi-step reasoning (O1 and R1) LLMs, achieve up to 10\\% performance improvement under aggressive compression ratios compared to existing methods."
    }
)

response = client.chat.complete(
    model=supervisor_agent.model,
    messages=[{"role": "system", "content": supervisor_agent.instructions}] + messages,
    tools=tool_schemas or None,
    tool_choice="auto",
)
print(response.choices[0].message.content)
print(response.choices[0].message.tool_calls)

The paper titled 'ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference' introduces a novel approach to reduce memory costs in long-context inference with Large Language Models (LLMs). The authors identify that previous KV cache compression methods measure token importance individually, neglecting the dependency between different tokens in real-world language characteristics. To address this, they propose ChunkKV, which groups tokens in chunks as a basic compressing unit and retains the most informative semantic chunks while discarding the less important ones. Additionally, ChunkKV exhibits higher similarity in the preserved indices across different layers, leading to the proposal of layer-wise index reuse to further reduce computational overhead. The evaluation of ChunkKV on long-context benchmarks, including LongBench and Needle-In-A-HayStack, as well as the GSM8K and JailbreakV in-context learning benchmark, demonstrates up to a 10% performance i