In [1]:
import pandas as pd 
import numpy as np 
import json
import time
import os
import tiktoken
from openai import OpenAI 
from tqdm import tqdm 
from concurrent.futures import ThreadPoolExecutor, as_completed

# Shared OpenAI client; the tool-calling request and the judge request below both go through it.
client = OpenAI() # uses OPENAI_API_KEY env var

## Experimentation Process: 
#### First Case: At least 1 correct tool in the pool 
1. Pick a pool size _s_. Draw _s_-1 tools at random and combine them with the _ground-truth tool_ to form one pool of _s_ candidates. Repeat the experiment for different values of _s_.
2. Have an LLM attempt tool calling. Record result as well as latency.
3. Run a cosine-similarity search over the normalized vector space of tool definitions, retrieve the top-_k_ tools (experimenting with different _k_), and attempt step 2 again with only those tools. Record the result as well as the latency.
4. Compare differences in accuracy and latency.

#### Second Case: Possibility of 0 correct tools in the pool
...

### Notes
- `ground_truth` is always of length 1.
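- Some tools appear more than once under the same name, but I think their descriptions change slightly according to the task, so I will not remove them from the tool table based on names alone (i.e., without conditioning on the ground truth). Note that `generate_sample` still de-duplicates by name when it builds a candidate pool.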

In [2]:
# Experiment parameters, read as module-level globals by later cells.
# `s` is consumed by run_row when it builds the candidate pool.
# `k` is not referenced in the cells shown here; presumably used by the retrieval step — confirm.
s = 50 # LLM makes choice out of s potential tools
k = 5 # LLM is given the top-k most semantically 'relevant' tools

In [3]:
# tool_df: tool catalogue; later cells read its 'name', 'oai_format' and 'n_tokens' columns.
# eval_df: evaluation questions with their 'ground_truth' tool calls.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
tool_df = pd.read_pickle('data/ntokens_embeddings_tool_df.pkl')
eval_df = pd.read_pickle('data/multiple_tools.pkl')

In [4]:
# Preview the evaluation set (columns: id, question, function, ground_truth).
eval_df

Unnamed: 0,id,question,function,ground_truth
0,live_multiple_0-0-0,"[[{'role': 'user', 'content': 'update my latte...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['latte'...
1,live_multiple_1-0-1,"[[{'role': 'system', 'content': 'You are an ag...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['1234']...
2,live_multiple_2-1-0,"[[{'role': 'user', 'content': 'Tôi cần một chu...","[{'name': 'uber_ride', 'description': 'Tìm chu...","[{'uber_ride': {'loc': ['2150 Shattuck Ave, Be..."
3,live_multiple_3-2-0,"[[{'role': 'user', 'content': 'Get weather of ...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'api_weather': {'loc': ['Ha Noi, Vietnam']}}]"
4,live_multiple_4-2-1,"[[{'role': 'user', 'content': 'Tìm chuyến xe c...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'uber_ride': {'loc': ['123 Hanoi Street'], '..."
...,...,...,...,...
1048,live_multiple_1048-275-0,"[[{'role': 'user', 'content': 'I need to find ...","[{'name': 'Hotels_2_BookHouse', 'description':...",[{'Hotels_2_SearchHouse': {'where_to': ['Paris...
1049,live_multiple_1049-276-0,"[[{'role': 'user', 'content': 'Can you find me...","[{'name': 'Events_3_FindEvents', 'description'...","[{'Trains_1_FindTrains': {'_from': ['Anaheim, ..."
1050,live_multiple_1050-277-0,"[[{'role': 'user', 'content': 'Can you tell me...","[{'name': 'RideSharing_2_GetRide', 'descriptio...",[{'Weather_1_GetWeather': {'city': ['Atlanta']...
1051,live_multiple_1051-278-0,"[[{'role': 'user', 'content': 'I need to wake ...","[{'name': 'set_alarm', 'description': 'Set an ...",[{'set_alarm': {'alarm_time': ['2023-12-01 07:...


In [5]:
def _tool_names(ground_truth):
    # Each ground-truth entry is a dict keyed by tool name; collect those keys in order.
    return [tool_name for call in ground_truth for tool_name in call.keys()]


# One list of expected tool names per row; this also works when ground_truth has more than one entry.
correct_tool = eval_df['ground_truth'].apply(_tool_names)
eval_df['correct_tool'] = correct_tool

In [6]:
# Placeholder only: this name is reassigned to a copy of eval_df (plus result columns) after the run below.
llm_only_results_df = pd.DataFrame()

### Case 1: At least one correct tool

#### Naive Approach (No Selection Process)

In [7]:
def generate_sample(s: int, correct_tools: list[str], tool_df: pd.DataFrame) -> pd.DataFrame:
    
    # Verify ground_truth tools exist 
    if not all(tool in tool_df['name'].values for tool in correct_tools): 
        raise ValueError("The necessary tools are not here? Unknown tool(s) being referenced: ", correct_tools)

    # Drop duplicates
    tool_df_without_duplicates = tool_df.drop_duplicates(subset=['name'])

    # Take ground_truth row(s)
    correct_rows = tool_df_without_duplicates.loc[tool_df_without_duplicates['name'].isin(correct_tools)]

    # Grab random tools to fill the pool up to size s
    available_df = tool_df_without_duplicates[~tool_df_without_duplicates['name'].isin(correct_tools)]
    random_idx = np.random.choice(len(available_df), size=s - len(correct_tools), replace=False)
    sample = available_df.iloc[random_idx]

    # Combine
    result = pd.concat([sample, correct_rows])
    return result

In [46]:
from pydantic import BaseModel
from typing import Literal

# Labels the judge model may pick from when explaining its verdict.
_JudgeReason = Literal[
    'wrong parameters',
    'required tool call was not made',
    'unnecessary tool call was made',
    'this situation did not require a tool call',
]


class EvalOutput(BaseModel):
    # Structured verdict requested from the judge model in check_answer.
    # Documented with comments rather than a docstring so the generated schema stays unchanged.
    correct: bool          # whether the judged tool call is considered correct
    reason: _JudgeReason   # one of the fixed labels above

In [47]:
from collections import Counter

def check_answer(response, ground_truth: list[dict]) -> tuple[bool, str | None]:
    llm_called_tools = []
    items = []
    ground_truth_names = [key for tool in ground_truth for key in tool.keys()]
    
    for item in response.output: 
        if item.type == "function_call": 
            llm_called_tools.append(item.name) 
            items.append(item)
    correct = Counter(llm_called_tools) == Counter(ground_truth_names)

    if correct: 
        # Compare arguments: ground truth params vs LLM-returned params | Use an LLM to determine correctness for now
        prompt = [
                  {'role': 'system', 'content': 'Evaluate the LLM output and Ground truth for correctness.'},
                  {'role': 'user', 'content': f"Ground Truth: \n {ground_truth} \n\n LLM Output: {items}"}
                 ]
        response = client.responses.parse(
            model="gpt-5-mini",
            input=prompt,
            text_format=EvalOutput
        )
        return response.output_parsed.correct,response.output_parsed.reason
    else: 
        is_subset = Counter(ground_truth_names) <= Counter(llm_called_tools)
        
        if is_subset: #All correct tools WERE called, extraneous tool calls were made.
            return False, 'unnecessary tool call was made'
            
        else:
            return False, 'required tool call was not made'

In [17]:
def run_row(row, model: str = "gpt-5", max_retries: int = 5, retry_delay: float = 10.0):
    """Run one evaluation row: build a tool pool, ask the model to call a tool, grade it.

    Reads the module-level ``s`` (pool size), ``tool_df`` (tool catalogue) and
    ``client`` (OpenAI client).

    Args:
        row: Evaluation row providing ``correct_tool``, ``question`` and
            ``ground_truth``.
        model: Model that performs the tool call.
        max_retries: Maximum number of attempts; only rate-limit errors are retried.
        retry_delay: Seconds to wait before retrying after a rate-limit error.

    Returns:
        ``(correct, reason, latency, response)`` where ``latency`` is the duration
        in seconds of the successful request only.

    Raises:
        ValueError: If the request fails with a non-retryable error, if the retries
            are exhausted, or if ``max_retries`` is less than 1.
    """
    # NOTE(review): a later cell saves results under a file name mentioning "gpt-5-mini",
    # while the default model here is "gpt-5" — confirm which model the results belong to.
    sample_df = generate_sample(
        s=s,
        correct_tools=row["correct_tool"],
        tool_df=tool_df
    )
    tools = sample_df["oai_format"].tolist()

    for attempt in range(max_retries):
        try:
            # perf_counter is monotonic, so the latency cannot be skewed by clock adjustments.
            start_time = time.perf_counter()
            response = client.responses.create(
                model=model,
                input=row["question"][0],
                tools=tools,
                tool_choice='required'
            )
            latency = time.perf_counter() - start_time
            break
        except Exception as e:
            # Prefer the HTTP status carried by the exception; fall back to the message text
            # only when the exception has no status attribute.
            status = getattr(e, "status_code", None)
            rate_limited = status == 429 or (status is None and "429" in str(e))
            if rate_limited and attempt < max_retries - 1:
                time.sleep(retry_delay)
                continue
            # Do not index into `tools` here: a short pool would raise IndexError and hide `e`.
            raise ValueError(f"Error: {e} \n\n (pool of {len(tools)} tools)") from e
    else:
        # Only reachable when the loop body never ran.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    correct, reason = check_answer(response, row["ground_truth"])
    return correct, reason, latency, response

In [18]:
# One slot per evaluation row, filled in as the worker threads finish (in any order).
results = [None] * len(eval_df)

with ThreadPoolExecutor(max_workers=8) as executor:
    # Remember which row position each submitted job belongs to.
    futures = {}
    for position, (_, eval_row) in enumerate(eval_df.iterrows()):
        futures[executor.submit(run_row, eval_row)] = position

    for finished in tqdm(as_completed(futures), total=len(futures)):
        position = futures[finished]
        try:
            outcome = finished.result()
        except Exception as e:
            # Keep the run going: record the failure text in the 'reason' slot.
            outcome = (None, f"error: {e}", None, None)
        results[position] = outcome

# Attach the four result fields to a copy of the evaluation set, one column each.
llm_only_results_df = eval_df.copy()
correct_values, reason_values, latency_values, response_values = zip(*results)
llm_only_results_df["correct"] = correct_values
llm_only_results_df["reason"] = reason_values
llm_only_results_df["latency"] = latency_values
llm_only_results_df["response"] = response_values

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1053/1053 [14:14<00:00,  1.23it/s]


In [23]:
# How often each outcome label occurs in the run.
llm_only_results_df["reason"].value_counts()

reason
wrong parameters                    507
missing required tool call          412
required tool call was not made      67
extraneous unnecessary tool call     52
unnecessary tool call was made       15
Name: count, dtype: int64

In [24]:
# Persist the results of the run as produced, before the relabelling done further down.
llm_only_results_df.to_pickle("data/RESULTS.pkl")

In [25]:
# Work on an independent copy: a plain assignment would only alias the frame, so the
# relabelling applied to fixed_df would silently rewrite llm_only_results_df as well.
fixed_df = llm_only_results_df.copy()

In [27]:
# Spot-check the first rows, including the correct / reason / latency / response columns.
fixed_df.head()

Unnamed: 0,id,question,function,ground_truth,correct_tool,correct,reason,latency,response
0,live_multiple_0-0-0,"[[{'role': 'user', 'content': 'update my latte...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['latte'...,[ChaDri_change_drink],True,missing required tool call,4.166997,Response(id='resp_077bd51d11ae680c006994dfc7c1...
1,live_multiple_1-0-1,"[[{'role': 'system', 'content': 'You are an ag...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['1234']...,[ChaDri_change_drink],False,extraneous unnecessary tool call,4.704834,Response(id='resp_0c7b6d329c71dd8e006994dfc7e1...
2,live_multiple_2-1-0,"[[{'role': 'user', 'content': 'Tôi cần một chu...","[{'name': 'uber_ride', 'description': 'Tìm chu...","[{'uber_ride': {'loc': ['2150 Shattuck Ave, Be...",[uber_ride],False,required tool call was not made,3.723988,Response(id='resp_01fa035c0a605f67006994dfc7c3...
3,live_multiple_3-2-0,"[[{'role': 'user', 'content': 'Get weather of ...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'api_weather': {'loc': ['Ha Noi, Vietnam']}}]",[api_weather],True,extraneous unnecessary tool call,3.624818,Response(id='resp_0a530e07758ed1b8006994dfc7c5...
4,live_multiple_4-2-1,"[[{'role': 'user', 'content': 'Tìm chuyến xe c...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'uber_ride': {'loc': ['123 Hanoi Street'], '...",[uber_ride],False,wrong parameters,6.209589,Response(id='resp_00744d2babd899ef006994dfc7c4...


In [49]:
# Two wordings exist for each tool-selection failure; map every synonym to one canonical label.
REASON_SYNONYMS = {
    "required tool call was not made": "missing required tool call",
    "unnecessary tool call was made": "extraneous unnecessary tool call",
}

# Rows judged correct get a single "Correct" label, replacing whatever failure label they carried.
# `== True` is deliberate: rows that errored hold None in 'correct' and must not match.
fixed_df.loc[fixed_df['correct'] == True, 'reason'] = "Correct"

for legacy_label, canonical_label in REASON_SYNONYMS.items():
    fixed_df.loc[fixed_df['reason'] == legacy_label, 'reason'] = canonical_label

In [50]:
# Outcome distribution after relabelling.
fixed_df['reason'].value_counts()

reason
wrong parameters                    483
Correct                             459
missing required tool call           88
extraneous unnecessary tool call     23
Name: count, dtype: int64

In [51]:
# Persist the relabelled results.
# NOTE(review): the file name says "gpt-5-mini", but run_row sends the tool-calling request to
# "gpt-5" (only the judge in check_answer uses "gpt-5-mini") — confirm the intended name.
# NOTE(review): the "50" in the file name is hard-coded and must be kept in sync with `s` by hand.
fixed_df.to_pickle("data/50-tools-gpt-5-mini.pkl")

In [45]:
# Mean number of definition tokens for a pool of `s` tools (mean tokens per tool x pool size).
# Uses `s` instead of a hard-coded 50 so the estimate follows the configured pool size.
tool_df.n_tokens.mean() * s

np.float64(11075.406893250358)