In [19]:
import pandas as pd 
import numpy as np
import os
from tqdm import tqdm 
from openai import OpenAI

In [20]:
# Shared OpenAI API client used for the embedding requests further down.
# NOTE(review): no credentials are passed here — presumably the API key is picked up
# from the environment (e.g. OPENAI_API_KEY); confirm before running on a fresh machine.
client = OpenAI()

In [21]:
# Load the two pickled input frames:
#   tool_df – one row per tool definition (name, description, parameters)
#   df      – benchmark samples (id, question, function, ground_truth)
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
tool_df = pd.read_pickle("data/tool_dataframe.pkl")
df = pd.read_pickle("data/multiple_tools.pkl")

In [22]:
# Preview the first rows of the benchmark samples.
df.head()

Unnamed: 0,id,question,function,ground_truth
0,live_multiple_0-0-0,"[[{'role': 'user', 'content': 'update my latte...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['latte'...
1,live_multiple_1-0-1,"[[{'role': 'system', 'content': 'You are an ag...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['1234']...
2,live_multiple_2-1-0,"[[{'role': 'user', 'content': 'Tôi cần một chu...","[{'name': 'uber_ride', 'description': 'Tìm chu...","[{'uber_ride': {'loc': ['2150 Shattuck Ave, Be..."
3,live_multiple_3-2-0,"[[{'role': 'user', 'content': 'Get weather of ...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'api_weather': {'loc': ['Ha Noi, Vietnam']}}]"
4,live_multiple_4-2-1,"[[{'role': 'user', 'content': 'Tìm chuyến xe c...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'uber_ride': {'loc': ['123 Hanoi Street'], '..."


In [23]:
# Preview the first rows of the tool definitions.
tool_df.head()

Unnamed: 0,name,description,parameters
0,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr..."
1,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['new_preferences..."
2,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr..."
3,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['drink_id', 'new..."
4,uber_ride,Tìm chuyến đi phù hợp cho khách hàng dựa trên ...,"{'type': 'dict', 'required': ['loc', 'type', '..."


### Embedding tool names & descriptions
Simple operation: for each tool, build the string `"Name: " + name + "\n" + "Description: " + description` and embed it with OpenAI's `text-embedding-3-large` model.

In [44]:
# Text that gets embedded for each tool: a "Name: ..." line followed by a
# "Description: ..." line.
tool_df['embedding_input'] = (
    "Name: " + tool_df['name']
    + "\n"
    + "Description: " + tool_df['description']
)

In [54]:
def batch(l, n):
    """Yield consecutive slices of ``l`` containing at most ``n`` items each.

    Args:
        l: A sliceable sequence that supports ``len()`` (list, tuple, str, ...).
        n: Maximum number of items per slice; must be a positive integer.

    Yields:
        ``l[i:i + n]`` for ``i = 0, n, 2n, ...``. The final slice may be shorter
        than ``n``. Nothing is yielded for an empty sequence.

    Raises:
        ValueError: If ``n`` is not positive. Because this is a generator, the
            error surfaces when iteration starts, not when ``batch`` is called.
    """
    # A negative step would make range() empty and silently drop every item, and a
    # zero step fails with an unrelated-looking range() error, so reject both up front.
    if n <= 0:
        raise ValueError(f"batch size n must be a positive integer, got {n!r}")
    for start in range(0, len(l), n):
        yield l[start:start + n]

In [61]:
# Accumulator for the embedding vectors; the embedding loop appends to it and the
# result is later stored as tool_df['embeddings'].
embeddings = []

In [66]:
# Embed every tool's `embedding_input` text in batches of 100 inputs per request.
embedding_inputs = tool_df['embedding_input'].tolist()

# Start from an empty list inside this cell so that re-running it does not append a
# second copy of every vector (which would break the later column assignment).
embeddings = []

# The batch generator has no len(), so give tqdm the batch count explicitly
# (ceiling division) to get a real progress bar instead of a bare counter.
n_batches = -(-len(embedding_inputs) // 100)

for x_b in tqdm(batch(embedding_inputs, 100), total=n_batches, desc="Embedding..."):
    response = client.embeddings.create(
        model="text-embedding-3-large",
        input=x_b
    )
    # Each returned item carries the index of its input within the request; order by
    # it so vectors stay aligned with tool_df rows regardless of response ordering.
    embeddings.extend(
        item.embedding for item in sorted(response.data, key=lambda item: item.index)
    )

# One vector per input row is required for the later tool_df['embeddings'] assignment.
assert len(embeddings) == len(embedding_inputs), (
    f"expected {len(embedding_inputs)} embeddings, got {len(embeddings)}"
)


Embedding...: 42it [00:37,  1.13it/s]


In [69]:
# Attach the vectors to their tools. This relies on `embeddings` holding exactly one
# vector per row of tool_df, in row order (it was built from
# tool_df['embedding_input'].tolist()).
tool_df['embeddings'] = embeddings

In [71]:
# Persist the frame with its embeddings first, then end the cell with the preview:
# a notebook only renders the value of a cell's last expression, so a head() call
# placed before to_pickle() is silently discarded.
tool_df.to_pickle("data/embeddings_tool_df.pkl")
tool_df.head()

### Count Tokens

In [24]:
import tiktoken

# Bundle each tool's name, description and parameters into a single dict per row.
tool_df['formatted'] = tool_df[['name', 'description', 'parameters']].to_dict(orient='records')
tool_df

Unnamed: 0,name,description,parameters,formatted
0,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr...","{'name': 'ChaFod', 'description': 'Changes the..."
1,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['new_preferences...","{'name': 'ChaDri_change_drink', 'description':..."
2,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr...","{'name': 'ChaFod', 'description': 'Changes the..."
3,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['drink_id', 'new...","{'name': 'ChaDri_change_drink', 'description':..."
4,uber_ride,Tìm chuyến đi phù hợp cho khách hàng dựa trên ...,"{'type': 'dict', 'required': ['loc', 'type', '...","{'name': 'uber_ride', 'description': 'Tìm chuy..."
...,...,...,...,...
4173,Weather_1_GetWeather,Retrieves the weather information for a specif...,"{'type': 'dict', 'required': ['city'], 'proper...","{'name': 'Weather_1_GetWeather', 'description'..."
4174,set_alarm,Set an alarm for a specific time. The time can...,"{'type': 'dict', 'required': ['alarm_time'], '...","{'name': 'set_alarm', 'description': 'Set an a..."
4175,set_countdown,Sets a countdown timer for a specified duratio...,"{'type': 'dict', 'required': ['duration'], 'pr...","{'name': 'set_countdown', 'description': 'Sets..."
4176,set_volume,Set the global volume for all audio playback. ...,"{'type': 'dict', 'required': ['volume'], 'prop...","{'name': 'set_volume', 'description': 'Set the..."


In [74]:
# Look up the tokenizer that tiktoken associates with the "gpt-5" model name.
encoding = tiktoken.encoding_for_model("gpt-5")
# Round-trip sanity check: encoding then decoding a sample string must reproduce it exactly.
assert encoding.decode(encoding.encode("hello world")) == "hello world"

In [75]:
# Token count of each tool spec, measured on the Python repr (str(dict)) of the
# `formatted` dict.
tool_df['n_tokens'] = [
    len(encoding.encode(str(spec))) for spec in tool_df['formatted']
]

# Replace literal dots in tool names with underscores (plain-text replace, not regex).
# NOTE: the dicts already stored in `formatted` were built before this rename and
# therefore keep the original, unsanitised names.
tool_df["name"] = tool_df["name"].str.replace(".", "_", regex=False)

In [76]:
# Preview the frame with the new n_tokens column.
tool_df.head()

Unnamed: 0,name,description,parameters,formatted,embeddings,n_tokens
0,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr...","{'name': 'ChaFod', 'description': 'Changes the...","[-0.02990127168595791, -0.012279253453016281, ...",192
1,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['new_preferences...","{'name': 'ChaDri_change_drink', 'description':...","[0.001195563469082117, -0.008156399242579937, ...",332
2,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr...","{'name': 'ChaFod', 'description': 'Changes the...","[-0.02990127168595791, -0.012279253453016281, ...",192
3,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['drink_id', 'new...","{'name': 'ChaDri_change_drink', 'description':...","[0.001195563469082117, -0.008156399242579937, ...",324
4,uber_ride,Tìm chuyến đi phù hợp cho khách hàng dựa trên ...,"{'type': 'dict', 'required': ['loc', 'type', '...","{'name': 'uber_ride', 'description': 'Tìm chuy...","[-0.007943605072796345, -0.03746823966503143, ...",190


In [89]:
def to_oai_tool(tool: dict) -> dict:
    """Convert a tool spec into a flat OpenAI-style function-tool dict.

    The parameter schema is copied with Python-flavoured type names rewritten to
    JSON Schema type names (for example ``"dict"`` -> ``"object"``). The rewrite is
    applied recursively through ``properties``, ``items`` and
    ``additionalProperties``. The input ``tool`` is not modified.

    Args:
        tool: Mapping with the keys ``"name"``, ``"description"`` and
            ``"parameters"`` (a schema dict).

    Returns:
        ``{"type": "function", "name": ..., "description": ..., "parameters": ...}``
        where ``parameters`` is the converted schema.

    Raises:
        KeyError: If ``tool`` lacks one of the three required keys.
    """
    TYPE_MAP = {
        "dict": "object",
        "float": "number",
        "any": "string",
        "tuple": "array",
        # Further Python-style spellings that are not valid JSON Schema type names.
        "list": "array",
        "bool": "boolean",
        "int": "integer",
        "str": "string",
    }

    def map_type(type_name):
        # Only strings can be dictionary keys here; anything else passes through.
        if isinstance(type_name, str):
            return TYPE_MAP.get(type_name, type_name)
        return type_name

    def fix_types(schema):
        # Sub-schemas are occasionally not dicts (e.g. a bare string or a boolean
        # `additionalProperties`); leave those untouched instead of crashing.
        if not isinstance(schema, dict):
            return schema

        schema = schema.copy()  # shallow copy: nested parts are replaced, never mutated

        if "type" in schema:
            declared = schema["type"]
            if isinstance(declared, list):
                # JSON Schema allows a list of types (a union); a list is unhashable,
                # so it must be mapped element-wise rather than looked up directly.
                schema["type"] = [map_type(t) for t in declared]
            else:
                schema["type"] = map_type(declared)

        if isinstance(schema.get("properties"), dict):
            schema["properties"] = {
                key: fix_types(sub) for key, sub in schema["properties"].items()
            }

        # `items` and `additionalProperties` may each hold a nested schema.
        for nested_key in ("items", "additionalProperties"):
            if isinstance(schema.get(nested_key), dict):
                schema[nested_key] = fix_types(schema[nested_key])

        return schema

    return {
        "type": "function",
        "name": tool["name"],
        "description": tool["description"],
        "parameters": fix_types(tool["parameters"]),
    }

In [90]:
# Build each OpenAI tool definition from the row's `formatted` spec, but take the name
# from the `name` column. That column had "." replaced by "_" after `formatted` was
# created, so the dicts in `formatted` can still carry the original dotted names;
# using them directly would leak unsanitised names into `oai_format`.
tool_df['oai_format'] = [
    to_oai_tool({**spec, 'name': name})
    for spec, name in zip(tool_df['formatted'], tool_df['name'])
]
tool_df.head()

Unnamed: 0,name,description,parameters,formatted,embeddings,n_tokens,oai_format
0,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr...","{'name': 'ChaFod', 'description': 'Changes the...","[-0.02990127168595791, -0.012279253453016281, ...",192,"{'type': 'function', 'name': 'ChaFod', 'descri..."
1,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['new_preferences...","{'name': 'ChaDri_change_drink', 'description':...","[0.001195563469082117, -0.008156399242579937, ...",332,"{'type': 'function', 'name': 'ChaDri_change_dr..."
2,ChaFod,Changes the food item based on the customer's ...,"{'type': 'dict', 'required': ['foodItem'], 'pr...","{'name': 'ChaFod', 'description': 'Changes the...","[-0.02990127168595791, -0.012279253453016281, ...",192,"{'type': 'function', 'name': 'ChaFod', 'descri..."
3,ChaDri_change_drink,Modifies the existing drink order to accommoda...,"{'type': 'dict', 'required': ['drink_id', 'new...","{'name': 'ChaDri_change_drink', 'description':...","[0.001195563469082117, -0.008156399242579937, ...",324,"{'type': 'function', 'name': 'ChaDri_change_dr..."
4,uber_ride,Tìm chuyến đi phù hợp cho khách hàng dựa trên ...,"{'type': 'dict', 'required': ['loc', 'type', '...","{'name': 'uber_ride', 'description': 'Tìm chuy...","[-0.007943605072796345, -0.03746823966503143, ...",190,"{'type': 'function', 'name': 'uber_ride', 'des..."


In [91]:
# Persist the enriched frame first, then end the cell with the summary statistics:
# a notebook only renders the value of a cell's last expression, so a describe() call
# placed before to_pickle() is computed and then thrown away.
tool_df.to_pickle('data/ntokens_embeddings_tool_df.pkl')
tool_df.n_tokens.describe()