In [12]:
import pandas as pd
import copy
import json
from collections import Counter

In [13]:
# Load the tool dataframe. Later cells read its "parameters", "oai_format",
# "formatted" and "name" columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df = pd.read_pickle("data/pkls/ntokens_embeddings_tool_df.pkl") 

In [14]:
# Audit: collect all type values from raw parameters and current oai_format
def collect_types(schema, counter=None):
    """Tally every ``type`` value found in a schema and its nested sub-schemas.

    Walks ``properties`` and ``items`` recursively. Other schema keywords
    are not traversed.

    Args:
        schema: A JSON-Schema-like dict. Anything that is not a dict is
            ignored, so malformed entries do not abort the audit.
        counter: Optional ``Counter`` to accumulate into. A fresh one is
            created when omitted.

    Returns:
        The ``Counter`` that was updated (the same object as ``counter``
        when one was supplied).
    """
    if counter is None:
        counter = Counter()
    if not isinstance(schema, dict):
        return counter

    if "type" in schema:
        declared = schema["type"]
        # A union type is written as a list, which is unhashable and cannot
        # be used as a Counter key; count each member on its own instead.
        members = declared if isinstance(declared, list) else [declared]
        for member in members:
            counter[member] += 1

    props = schema.get("properties")
    if isinstance(props, dict):
        for sub_schema in props.values():
            collect_types(sub_schema, counter)

    items = schema.get("items")
    if isinstance(items, dict):
        collect_types(items, counter)
    elif isinstance(items, list):
        # Tuple-style ``items``: one sub-schema per position.
        for sub_schema in items:
            collect_types(sub_schema, counter)
    return counter

# Tally type names separately for the raw schemas and the existing oai_format schemas.
raw_types, oai_types = Counter(), Counter()
for raw_params, oai_tool in zip(df["parameters"], df["oai_format"]):
    collect_types(raw_params, raw_types)
    collect_types(oai_tool.get("parameters", {}), oai_types)

print("Raw parameters types:", raw_types.most_common())
print()
print("Current oai_format types:", oai_types.most_common())

Raw parameters types: [('string', 11867), ('dict', 4241), ('integer', 1982), ('boolean', 1604), ('float', 341), ('array', 244), ('any', 7), ('tuple', 3)]

Current oai_format types: [('string', 11874), ('object', 4241), ('integer', 1982), ('boolean', 1604), ('number', 341), ('array', 247)]


In [15]:
# Type mapping: non-standard → OAI-compatible JSON Schema types
# "any" is handled specially: we remove the type key entirely,
# which in JSON Schema means "unconstrained" (accepts any type)
TYPE_MAP = {
    "dict": "object",
    "float": "number",
    "tuple": "array",
}


def _fix_types_in_place(schema) -> None:
    """Normalize ``type`` on ``schema`` and its nested sub-schemas, mutating it."""
    if not isinstance(schema, dict):
        return  # malformed entry (e.g. a bare string): leave it untouched

    declared = schema.get("type")
    if isinstance(declared, list):
        # Union form such as ["string", "float"]. A union containing "any"
        # is unconstrained, so the whole key is dropped like the scalar case.
        if "any" in declared:
            del schema["type"]
        else:
            schema["type"] = [
                TYPE_MAP.get(member, member) if isinstance(member, str) else member
                for member in declared
            ]
    elif isinstance(declared, str):
        if declared == "any":
            del schema["type"]
        elif declared in TYPE_MAP:
            schema["type"] = TYPE_MAP[declared]

    props = schema.get("properties")
    if isinstance(props, dict):
        for sub_schema in props.values():
            _fix_types_in_place(sub_schema)

    items = schema.get("items")
    if isinstance(items, dict):
        _fix_types_in_place(items)
    elif isinstance(items, list):
        # Tuple-style ``items``: one sub-schema per position.
        for sub_schema in items:
            _fix_types_in_place(sub_schema)


def fix_types(schema: dict) -> dict:
    """Return a copy of ``schema`` with non-standard type names normalized.

    Type names listed in ``TYPE_MAP`` are replaced by their mapped value and
    a ``type`` of ``"any"`` is removed entirely. The rewrite is applied
    recursively through ``properties`` and ``items``; other schema keywords
    are not traversed.

    Args:
        schema: JSON-Schema-like dict describing a tool's parameters.

    Returns:
        A new, independent schema. The input is never modified.
    """
    # Copy once up front and rewrite the copy in place; copying again at
    # every recursion level would re-copy each subtree once per ancestor.
    fixed = copy.deepcopy(schema)
    _fix_types_in_place(fixed)
    return fixed

def to_oai_tool(row: dict) -> dict:
    """Build a function-tool definition from one tool record.

    Args:
        row: Mapping providing ``"name"``, ``"description"`` and
            ``"parameters"`` entries.

    Returns:
        A dict with ``type`` set to ``"function"``, the record's name and
        description, and its parameters passed through ``fix_types``.
    """
    tool = {"type": "function"}
    tool["name"] = row["name"]
    tool["description"] = row["description"]
    tool["parameters"] = fix_types(row["parameters"])
    return tool

In [16]:
# Rebuild oai_format by converting each entry of the "formatted" column with to_oai_tool.
# NOTE(review): assumes every "formatted" entry is a mapping with "name",
# "description" and "parameters" keys (the keys to_oai_tool reads) — confirm
# against whatever produced that column.
# The existing oai_format column is overwritten in place.
df["oai_format"] = df["formatted"].apply(to_oai_tool)
print(f"Rebuilt oai_format for {len(df)} rows")

Rebuilt oai_format for 4178 rows


In [17]:
# Validate: check that no non-standard types remain in oai_format
OAI_VALID_TYPES = {"string", "number", "integer", "boolean", "array", "object", "null"}


def find_bad_types(schema, path=""):
    """List every ``type`` value in ``schema`` that is not in ``OAI_VALID_TYPES``.

    Walks ``properties`` and ``items`` recursively. Other schema keywords
    are not traversed, so types nested under them are not checked.

    Args:
        schema: JSON-Schema-like dict. Non-dict values yield no issues.
        path: Dotted location of ``schema`` within the root schema, used to
            label findings. The empty string denotes the root.

    Returns:
        A list of ``(path, type_value)`` tuples, one per offending type.
        The root schema is reported under the path ``"root"``.
    """
    issues = []
    if not isinstance(schema, dict):
        return issues

    declared = schema.get("type")
    if declared is not None:
        # A union type is written as a list; check each member. The
        # isinstance guard also keeps unhashable values away from the set lookup.
        members = declared if isinstance(declared, list) else [declared]
        for member in members:
            if not isinstance(member, str) or member not in OAI_VALID_TYPES:
                issues.append((path or "root", member))

    props = schema.get("properties")
    if isinstance(props, dict):
        for key, sub_schema in props.items():
            issues.extend(find_bad_types(sub_schema, f"{path}.{key}"))

    items = schema.get("items")
    if isinstance(items, dict):
        issues.extend(find_bad_types(items, f"{path}.items"))
    elif isinstance(items, list):
        # Tuple-style ``items``: one sub-schema per position.
        for pos, sub_schema in enumerate(items):
            issues.extend(find_bad_types(sub_schema, f"{path}.items[{pos}]"))
    return issues

# Run the validator over every rebuilt tool and collect the offending rows.
all_issues = []
for idx, name, tool in zip(df.index, df["name"], df["oai_format"]):
    issues = find_bad_types(tool.get("parameters", {}))
    if issues:
        all_issues.append((idx, name, issues))

if all_issues:
    print(f"FAIL: {len(all_issues)} rows with non-standard types:")
    for idx, name, issues in all_issues[:10]:
        print(f"  Row {idx} ({name}): {issues}")
    # Fail loudly so a "Run All" stops here instead of continuing on to
    # persist a dataframe that still contains invalid schemas.
    raise ValueError(
        f"{len(all_issues)} oai_format schemas still contain non-standard types"
    )
else:
    print("PASS: All oai_format schemas use valid JSON Schema types")

PASS: All oai_format schemas use valid JSON Schema types


In [18]:
# Spot-check: print the converted schema of every tool whose raw parameters declared type "any"
for i, row in df.iterrows():
    # Cheap textual probe on the dict's repr; skip rows without an "any" type.
    if "'type': 'any'" not in str(row["parameters"]):
        continue
    print(f"Row {i} — {row['name']}")
    converted = row["oai_format"]["parameters"]
    print(json.dumps(converted, indent=2))
    print()

Row 307 — default_add_default_value
{
  "type": "object",
  "required": [
    "dict",
    "key",
    "default_value"
  ],
  "properties": {
    "dict": {
      "type": "object",
      "description": "The dictionary to which the default value should be added.",
      "properties": {}
    },
    "key": {
      "type": "string",
      "description": "The key for which the default value is to be set."
    },
    "default_value": {
      "description": "The value to set for the key if it does not exist in the dictionary. The type of this value can be any valid Python data type."
    }
  }
}

Row 312 — default_add_default_value
{
  "type": "object",
  "required": [
    "dict",
    "key",
    "default_value"
  ],
  "properties": {
    "dict": {
      "type": "object",
      "description": "The dictionary to which the default value should be added.",
      "properties": {}
    },
    "key": {
      "type": "string",
      "description": "The key for which the default value is to be set."
    }

In [19]:
# Save updated dataframe.
# NOTE: this is the same path the dataframe was loaded from at the top of the
# notebook, so the input file is overwritten with the rebuilt oai_format column.
df.to_pickle("data/pkls/ntokens_embeddings_tool_df.pkl")
print("Saved to data/pkls/ntokens_embeddings_tool_df.pkl")

Saved to data/pkls/ntokens_embeddings_tool_df.pkl
