### Goal of this notebook: recover false negatives due to formatting issues from BFCL_v4

In [1]:
import pandas as pd

In [2]:
df = pd.read_pickle("data/RESULTS_LLM_ONLY.pkl")
df.head()

Unnamed: 0,id,question,function,ground_truth,correct_tool,correct,reason,latency,response
0,live_multiple_0-0-0,"[[{'role': 'user', 'content': 'update my latte...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['latte'...,[ChaDri_change_drink],,error: Error code: 400 - {'error': {'message':...,,
1,live_multiple_1-0-1,"[[{'role': 'system', 'content': 'You are an ag...","[{'name': 'ChaFod', 'description': 'Changes th...",[{'ChaDri_change_drink': {'drink_id': ['1234']...,[ChaDri_change_drink],,error: Error code: 400 - {'error': {'message':...,,
2,live_multiple_2-1-0,"[[{'role': 'user', 'content': 'Tôi cần một chu...","[{'name': 'uber_ride', 'description': 'Tìm chu...","[{'uber_ride': {'loc': ['2150 Shattuck Ave, Be...",[uber_ride],,error: Error code: 400 - {'error': {'message':...,,
3,live_multiple_3-2-0,"[[{'role': 'user', 'content': 'Get weather of ...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'api_weather': {'loc': ['Ha Noi, Vietnam']}}]",[api_weather],,error: Error code: 400 - {'error': {'message':...,,
4,live_multiple_4-2-1,"[[{'role': 'user', 'content': 'Tìm chuyến xe c...","[{'name': 'uber_ride', 'description': 'Finds a...","[{'uber_ride': {'loc': ['123 Hanoi Street'], '...",[uber_ride],,error: Error code: 400 - {'error': {'message':...,,


In [16]:
import json
from collections import Counter, defaultdict

def _value_matches(llm_value, gt_alternatives):
    """Check if llm_value matches ANY of the acceptable ground-truth alternatives."""
    for acceptable in gt_alternatives:
        if isinstance(acceptable, dict):
            # Nested params dict — recurse
            if isinstance(llm_value, dict) and _dict_matches(llm_value, acceptable):
                return True
        elif isinstance(llm_value, str) and isinstance(acceptable, str):
            if llm_value.strip().lower() == acceptable.strip().lower():
                return True
        else:
            if llm_value == acceptable:
                return True
    return False


def _dict_matches(llm_args, gt_params):
    """Compare LLM args dict against a GT params dict whose values are lists of acceptable alternatives."""
    if set(llm_args.keys()) != set(gt_params.keys()):
        return False
    return all(_value_matches(llm_args[k], gt_params[k]) for k in gt_params)


def flexible_check_answer(response, ground_truth):
    """
    Re-evaluate a response against ground truth using flexible matching:
      - GT values are lists of acceptable alternatives (any match counts)
      - String comparison is case-insensitive and whitespace-trimmed
      - Nested dicts are compared recursively with the same rules
    
    Returns (correct: bool, reason: str | None)
    """
    llm_calls = [
        (item.name, json.loads(item.arguments))
        for item in response.output if item.type == "function_call"
    ]
    gt_calls = [(name, params) for tool in ground_truth for name, params in tool.items()]

    # 1. Check tool names
    llm_names = Counter(name for name, _ in llm_calls)
    gt_names = Counter(name for name, _ in gt_calls)

    if llm_names != gt_names:
        if gt_names <= llm_names:
            return False, 'unnecessary tool call was made'
        return False, 'required tool call was not made'

    # 2. Check params — match each GT call to an LLM call by name
    llm_by_name = defaultdict(list)
    for name, args in llm_calls:
        llm_by_name[name].append(args)

    for name, gt_params in gt_calls:
        candidates = llm_by_name[name]
        matched = False
        for i, llm_args in enumerate(candidates):
            if _dict_matches(llm_args, gt_params):
                candidates.pop(i)
                matched = True
                break
        if not matched:
            return False, 'misparameterized the correct tool call'

    return True, "succeeded"

In [17]:
  df[['flex_correct', 'flex_reason']] = df.apply(
      lambda row: flexible_check_answer(row['response'], row['ground_truth']),
      axis=1, result_type='expand'
  )

In [18]:
df.flex_reason.value_counts()

flex_reason
required tool call was not made           402
succeeded                                 377
misparameterized the correct tool call    259
unnecessary tool call was made             15
Name: count, dtype: int64

In [24]:
sample = df[df['flex_reason'] == 'misparameterized the correct tool call']['response'].iloc[0]
y = df[df['flex_reason'] == 'misparameterized the correct tool call']['ground_truth'].iloc[0]
for item in sample.output:
    if item.type == "function_call": 
        print(item, "\n\n\n", y)

ResponseFunctionToolCall(arguments='{"drink_id":"1234","new_preferences":{"size":"medium","temperature":"hot","sweetness_level":"none","milk_type":"regular","special_instructions":"No sugar; serve hot."}}', call_id='call_0o24qepE0oAks01rp5SdEZwJ', name='ChaDri_change_drink', type='function_call', id='fc_0c55cd9622ba25f1006993e67527fc81938578ee08f2b8a6c0', status='completed') 


 [{'ChaDri_change_drink': {'drink_id': ['1234'], 'new_preferences': [{'sweetness_level': ['none'], 'temperature': ['hot']}]}}]
