# Analysis of the Results

Test cases are generated automatically for a given set of tools. This notebook analyzes the quality and diversity of the generated questions, and whether they really span the requested combinations of tools.

Note that GPT may generate `functions.<function_name>` instead of just the function name and we account for that.

## Test case generation

We asked the LLMs to generate test cases covering all of their tools. We would like 2 tests per tool (individually) and 2 tests that require exactly 2 of the tools to be answered correctly. Let us verify whether this generation was correct.

For each test generation strategy, we want to check:

- Number of test cases generated per tool (individually);
- Number of test cases generated per tool (considering pairs);
- How often the model generated questions that used the tool requested;
- Manually, how many of the questions really need the tools planned.

### Set up and read files

In [None]:
import os
import json
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

import self_test_utils as stu

In [None]:
# Names of the test generation strategies under analysis.
# NOTE(review): presumably these match the prefix of the test-case file names
# (the part before '_test_cases_') - confirm against the generated files.
test_gen_strategies = [
    'use_all',
    'only_selected',
    'selected_with_dummies',
]

In [None]:
# All JSON files in the working directory. os.listdir() returns entries in
# arbitrary order, so sort them to keep the row order of the loaded data
# reproducible across runs and platforms.
test_files = sorted(x for x in os.listdir() if x.endswith('.json'))

In [None]:
def read_generated_tests(files=None):
    """Load the generated test-case files into a single DataFrame.

    File names are expected to follow the pattern
    ``<gen_strategy>_test_cases_<model>.json``. The two parts are stored in
    the ``gen_strategy`` and ``model`` columns of the rows read from that file.
    Files whose name does not contain ``_test_cases_`` are skipped.

    Args:
        files: Optional iterable of file paths to read. When omitted, the
            module-level ``test_files`` list is used.

    Returns:
        The concatenation of all loaded files (original per-file indices are
        kept), or ``None`` when no test-case file was found.
    """
    if files is None:
        files = test_files

    marker = '_test_cases_'
    extension = '.json'
    frames = []
    for cur_file in files:
        # Only the file name carries the metadata, not its directory.
        file_info = os.path.basename(cur_file).split(marker)
        if len(file_info) < 2:
            # Not a test-case file (e.g. some other JSON in the folder):
            # skip it instead of failing with an IndexError.
            continue

        with open(cur_file, 'r', encoding='utf-8') as f:
            contents = json.load(f)

        model = file_info[1]
        if model.endswith(extension):
            model = model[:-len(extension)]

        df = pd.DataFrame(contents)
        df['gen_strategy'] = file_info[0]
        df['model'] = model
        frames.append(df)

    if not frames:
        return None
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames)


# Load every generated test case once; the analysis cells below start from df_tests.
df_tests = read_generated_tests()
df_tests

In [None]:
# compute all tools that were tested
# Entries containing a comma are tool combinations, so only the single-tool
# names are kept. Sorting makes the column order of the tables and plots below
# reproducible (iterating a set of strings has no stable order between runs).
all_tools = sorted(
    x for x in set(df_tests.expected_tool_to_gen_test.dropna()) if ',' not in x
)
all_tools

### Number of test cases generated per tool (individually)

In [None]:
# One boolean column per tool: True when the tool name appears in the
# test case's `appropriate_tools` (after dropping GPT's 'functions.' prefix).
# NOTE(review): this is a substring check on the stringified value, so a tool
# whose name is contained in another tool's name would also match - confirm
# that tool names are not prefixes/substrings of each other.
df_tests_per_tool = df_tests.copy()
for t in all_tools:
    tool_flags = df_tests_per_tool.appropriate_tools.map(
        lambda z: t in str(z).replace('functions.', '')
    )
    df_tests_per_tool[t] = tool_flags

cols = ['model', 'gen_strategy', *all_tools]
tests_by_model_and_strategy = df_tests_per_tool[cols].groupby(by=['model', 'gen_strategy'])
tests_by_model_and_strategy.sum()

In [None]:
# Horizontal bar chart of the per-tool test counts for each (model, strategy) pair
tests_per_tool_counts = df_tests_per_tool[cols].groupby(by=['model', 'gen_strategy']).sum()
tests_per_tool_counts.plot.barh(
    figsize=(15, 15),
    title='Number of test cases generated per tool/strategy',
)
# plt.xticks(rotation = 45)

In [None]:
# Coverage = fraction of the tools that got at least one generated test case
tool_counts = df_tests_per_tool[cols].groupby(by=['model', 'gen_strategy']).sum()
tool_has_tests = tool_counts[all_tools].values > 0
tool_counts['coverage'] = np.sum(tool_has_tests, axis=1) / len(all_tools)
df_agg = tool_counts.sort_values(by='coverage', ascending=False)
df_agg[['coverage']].plot.barh(title='Single tool test coverage')

In [None]:
# missing coverage
# For every (model, strategy) row, list the tools with zero generated tests
all_missing = [
    [tool for tool in all_tools if counts[tool] == 0]
    for _, counts in df_agg[all_tools].iterrows()
]
df_agg['missing_tools'] = all_missing

# Only show the rows that do not cover every tool
incomplete_rows = df_agg[df_agg.coverage < 1]
incomplete_rows[['coverage', 'missing_tools']]

### Number of test cases generated per tool (considering pairs)

In [None]:
# build a dataframe with boolean flags for tool combinations

# Every single tool, followed by every unordered pair of distinct tools
single_tools = [[tool] for tool in all_tools]
tool_pairs = [
    [first, second]
    for pos, first in enumerate(all_tools[:-1])
    for second in all_tools[pos + 1:]
]
all_tools_and_pairs = single_tools + tool_pairs

df_pair_tests_per_tool = df_tests.copy()
df_pair_tests_per_tool['invented_tools'] = df_pair_tests_per_tool.appropriate_tools.map(
    lambda z: stu.detect_invented_tools(z, all_tools)
)

# One flag column per combination, named by the comma-joined tool names
tool_cols = [','.join(combo) for combo in all_tools_and_pairs]
for combo in all_tools_and_pairs:
    combo_col = ','.join(combo)
    df_pair_tests_per_tool[combo_col] = df_pair_tests_per_tool.appropriate_tools.map(
        lambda z: stu.belongs_to_col(z, combo)
    )

In [None]:
# df_agg.columns

In [None]:
# Number of generated test cases per tool combination and (model, strategy)
cols = ['model', 'gen_strategy', *tool_cols]
pair_tests_by_model_and_strategy = df_pair_tests_per_tool[cols].groupby(by=['model', 'gen_strategy'])
pair_tests_by_model_and_strategy.sum()

In [None]:
# Coverage = fraction of the single tools and tool pairs with at least one test case
combo_counts = df_pair_tests_per_tool[cols].groupby(by=['model', 'gen_strategy']).sum()
combo_has_tests = combo_counts[tool_cols].values > 0
combo_counts['coverage'] = np.sum(combo_has_tests, axis=1) / len(tool_cols)
df_agg = combo_counts.sort_values(by='coverage', ascending=False)
df_agg[['coverage']].plot.barh(title='Single and pairwise tool test coverage')

In [None]:
# missing coverage and invented tools
from IPython.display import display, Math, Latex

# For every (model, strategy) row, list the combinations with zero generated tests
all_missing = [
    [combo for combo in tool_cols if counts[combo] == 0]
    for _, counts in df_agg[tool_cols].iterrows()
]
df_agg['missing_tools'] = all_missing

# NOTE(review): coverage is a fraction, so `<= 1` keeps every row with a
# defined coverage, whereas the single-tool cell filters with `< 1` -
# confirm that showing all rows is intended here.
rows_to_report = df_agg[df_agg.coverage <= 1]
latex_tbl = rows_to_report[['coverage']].to_latex(float_format="{:.2f}".format,)
rows_to_report[['coverage', 'missing_tools']]

In [None]:
# export for paper
# print(latex_tbl.replace('_', '\\_').replace(' - Anthropic', '').replace(' - OpenAI', '').replace(' - Bedrock', ''))

In [None]:
# Number of test cases per (model, strategy) whose `invented_tools` entry is not the empty string
invented_mask = df_pair_tests_per_tool['invented_tools'] != ''
tests_with_invented_tools = df_pair_tests_per_tool[invented_mask]
tests_with_invented_tools.groupby(['model', 'gen_strategy']).count()['question']

In [None]:
# [stu.detect_invented_tools(x) for x in df_tests.appropriate_tools[20:40]]

In [None]:
# set([stu.detect_invented_tools(x) for x in df_tests.appropriate_tools])

## Manual verification

In [None]:
# Test-case files to compare against human annotations; the rows are read
# below and must provide `human_verified_tools` and `appropriate_tools`.
manually_verified_files = [
    'selected_with_dummies_test_cases_GPT 4o - OpenAI.json',
    'selected_with_dummies_test_cases_Claude 3.5 Sonnet - Anthropic.json',
]

In [None]:
def check_human_answers(df, file):
    """Compare the generated tools of each test case with the human-verified ones.

    For every row, `human_verified_tools` and `appropriate_tools` are
    JSON-encoded and handed (in that order) to
    `stu.is_tool_selection_correct` together with `all_tools`. The four values
    it returns are stored in the `is_correct`, `score`, `invented_tools` and
    `is_empty_pred` columns, and `file` is stored in the `file` column.

    The DataFrame is modified in place and also returned.
    """
    result_columns = ('is_correct', 'score', 'invented_tools', 'is_empty_pred')
    collected = {name: [] for name in result_columns}

    for _, row in df.iterrows():
        is_correct, score, invented_tools, is_empty_pred = stu.is_tool_selection_correct(
            json.dumps(row['human_verified_tools']),
            json.dumps(row['appropriate_tools']),
            all_tools
        )
        row_values = (is_correct, score, invented_tools, is_empty_pred)
        for name, value in zip(result_columns, row_values):
            collected[name].append(value)

    # Add the columns in the same order as the values are returned
    for name in result_columns:
        df[name] = collected[name]
    df['file'] = file
    return df

In [None]:
# Read each manually verified file, score it against the human answers and
# stack the results into a single dataframe (None when there are no files).
manual_verif_dfs = None
for cur_file in manually_verified_files:
    with open(cur_file, 'r') as f:
        cur_data = json.load(f)
    df = pd.DataFrame(cur_data)
    # check_human_answers adds its result columns to df in place
    check_human_answers(df, cur_file)
    manual_verif_dfs = df if manual_verif_dfs is None else pd.concat([manual_verif_dfs, df])

In [None]:
# Quick look at the first two manually verified rows
manual_verif_dfs.head(2)

In [None]:
# Per-file summary: number of questions, share of correct rows, mean score
# and number of empty predictions.
manual_verif_aggregations = {
    'question': ['count'],
    'is_correct': ['mean'],
    'score': ['mean'],
    'is_empty_pred': ['sum'],
    # 'n_invented_tools': ['sum'],
}
manual_verif_dfs.groupby(by=['file']).agg(manual_verif_aggregations)

# Evaluation

Make sure to test when

- No tools were planned
- Tools invoked when we explicitly asked not to
- Tool names were made up
- Correct tools planned
- Only one of the correct tools was planned

Note that we only evaluate answers given in valid JSON format enclosed within \<answer>\</answer> tags, as requested in the prompt.

In [None]:
# Result files of the self-test runs found in the working directory.
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the concatenated results (and any positional lookup on it) reproducible.
eval_files = sorted(x for x in os.listdir() if x.startswith('self_test_results'))
eval_files

In [None]:
# Stack the result files of all evaluated models into one dataframe
df_eval_models = None
for cur_eval_file in eval_files:
    df = pd.read_csv(cur_eval_file)
    df_eval_models = df if df_eval_models is None else pd.concat([df_eval_models, df])

# evaluate accuracy
# Each row is scored by comparing the expected answer with the parsed tool names
is_correct, scores, invented_tools, is_pred_empty = [], [], [], []
for _, r in df_eval_models.iterrows():
    cur_correct, cur_score, cur_invented, cur_empty = stu.is_tool_selection_correct(
        r['expected_answer'], r['parsed_tool_names'], all_tools, r['model']
    )
    is_correct.append(cur_correct)
    invented_tools.append(cur_invented)
    scores.append(cur_score)
    is_pred_empty.append(cur_empty)

# The lists are in row order, so they are assigned positionally
new_columns = [
    ('scores', scores),
    ('is_correct', is_correct),
    ('n_invented_tools', [len(x) for x in invented_tools]),
    ('invented_tools', invented_tools),
    ('is_pred_empty', is_pred_empty),
]
for col_name, col_values in new_columns:
    df_eval_models[col_name] = col_values

In [None]:
# Show the evaluation rows together with the columns computed above
df_eval_models

In [None]:
# Aggregate the evaluation per model and tool-calling mode
agg_eval = df_eval_models.groupby(by=['model', 'use_native_tools']).agg(
    {
        'question': ['count'],
        'is_correct': ['sum'],
        'scores': ['sum'],
        'is_pred_empty': ['sum'],
        'n_invented_tools': ['sum'],
    }
)
# Select the aggregated columns by name rather than by position, so the ratios
# stay correct even if the aggregation dict above is reordered or extended.
n_questions = agg_eval[('question', 'count')]
agg_eval[('accuracy', '%')] = agg_eval[('is_correct', 'sum')] / n_questions
agg_eval[('score', '%')] = agg_eval[('scores', 'sum')] / n_questions
agg_eval = agg_eval.sort_values(by=('score', '%'), ascending=False)
agg_eval

In [None]:
import sys
sys.path.append('..')

import gat_llm.llm_invoker as inv

# get model costs
# The first level of the index is the model name used to look up the LLM
model_prices = []
for index_key in agg_eval.index:
    model_name = index_key[0]
    print(model_name)
    cur_llm = inv.LLM_Provider.get_llm(None, model_name)
    model_prices.append(
        (cur_llm.price_per_M_input_tokens, cur_llm.price_per_M_output_tokens)
    )
inp_prices = [prices[0] for prices in model_prices]
out_prices = [prices[1] for prices in model_prices]
agg_eval[("USD / 1M tokens", "Input")] = inp_prices
agg_eval[("USD / 1M tokens", "Output")] = out_prices

# Columns kept for the final report, in display order
cols_of_interest = [
    ('n_invented_tools', 'sum'),
    ('accuracy', '%'),
    ('score', '%'),
    ("USD / 1M tokens", 'Input'),
    ("USD / 1M tokens", 'Output'),
]

agg_eval = agg_eval[cols_of_interest].copy()

# Turn the two ratios into percentages with one decimal place
for pct_col in [('accuracy', '%'), ('score', '%')]:
    agg_eval[pct_col] = agg_eval[pct_col].map(lambda z: np.round(100 * z, 1))

agg_eval

In [None]:
# export for readme
# Render the aggregated evaluation table as markdown and print it for copy/paste
markdown_tbl = agg_eval.to_markdown()
print(markdown_tbl)

In [None]:
# export for paper
latex_tbl = agg_eval.to_latex(float_format="{:.2f}".format,)

# Escape underscores for LaTeX and drop the provider suffix from the model names
paper_tbl = latex_tbl.replace('_', '\\_')
for provider_suffix in (' - Anthropic', ' - OpenAI', ' - Bedrock'):
    paper_tbl = paper_tbl.replace(provider_suffix, '')
print(paper_tbl)

In [None]:
# Example: first GPT 4o evaluation row in which at least one tool name was invented
is_gpt4o_row = df_eval_models.model == 'GPT 4o - OpenAI'
has_invented_tool = df_eval_models.n_invented_tools > 0
df_eval_models[is_gpt4o_row & has_invented_tool].iloc[0].to_dict()