# Evaluate Fine-tuned Model

Code authored by: Shaw Talebi

### imports

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
from utils.tool_calling import parse_tool_call, call_tool
import pandas as pd

### load data

In [2]:
# load dataset
# Fetches the tool-use fine-tuning dataset by its Hub repo id; only the 'test'
# split is kept because it is the only split this notebook evaluates on.
ds = load_dataset("shawhin/tool-use-finetuning")
ds_test = ds['test']

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/800k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/95.1k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/105k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/477 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/60 [00:00<?, ? examples/s]

### load models

In [3]:
# load base model
# The base (not fine-tuned) checkpoint serves as the comparison baseline.
model_name = "google/gemma-3-1b-it"

# NOTE(review): device_map="mps" hard-codes a single backend; presumably this
# has to be changed to run on a machine without MPS support — confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="mps",
)

# load tokenizer
# The base model's tokenizer is reused for both generation pipelines below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# load fine-tuned model
# NOTE(review): the download log for this cell lists adapter_config.json and
# adapter_model.safetensors, so this repo appears to hold adapter weights rather
# than a full checkpoint; presumably from_pretrained resolves the base model and
# applies the adapter automatically — TODO confirm. PeftModel (imported at the
# top) is not used by this cell.
finetuned_model_name = "shawhin/gemma-3-1b-tool-use"

finetuned_model = AutoModelForCausalLM.from_pretrained(
    finetuned_model_name,
    device_map="mps",
)

adapter_config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

### generate responses using models

In [5]:
def evaluate_model_tool_calling(generator, row):
    """
    Evaluates whether the model correctly identifies the need for a tool,
    calls the correct tool, and executes it successfully.

    Parameters:
    generator (callable): Text-generation callable invoked as
        ``generator(messages, return_full_text=True)``. Its first result must
        expose ``['generated_text']`` whose last element is the assistant
        message (a dict with a ``'content'`` string).
    row (dict): Dataset row with a ``'trace'`` list of chat messages (the first
        two are used as the prompt) and an optional ``'tool_name'`` holding the
        expected tool, or None/NaN when no tool is expected.

    Returns:
    dict: Keys ``response``, ``model_called_tool``, ``model_tool_name``,
          ``model_called_tool_when_needed``, ``model_called_correct_tool``
          and ``tool_call_success``.
    """
    expected_tool_name = row.get('tool_name', None)

    # Auto-infer if a tool is needed (missing values may arrive as None or NaN)
    tool_needed = expected_tool_name is not None and str(expected_tool_name).lower() != 'nan'

    # Build the prompt from copies of the first two messages. A plain list copy
    # would still share the message dicts with the row, so overriding the role
    # below would silently rewrite the caller's dataset row.
    prompt = [dict(message) for message in row['trace'][:2]]
    prompt[0]['role'] = 'system'

    # Generate assistant output; the last message of the full text is the reply
    output = generator(prompt, return_full_text=True)[0]
    response = output['generated_text'][-1]
    content = response['content']

    # Defaults describe "no tool call issued"; they also cover a malformed call
    # (marker present but unparsable), which only flips model_called_tool.
    model_called_tool = False
    model_tool_name = None
    model_called_correct_tool = False
    tool_call_success = False

    # Check if model issued a tool call
    if "<tool_call>" in content:
        model_called_tool = True
        parsed_result = parse_tool_call(content)

        if parsed_result is not None:
            model_tool_name, tool_args = parsed_result
            model_called_correct_tool = (model_tool_name == expected_tool_name)

            # Any failure while running the tool counts as an unsuccessful call.
            # Catch Exception only, so Ctrl-C / SystemExit still stop a long run.
            try:
                call_tool(model_tool_name, tool_args)
            except Exception:
                tool_call_success = False
            else:
                tool_call_success = True

    # Final evaluation: the decision is right when a call was issued iff needed
    model_called_tool_when_needed = model_called_tool if tool_needed else not model_called_tool

    return {
        'response': response,
        'model_called_tool': model_called_tool,
        'model_tool_name': model_tool_name,
        'model_called_tool_when_needed': model_called_tool_when_needed,
        'model_called_correct_tool': model_called_correct_tool,
        'tool_call_success': tool_call_success,
    }


In [6]:
# create pipelines
# Both pipelines share the base tokenizer and the same temperature so the two
# models are compared under identical generation settings.
# NOTE(review): no random seed is set in the visible cells; if sampling is
# enabled for these models, repeated runs may give slightly different results —
# confirm and consider fixing a seed.
base_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, temperature=0.1)
finetuned_generator = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer, temperature=0.1)

Device set to use mps
Device set to use mps


In [7]:
%%time
results_data = []

# Evaluate every model through one shared code path so the per-model result
# records cannot drift apart in schema. Order matters: for each test row the
# base model's record is appended first, then the fine-tuned model's.
generators_by_model = [
    (model_name, base_generator),
    (finetuned_model_name, finetuned_generator),
]

for i, row in enumerate(ds_test):
    print("Evaluating row:", i)
    for evaluated_model_name, generator in generators_by_model:
        model_results = evaluate_model_tool_calling(generator, row)
        # One flat record per (row, model): row metadata plus evaluation flags
        results_data.append({
            'model_name': evaluated_model_name,
            'query': row['query'],
            'query_type': row['query_type'],
            'num_tools_available': row['num_tools_available'],
            'expected_tool_name': row['tool_name'],
            **model_results,
        })

Evaluating row: 0
Evaluating row: 1
Evaluating row: 2
Evaluating row: 3
Evaluating row: 4
Evaluating row: 5
Evaluating row: 6
Evaluating row: 7
Evaluating row: 8
Evaluating row: 9
Evaluating row: 10
Evaluating row: 11
Evaluating row: 12
Evaluating row: 13
Evaluating row: 14
Evaluating row: 15
Evaluating row: 16
Evaluating row: 17
Evaluating row: 18
Evaluating row: 19
Evaluating row: 20
Evaluating row: 21
Evaluating row: 22
Evaluating row: 23
Evaluating row: 24
Evaluating row: 25
Evaluating row: 26
Evaluating row: 27
Evaluating row: 28
Evaluating row: 29
Evaluating row: 30
Evaluating row: 31
Evaluating row: 32
Evaluating row: 33
Evaluating row: 34
Evaluating row: 35
Evaluating row: 36
Evaluating row: 37
Evaluating row: 38
Evaluating row: 39
Evaluating row: 40
Evaluating row: 41
Evaluating row: 42
Evaluating row: 43
Evaluating row: 44
Evaluating row: 45
Evaluating row: 46
Evaluating row: 47
Evaluating row: 48
Evaluating row: 49
Evaluating row: 50
Evaluating row: 51
Evaluating row: 52
Eva

In [8]:
results_df = pd.DataFrame(results_data)

In [9]:
# write results to file
results_df.to_csv('data/eval_results.csv', index=False)

In [10]:
results_df.head()

Unnamed: 0,model_name,query,query_type,num_tools_available,expected_tool_name,response,model_called_tool,model_tool_name,model_called_tool_when_needed,model_called_correct_tool,tool_call_success
0,google/gemma-3-1b-it,Hy... can u pls tell me how many days is it be...,hard,23,calculate_date_diff,"{'role': 'assistant', 'content': '```xml <tool...",True,calculate_date_diff,True,True,False
1,shawhin/gemma-3-1b-tool-use,Hy... can u pls tell me how many days is it be...,hard,23,calculate_date_diff,"{'role': 'assistant', 'content': '<tool_call> ...",True,calculate_date_diff,True,True,True
2,google/gemma-3-1b-it,What are some affordable ways to discover new ...,no_tool,37,,"{'role': 'assistant', 'content': '```xml <fina...",False,,True,False,False
3,shawhin/gemma-3-1b-tool-use,What are some affordable ways to discover new ...,no_tool,37,,"{'role': 'assistant', 'content': '<final_answe...",False,,True,False,False
4,google/gemma-3-1b-it,What is the current date and time right now?,easy,20,get_current_datetime,"{'role': 'assistant', 'content': '```xml <tool...",True,get_current_datetime,True,True,True


### evaluate models

In [None]:
def compare_model_performance(results_df):
    """
    Summarize tool-calling pass rates per model, expressed as percentages.

    Parameters:
    results_df (pd.DataFrame): Evaluation results with columns:
        - model_name
        - model_called_tool_when_needed
        - model_called_correct_tool
        - tool_call_success
        - expected_tool_name

    Returns:
    pd.DataFrame: One row per model. 'Tool Called When Needed (%)' is averaged
        over all rows; 'Correct Tool Called (%)' and 'Tool Call Success (%)'
        are averaged only over rows where a tool was expected.
    """
    # A tool is expected when the name is neither missing nor the string 'nan'
    expected = results_df['expected_tool_name']
    rows_with_tool = results_df[expected.notna() & (expected != 'nan')]

    # The call/no-call decision is scored on every row
    summary = (
        results_df
        .groupby('model_name')
        .agg({'model_called_tool_when_needed': 'mean'})
        .round(4)
    )

    if len(rows_with_tool) == 0:
        # Nothing to score: keep the columns so the output shape is stable
        summary['model_called_correct_tool'] = None
        summary['tool_call_success'] = None
    else:
        # Tool choice and execution only make sense where a tool was expected
        per_tool = (
            rows_with_tool
            .groupby('model_name')
            .agg({
                'model_called_correct_tool': 'mean',
                'tool_call_success': 'mean'
            })
            .round(4)
        )
        summary = pd.concat([summary, per_tool], axis=1)

    # Human-readable headers, in the same order as the columns built above
    summary.columns = [
        'Tool Called When Needed (%)',
        'Correct Tool Called (%)',
        'Tool Call Success (%)'
    ]

    # Fractions -> percentages
    return summary * 100

In [12]:
def compare_model_performance_confusion_matrix(results_df):
    """
    Count confusion-matrix cells for tool calling, per model.

    Parameters:
    results_df (pd.DataFrame): DataFrame containing evaluation results with columns:
        - model_name
        - expected_tool_name (tool name, or None/NaN/'nan' when no tool is expected)
        - model_tool_name (tool name the model called, or None/NaN when none)

    Cell definitions:
        - True Positive:  a tool was expected and the model called that exact tool
        - False Negative: a tool was expected and the model called no tool
        - True Negative:  no tool was expected and the model called no tool
        - False Positive: no tool was expected and the model called a tool
      A call to the wrong tool when a tool was expected falls into none of the
      four cells, so the counts of a model may sum to less than its row count.

    Returns:
    pd.DataFrame: Indexed by model_name, with columns 'True Positives',
        'True Negatives', 'False Positives' and 'False Negatives'.
    """
    expected = results_df['expected_tool_name']
    predicted = results_df['model_tool_name']

    # A tool is expected when the name is neither missing nor the string 'nan'
    tool_expected = expected.notna() & (expected != 'nan')
    tool_called = predicted.notna()

    # Score ALL rows. Filtering to tool-needed rows first would make every
    # negative-class count (TN, FP) zero by construction. Building a fresh
    # frame also avoids writing columns onto a slice of the caller's data.
    counts = results_df[['model_name']].copy()
    counts['True Positives'] = (tool_expected & (predicted == expected)).astype(int)
    counts['True Negatives'] = (~tool_expected & ~tool_called).astype(int)
    counts['False Positives'] = (~tool_expected & tool_called).astype(int)
    counts['False Negatives'] = (tool_expected & ~tool_called).astype(int)

    return counts.groupby('model_name').sum()

In [23]:
import pandas as pd
import numpy as np
def model_tool_classification_performance_manual(results_df):
    """
    Calculates per-class and aggregated TP, TN, FP, FN and derived metrics
    for each model, without using sklearn.

    Parameters:
    results_df (pd.DataFrame): DataFrame containing evaluation results with columns:
        - model_name
        - expected_tool_name (str or None/NaN)
        - model_tool_name (str or None/NaN)

    Returns:
    dict: A dictionary where keys are model names. Each value is a DataFrame
          indexed by class name (plus 'MICRO_AVG', 'MACRO_AVG'), with columns
          for TP, TN, FP, FN, Precision, Recall, and F1. Every table lists the
          same classes (the union over all models). A ratio with a zero
          denominator is reported as 0. 'MACRO_AVG' averages only the classes
          that the model itself was expected to call or actually called.
    """
    df = results_df.copy()

    # Standardize missing values to 'NO_TOOL' string
    df['expected_tool_name'] = df['expected_tool_name'].fillna('NO_TOOL')
    df['model_tool_name'] = df['model_tool_name'].fillna('NO_TOOL')

    # Shared label set so the per-class tables of all models line up row by row
    all_labels = sorted(set(df['expected_tool_name']) | set(df['model_tool_name']))
    model_performance = {}

    for model in df['model_name'].unique():
        model_df = df[df['model_name'] == model]
        y_true = model_df['expected_tool_name'].values
        y_pred = model_df['model_tool_name'].values
        n_samples = len(y_true)

        if n_samples == 0:
            continue

        # One-vs-rest counts for every label
        per_class_metrics = []
        for label in all_labels:
            is_true_label = (y_true == label)
            is_pred_label = (y_pred == label)

            tp = np.sum(is_true_label & is_pred_label)
            fp = np.sum(~is_true_label & is_pred_label)
            fn = np.sum(is_true_label & ~is_pred_label)
            tn = np.sum(~is_true_label & ~is_pred_label)

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            per_class_metrics.append({
                'Class': label,
                'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
                'Precision': precision, 'Recall': recall, 'F1': f1
            })

        metrics_df = pd.DataFrame(per_class_metrics).set_index('Class')

        # --- Aggregated Metrics ---
        # Micro average: in single-label classification every error is one FP
        # (for the predicted class) and one FN (for the true class), so micro
        # precision, recall and F1 all equal plain accuracy.
        correct_predictions = np.sum(y_true == y_pred)
        accuracy = correct_predictions / n_samples
        micro_avg_row = {
            'TP': correct_predictions,
            'TN': np.nan,  # TN is not well-defined for micro average in this way
            'FP': n_samples - correct_predictions,
            'FN': n_samples - correct_predictions,
            'Precision': accuracy,
            'Recall': accuracy,
            'F1': accuracy
        }

        # Macro average: only over labels this model saw or predicted. Labels
        # that exist solely because ANOTHER model predicted them have
        # TP = FP = FN = 0 here; averaging their placeholder zeros would lower
        # this model's score for mistakes it never made.
        is_active = metrics_df[['TP', 'FP', 'FN']].sum(axis=1) > 0
        active_metrics = metrics_df.loc[is_active]
        macro_avg_row = {
            'TP': np.nan, 'TN': np.nan, 'FP': np.nan, 'FN': np.nan,
            'Precision': active_metrics['Precision'].mean(),
            'Recall': active_metrics['Recall'].mean(),
            'F1': active_metrics['F1'].mean()
        }

        # Append aggregated rows
        agg_df = pd.DataFrame([micro_avg_row, macro_avg_row], index=['MICRO_AVG', 'MACRO_AVG'])
        metrics_df = pd.concat([metrics_df, agg_df])

        model_performance[model] = metrics_df

    return model_performance

In [24]:
eval_summary = model_tool_classification_performance_manual(results_df)
eval_summary 

{'google/gemma-3-1b-it':                           TP    TN    FP    FN  Precision    Recall        F1
 NO_TOOL                 17.0  37.0   3.0   3.0   0.850000  0.850000  0.850000
 Wikipedia                0.0  59.0   1.0   0.0   0.000000  0.000000  0.000000
 add_to_todo_list         1.0  58.0   1.0   0.0   0.500000  1.000000  0.666667
 calculate_average        0.0  59.0   1.0   0.0   0.000000  0.000000  0.000000
 calculate_date_diff      1.0  59.0   0.0   0.0   1.000000  1.000000  1.000000
 calculate_min_max        0.0  59.0   1.0   0.0   0.000000  0.000000  0.000000
 calculate_std_dev        0.0  60.0   0.0   0.0   0.000000  0.000000  0.000000
 calculate_tax            0.0  59.0   1.0   0.0   0.000000  0.000000  0.000000
 calculator               2.0  58.0   0.0   0.0   1.000000  1.000000  1.000000
 char_count               1.0  59.0   0.0   0.0   1.000000  1.000000  1.000000
 compute_median           1.0  58.0   1.0   0.0   0.500000  1.000000  0.666667
 compute_min_max          0.

In [28]:
import pandas as pd
import numpy as np

def model_tool_detection_counts(results_df):
    """
    Score each model on the binary question "did it call a tool at all?".

    Which tool was called is ignored. A row is positive when a tool is
    expected (or called) and negative otherwise:

        TP: tool expected,    some tool called
        FN: tool expected,    no tool called
        FP: no tool expected, some tool called
        TN: no tool expected, no tool called

    Parameters:
    results_df (pd.DataFrame): Evaluation results with columns:
        - model_name
        - expected_tool_name (str or None/NaN)
        - model_tool_name (str or None/NaN)

    Returns:
    pd.DataFrame: Indexed by model_name with the counts TP, TN, FP, FN
                  followed by Accuracy, Precision, Recall and F1. Ratios
                  that are undefined (zero denominator) are reported as 0.
    """
    frame = results_df.copy()

    # Missing tool names become an explicit sentinel label
    for column in ('expected_tool_name', 'model_tool_name'):
        frame[column] = frame[column].fillna('NO_TOOL')

    wants_tool = frame['expected_tool_name'].ne('NO_TOOL')
    called_tool = frame['model_tool_name'].ne('NO_TOOL')

    # Per-row 0/1 indicators for each confusion-matrix cell
    frame = frame.assign(
        TP=(wants_tool & called_tool).astype(int),
        FN=(wants_tool & ~called_tool).astype(int),
        FP=(~wants_tool & called_tool).astype(int),
        TN=(~wants_tool & ~called_tool).astype(int),
    )

    # Sum the indicators per model
    metrics = frame.groupby('model_name')[['TP', 'TN', 'FP', 'FN']].sum()

    hits, rejections = metrics['TP'], metrics['TN']
    false_alarms, misses = metrics['FP'], metrics['FN']

    precision = hits / (hits + false_alarms)
    recall = hits / (hits + misses)

    metrics['Accuracy'] = (hits + rejections) / (hits + rejections + false_alarms + misses)
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = 2 * (precision * recall) / (precision + recall)

    # 0/0 produces NaN above; report those undefined ratios as 0
    return metrics.fillna(0)

In [29]:
eval_summary = model_tool_detection_counts(results_df)
eval_summary 

Unnamed: 0_level_0,TP,TN,FP,FN,Accuracy,Precision,Recall,F1
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
google/gemma-3-1b-it,37,17,3,3,0.9,0.925,0.925,0.925
shawhin/gemma-3-1b-tool-use,36,20,0,4,0.933333,1.0,0.9,0.947368


Okay, let's break down these results for the two models, `google/gemma-3-1b-it` and `shawhin/gemma-3-1b-tool-use`, based on the "Tool Detection" task (i.e., how well they predict *whether* any tool needs to be called).

**Total Samples:** For each model, the total number of samples is TP + TN + FP + FN.
*   `google/gemma-3-1b-it`: 37 + 17 + 3 + 3 = 60 samples
*   `shawhin/gemma-3-1b-tool-use`: 36 + 20 + 0 + 4 = 60 samples

**Understanding the Metrics:**

*   **TP (True Positives):** Cases where the model correctly predicted a tool was needed.
*   **TN (True Negatives):** Cases where the model correctly predicted NO tool was needed.
*   **FP (False Positives):** Cases where the model incorrectly predicted a tool was needed (Type I error).
*   **FN (False Negatives):** Cases where the model incorrectly predicted NO tool was needed (Type II error).
*   **Accuracy:** Overall percentage of correct predictions (both tool needed and not needed). (TP+TN)/Total
*   **Precision:** Of all the times the model said "a tool is needed", what proportion was correct? TP/(TP+FP)
*   **Recall:** Of all the times a tool *was actually* needed, what proportion did the model catch? TP/(TP+FN)
*   **F1-Score:** Harmonic mean of Precision and Recall, balancing both.

**Analysis of `google/gemma-3-1b-it`:**

*   **TP: 37:** Correctly identified the need for a tool 37 times.
*   **TN: 17:** Correctly identified that no tool was needed 17 times.
*   **FP: 3:** Incorrectly called a tool when none was expected 3 times. This means the model sometimes over-triggers.
*   **FN: 3:** Missed the need for a tool 3 times, saying no tool was needed when one was.
*   **Accuracy: 0.900 (90%)**: Generally good overall performance.
*   **Precision: 0.925 (92.5%)**: When it decided to call a tool, it was correct 92.5% of the time (37 out of 40 times).
*   **Recall: 0.925 (92.5%)**: It successfully found 92.5% of all instances where a tool was actually required (37 out of 40 instances).
*   **F1-Score: 0.925**: Balanced performance between Precision and Recall.

**Analysis of `shawhin/gemma-3-1b-tool-use`:**

*   **TP: 36:** Correctly identified the need for a tool 36 times.
*   **TN: 20:** Correctly identified that no tool was needed 20 times. Excellent specificity.
*   **FP: 0:** *Never* called a tool when none was expected. This model is very conservative about triggering tools, only doing so when confident.
*   **FN: 4:** Missed the need for a tool 4 times. This is slightly more than the other model.
*   **Accuracy: 0.933 (93.3%)**: Higher overall accuracy than the Google model.
*   **Precision: 1.000 (100%)**: Perfect precision! Every time this model decided to call a tool, it was correct in needing one. This is due to having zero False Positives.
*   **Recall: 0.900 (90%)**: It found 90% of the instances where a tool was needed (36 out of 40 instances). This is slightly lower than the Google model's recall.
*   **F1-Score: 0.947**: Very high F1, indicating excellent overall performance in tool detection, slightly better than the Google model, driven by perfect precision.

**Comparison and Key Differences:**

*   **Overall Accuracy:** `shawhin/gemma-3-1b-tool-use` is slightly more accurate overall.
*   **False Positives:** The most striking difference is the FP rate. The `shawhin` model has 0 FPs, meaning it never cries wolf. The `google` model has 3 FPs, so it sometimes calls tools unnecessarily.
*   **False Negatives:** The `shawhin` model has slightly more FNs (4 vs. 3), meaning it's slightly more likely to fail to call a tool when one is needed compared to the `google` model.
*   **Precision vs. Recall Trade-off:**
    *   `shawhin/gemma-3-1b-tool-use` maximizes **Precision** (at 100%) at the cost of slightly lower Recall. It's cautious and only acts when certain.
    *   `google/gemma-3-1b-it` has a more balanced Precision and Recall, both at 92.5%.

**Which model is better?**

It depends on the cost of errors:

*   If **False Positives are costly** (e.g., unnecessary API calls, wasted resources, annoying the user with irrelevant tool actions), the `shawhin/gemma-3-1b-tool-use` model is superior due to its perfect Precision.
*   If **False Negatives are more costly** (e.g., failing to answer a question that requires a tool, missing critical actions), the `google/gemma-3-1b-it` model has a slight edge in Recall, missing one fewer case.

Given the higher F1-score and perfect Precision, `shawhin/gemma-3-1b-tool-use` appears to be the better model for *deciding when to trigger a tool*, assuming the slightly higher chance of missing a required tool call is acceptable.


What are the actions we can take to increase accuracy, precision, and recall?


Okay, let's discuss actions to potentially improve Accuracy, Precision, and Recall for these tool-calling models, based on the type of errors observed (False Positives and False Negatives).

**General Strategies:**

1.  **More/Better Training Data:**
    *   **More Data:** Simply increasing the volume of high-quality training data can help the model learn more robust patterns.
    *   **Targeted Data Augmentation:**
        *   **To Reduce False Negatives (Improve Recall):** Add more examples similar to the ones the model missed (where `expected_tool_name` was not None, but `model_tool_name` was None). This includes more diverse phrasing, edge cases, and subtle cues that indicate a tool is needed.
        *   **To Reduce False Positives (Improve Precision):** Add more "hard negative" examples. These are queries that *seem* like they might need a tool but actually don't. This helps the model learn the boundaries better. For instance, questions *about* tools vs. questions *requiring* tools.
    *   **Data Quality:** Ensure labels (`expected_tool_name`) are accurate and consistent. Noise in labels can confuse the model.

2.  **Model Architecture/Size:**
    *   Experiment with larger versions of the Gemma models if available and computationally feasible. Larger models often have a greater capacity to learn nuances.
    *   If fine-tuning, explore different base models.

3.  **Fine-tuning Strategies:**
    *   **Hyperparameter Tuning:** Adjust learning rate, batch size, number of epochs, weight decay, etc.
    *   **Loss Function:** While standard cross-entropy is common, explore techniques like focal loss if there's a class imbalance issue (though the TN count suggests "NO_TOOL" isn't rare).
    *   **Curriculum Learning:** Start with easier examples and gradually introduce harder ones.

4.  **Prompt Engineering (If applicable):**
    *   If the model call is influenced by a system prompt, refine the prompt to give clearer instructions about when to call tools and when not to. Include examples of true positives, true negatives, false positives, and false negatives in the prompt's few-shot examples.
    *   Clearly define the scope of each tool.

**Specific Actions Based on Observed Errors:**

*   **To Improve `google/gemma-3-1b-it`'s Precision (Reduce its 3 FPs):**
    *   **Analyze FP Examples:** Look at the 3 queries where it called a tool but shouldn't have. What makes them tricky?
    *   **Add Hard Negatives:** Create training examples that resemble these FP cases but are correctly labeled as 'NO_TOOL'. This teaches the model to be more discerning.
    *   **Refine Tool Descriptions:** Ensure the model understands the limitations of each tool, so it doesn't try to apply them too broadly.

*   **To Improve `shawhin/gemma-3-1b-tool-use`'s Recall (Reduce its 4 FNs):**
    *   **Analyze FN Examples:** Examine the 4 queries where a tool was needed but not called. Why did the model miss them? Was the intent unclear? Was it a rare tool?
    *   **Augment TP Examples:** Add more training data similar to these missed cases, potentially with more explicit phrasing indicating the need for the specific tool.
    *   **Improve Cue Recognition:** Fine-tune the model to be more sensitive to phrases or contexts that signal the need for a tool.

*   **To Improve `google/gemma-3-1b-it`'s Recall (Reduce its 3 FNs):**
    *   Similar to the above, analyze the 3 missed cases and enrich the training data with similar examples requiring tool calls.

**Advanced Techniques:**

1.  **Confidence Thresholding:** If the model outputs a confidence score for tool prediction, you could adjust the threshold.
    *   *Increasing* the threshold makes the model more conservative (fewer FPs, potentially more FNs) -> Increases Precision, might decrease Recall.
    *   *Decreasing* the threshold makes it more aggressive (more FPs, potentially fewer FNs) -> Decreases Precision, might increase Recall.
    (This is more applicable if the output isn't just the tool name but also a likelihood).

2.  **Reinforcement Learning from Human Feedback (RLHF):** Collect human judgments on the model's tool calls and use RL to fine-tune the model to better align with desired behavior, balancing precision and recall.

3.  **Error Analysis Loop:** Continuously:
    *   Evaluate the model.
    *   Analyze the errors (FPs and FNs).
    *   Collect or generate new data targeting these errors.
    *   Retrain/fine-tune.
    *   Repeat.

By systematically analyzing the types of mistakes each model makes and enriching the training data accordingly, you can often guide the models toward better performance on your specific needs.

In [11]:
def compare_model_performance_by_column(results_df, split_column):
    """
    Summarize tool-calling pass rates per model and per value of one extra
    column, expressed as percentages.

    Parameters:
    results_df (pd.DataFrame): Evaluation results with columns:
        - model_name
        - model_called_tool_when_needed
        - model_called_correct_tool
        - tool_call_success
        - expected_tool_name
        - the column named by split_column
    split_column (str): Column to break the results down by
        (e.g., 'query_type', 'num_tools_available')

    Returns:
    pd.DataFrame: One row per (model_name, split_column value).
        'Tool Called When Needed (%)' is averaged over all rows of the group;
        'Correct Tool Called (%)' and 'Tool Call Success (%)' are averaged only
        over rows where a tool was expected, and are NaN for groups that
        contain no such rows.
    """
    group_keys = ['model_name', split_column]

    # A tool is expected when the name is neither missing nor the string 'nan'
    expected = results_df['expected_tool_name']
    rows_with_tool = results_df[expected.notna() & (expected != 'nan')]

    # The call/no-call decision is scored on every row of each group
    summary = (
        results_df
        .groupby(group_keys)
        .agg({'model_called_tool_when_needed': 'mean'})
        .round(4)
    )

    if len(rows_with_tool) == 0:
        # Nothing to score: keep the columns so the output shape is stable
        summary['model_called_correct_tool'] = None
        summary['tool_call_success'] = None
    else:
        # Tool choice and execution only make sense where a tool was expected
        per_tool = (
            rows_with_tool
            .groupby(group_keys)
            .agg({
                'model_called_correct_tool': 'mean',
                'tool_call_success': 'mean'
            })
            .round(4)
        )
        summary = pd.concat([summary, per_tool], axis=1)

    # Human-readable headers, in the same order as the columns built above
    summary.columns = [
        'Tool Called When Needed (%)',
        'Correct Tool Called (%)',
        'Tool Call Success (%)'
    ]

    # Fractions -> percentages
    return summary * 100

#### global results

In [12]:
eval_summary = compare_model_performance(results_df)

In [13]:
eval_summary

Unnamed: 0_level_0,Tool Called When Needed (%),Correct Tool Called (%),Tool Call Success (%)
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
google/gemma-3-1b-it,90.0,35.0,50.0
shawhin/gemma-3-1b-tool-use,93.33,52.5,67.5


In [14]:
# write results to file
# Keep the index: the summary has one row per model with model_name as its
# index, so writing with index=False would save metric rows that can no longer
# be attributed to a model.
eval_summary.to_csv('data/eval_summary.csv')

#### results by query type

In [15]:
eval_summary_by_type = compare_model_performance_by_column(results_df, 'query_type')

In [16]:
eval_summary_by_type

Unnamed: 0_level_0,Unnamed: 1_level_0,Tool Called When Needed (%),Correct Tool Called (%),Tool Call Success (%)
model_name,query_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
google/gemma-3-1b-it,easy,100.0,50.0,65.0
google/gemma-3-1b-it,hard,100.0,20.0,35.0
google/gemma-3-1b-it,no_tool,70.0,,
shawhin/gemma-3-1b-tool-use,easy,95.0,60.0,80.0
shawhin/gemma-3-1b-tool-use,hard,85.0,45.0,55.0
shawhin/gemma-3-1b-tool-use,no_tool,100.0,,
