# Evaluate tool chains
* length for estimating complexity
* check each tool against the list of possible tools for the question (absurdity of the approach)
* Word Error Rate: Substitutions, Deletion, Insertions
  * (S + D + I) / N_reference
  * adding one extra tool while all others are correct is penalised the same as getting one tool wrong in a chain of unchanged length

In [50]:
# https://github.com/analyticsinmotion/werx
import werx
from pathlib import Path
import json
import pandas as pd

In [51]:
# Toy tool chains for trying out the word error rate: the hypothesis is the
# reference plus one extra tool, and reference_multiple offers both variants.
reference = ["get_corpora", "get_plays_in_corpus_by_title_helper", "get_play_characters"]
hypothesis = [*reference, "some_other"]
reference_multiple = [list(reference), list(hypothesis)]

## Read data 

#### Manually created tables to compare the LLM's answer to:

In [52]:
# Table of possible tools: one row per tool, one column per question ID.
# (The misspelled variable name is kept so other cells keep working.)
prossible_tool_table_path = "preliminary_work/DraCor MCP Tools - Tabellenblatt1.csv"
possible_tools_df = pd.read_csv(prossible_tool_table_path)

# Manually set expected number of tools per question.
# The JSON files are read as UTF-8 explicitly: they contain non-ASCII text and
# must not depend on the platform's default encoding.
optimal_lengths_json_path = Path("preliminary_work/expected_tool_chains_light.json")
with optimal_lengths_json_path.open("r", encoding="utf-8") as optimal_length_json:
    optimal_lengths = json.load(optimal_length_json)

# Baseline tool chains, re-keyed by question ID.
base_toolchain_json_path = Path("preliminary_work/mcp-evaluation-baseline-tool-chains.json")
with base_toolchain_json_path.open("r", encoding="utf-8") as base_toolchain_json:
    baseline_tool_chain_raw = json.load(base_toolchain_json)
baseline_tool_chain = {
    entry["ID"]: entry["baseline_tool_chains"] for entry in baseline_tool_chain_raw
}

In [53]:
possible_tools_df = possible_tools_df.rename(columns={"Tool/Question": "tool"})
possible_tools_df.head()

Unnamed: 0,tool,1-1,1-2,1-3,1-4,1-5,2-1,3-1,3-2,4-1,4-2,4-3,4-4,5-1,5-2,5-3,5-4
0,DraCor:get_api_info,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,DraCor:get_corpora,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,DraCor:get_corpus,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,DraCor:get_corpus_metadata,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0
4,DraCor:get_corpus_metadata_csv,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0


In [54]:
def strip_starter(tool_path: str) -> str:
    """Return the tool name without its leading ``"<prefix>:"`` part.

    ``"DraCor:get_corpora"`` becomes ``"get_corpora"``. A name that contains
    no ``":"`` is returned unchanged, so applying the function to an already
    stripped column (e.g. when the cell is re-run) no longer raises
    ``IndexError``.
    """
    if ":" not in tool_path:
        return tool_path
    return tool_path.split(":")[1]

    
# Replace every prefixed tool name in the "tool" column by its bare name.
possible_tools_df["tool"] = possible_tools_df["tool"].apply(strip_starter)

In [55]:
possible_tools_df.head()

Unnamed: 0,tool,1-1,1-2,1-3,1-4,1-5,2-1,3-1,3-2,4-1,4-2,4-3,4-4,5-1,5-2,5-3,5-4
0,get_api_info,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,get_corpora,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,get_corpus,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,get_corpus_metadata,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0
4,get_corpus_metadata_csv,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0


Transpose length dict to df with question ids as column names

In [56]:
optimal_lengths_df = pd.DataFrame(optimal_lengths).set_index("ID").transpose()
optimal_lengths_df.head()

ID,1-1,1-2,1-3,1-4,1-5,2-1,3-1,3-2,4-1,4-2,4-3,4-4,5-1,5-2,5-3,5-4
question,What is the number of characters in Dantons Tod?,What is the number of dramatis personae in Dan...,What is the number of characters in Dantons To...,What is the number of characters in Dantons To...,What is the number of characters in Der Nollhart?,What is the mean number of characters in Frenc...,Which corpus has the highest mean number of ch...,Which corpus covers the widest time range?,How does the percentage of female speakers in ...,How does the mean percentage of female speaker...,How does the gender distribution in Swedish dr...,How does the percentage of female speakers in ...,Who is the most important character in Emilia ...,Who is the protagonist in Emilia Galotti?,Which character is quantitatively most dominan...,Who is the protagonist in Die entführte Dose?
number_of_tools_expected,2.0,2.0,1.0,1.0,2.0,1.0,1.0,,2.0,2.0,2.0,2.0,,,,


#### LLM's answer:

In [57]:
# directory to answers
sonnet_path = Path("results/sonnet-4/extracted/")
haiku_path = Path("results/haiku-4-5/extracted/")

In [58]:
# The result files also contain the question ID and run as metadata, so the file name is only used to filter by run number.
def read_results(result_dir: Path, runs_to_analyse: list[int]) -> list[dict]:
    """Load the successful results of the selected runs from *result_dir*.

    The run number is read from the second ``_``-separated part of each file
    name. Directory entries that are not regular files, or whose name has no
    integer in that position, are skipped instead of aborting the whole load.

    Args:
        result_dir: Directory containing the JSON result files.
        runs_to_analyse: Run numbers to include.

    Returns:
        The parsed JSON objects whose ``"success"`` value is truthy, ordered
        by file path so repeated calls yield the same order.
    """
    wanted_runs = set(runs_to_analyse)
    results = []
    # iterdir() yields entries in arbitrary order; sort for reproducibility.
    for filepath in sorted(result_dir.iterdir()):
        if not filepath.is_file():
            continue
        try:
            run = int(filepath.name.split("_")[1])
        except (IndexError, ValueError):
            # Not a result file (e.g. a hidden system file): ignore it.
            continue
        if run not in wanted_runs:
            continue
        with filepath.open("r", encoding="utf-8") as json_in:
            result = json.load(json_in)
        if result["success"]:
            results.append(result)
    return results

In [59]:
runs_to_analyse_sonnet = list(range(11,21))
runs_to_analyse_haiku = list(range(1,11))

sonnet_runs = read_results(sonnet_path, runs_to_analyse_sonnet)
haiku_runs = read_results(haiku_path, runs_to_analyse_haiku)

In [60]:
len(sonnet_runs)

130

In [61]:
len(haiku_runs)

129

In [62]:
sonnet_runs[0]

{'success': True,
 'valid': None,
 'response': "Based on the data from ItaDraCor, I can now analyze the percentage of female speakers over time:\n\n**The percentage of female speakers in ItaDraCor shows minimal change over time, consistently remaining very low (10-30%) throughout all periods from the 15th to 20th centuries.**\n\nKey observations:\n- **15th-16th centuries**: ~20-25% female speakers\n- **17th-18th centuries**: ~20-30% female speakers  \n- **19th-20th centuries**: ~15-25% female speakers\n\nThere is no significant upward or downward trend - Italian drama consistently features predominantly male characters across all time periods, with women representing roughly one-fifth to one-quarter of speakers throughout the corpus's chronological span.",
 'tools_used': [{'name': 'get_corpus',
   'input': {'corpus_name': 'ita'},
   'id': 'mcptoolu_01DHWTCNgHiPqBPuA79XPY1Q',
   'is_error': False},
  {'name': 'get_corpus_metadata_csv',
   'input': {'corpus_name': 'ita'},
   'id': 'mcpto

In [63]:
# Print one line for every haiku run whose tool chain is empty.
for entry in haiku_runs:
    if len(entry['tool_chain']) == 0:
        print("tools not used")

## Analysis

### Length Difference
* relate the difference to the length of the reference chain? (should a given length difference weigh less when the reference chain is longer?)
* what if the hypothesis chain is shorter? (e.g. reference is "get corpus name" & "get corpus", hypothesis is "get corpus") --> at the moment distance = 0 

In [64]:
# calculate with optimal tool path
length_difference = abs(len(hypothesis) - len(reference))

In [65]:
# calculate with manually set optimal length
def calculate_length_difference(hypothesis_length, reference_length):
    """Return how far the hypothesis chain length deviates from the reference length.

    A non-empty chain that is shorter than the reference is not penalised and
    yields 0 (open question: should this case get its own score?). In every
    other case the absolute difference of the two lengths is returned.
    """
    shorter_but_not_empty = hypothesis_length != 0 and hypothesis_length < reference_length
    return 0 if shorter_but_not_empty else abs(hypothesis_length - reference_length)

### Ratio of absurd tools used

In [66]:
# Example set of tools considered possible for the toy hypothesis.
possible_tools = {"get_corpora", "get_plays_in_corpus_by_title_helper", "get_play_characters"}

In [67]:
hypothesis_set = set(hypothesis)

In [68]:
def get_absurd_tool_ratio(hypothesis_set, possible_tools):
    """Return the share of used tools that are not among the possible tools.

    Args:
        hypothesis_set: Set of distinct tool names that were used.
        possible_tools: Set of tool names regarded as possible.

    Returns:
        ``len(hypothesis_set - possible_tools) / len(hypothesis_set)``, or
        ``0.0`` when no tool was used at all (an empty chain contains no
        absurd tool, and the division would otherwise fail).
    """
    if not hypothesis_set:
        return 0.0
    absurd_tools = hypothesis_set.difference(possible_tools)
    return len(absurd_tools) / len(hypothesis_set)

### WER 
* add function to get optimal tool chain from df 

In [94]:
def get_wer_info(hypothesis_tools: list[str], reference_tools: list[list[str]]):
    """Score a tool chain against the reference chain it matches best.

    Every chain is joined into a space-separated string so that each tool
    name counts as one "word" for the word error rate.

    Args:
        hypothesis_tools: Tool names in the order they were called.
        reference_tools: One or more acceptable reference chains.

    Returns:
        A dict with the keys ``overall_error_rate``, ``insertions``,
        ``deletions`` and ``substitutions`` for the reference with the lowest
        error rate (the first one on ties). An empty dict is returned when
        *reference_tools* is empty.
    """
    hypothesis_str = " ".join(hypothesis_tools)
    best_score = float("inf")
    # Must live outside the loop: resetting it per reference would discard the
    # best result whenever a later reference scores worse.
    wer = {}
    for reference in reference_tools:
        reference_str = " ".join(reference)
        error_rate = werx.wer(reference_str, hypothesis_str)
        if error_rate < best_score:
            best_score = error_rate
            # analysis() returns one result per sentence pair; only one was passed.
            result = werx.analysis(reference_str, hypothesis_str)[0]
            wer = {
                "overall_error_rate": error_rate,
                "insertions": result.inserted_words,
                "deletions": result.deleted_words,
                "substitutions": result.substituted_words,
            }
    return wer

### Calculate Validation Metrics 

In [70]:
def get_possible_tools(possible_tools_df, question_id):
    """Return the set of tool names whose entry in column *question_id* is greater than 0."""
    is_marked = possible_tools_df[question_id] > 0
    marked_rows = possible_tools_df[is_marked]
    return set(marked_rows["tool"])

In [71]:
def get_optimal_length(optimal_lengths_df, question_id):
    """Return the ``number_of_tools_expected`` entry of column *question_id*."""
    question_column = optimal_lengths_df[question_id]
    return question_column["number_of_tools_expected"]

In [72]:
# TODO: add optimal tool chain(s) as parameter
def validate_tool_chains(results, possible_tools_df, optimal_lengths_df, baseline_tool_chains):
    """Attach the tool-chain metrics to every result and return the same list.

    Each result dict is modified in place and gains three keys:
    ``tool_path_length_difference``, ``absurd_tool_ratio`` and
    ``tool_error_rate``. The question is identified by the result's ``"id"``
    and the evaluated chain is its ``"tool_chain"``.
    """
    for run_result in results:
        qid = run_result["id"]
        chain = run_result["tool_chain"]

        # Deviation of the chain length from the expected number of tools.
        expected_length = get_optimal_length(optimal_lengths_df, qid)
        run_result["tool_path_length_difference"] = calculate_length_difference(
            len(chain), expected_length
        )

        # Share of distinct tools that are not listed as possible for the question.
        allowed_tools = get_possible_tools(possible_tools_df, qid)
        run_result["absurd_tool_ratio"] = get_absurd_tool_ratio(set(chain), allowed_tools)

        # Word-error-rate details against the question's baseline chain(s).
        run_result["tool_error_rate"] = get_wer_info(chain, baseline_tool_chains[qid])
    return results

In [95]:
sonnet_evaluated = validate_tool_chains(sonnet_runs, possible_tools_df, optimal_lengths_df, baseline_tool_chain)
haiku_evaluated = validate_tool_chains(haiku_runs, possible_tools_df, optimal_lengths_df, baseline_tool_chain)

In [106]:
haiku_evaluated[0]

{'success': True,
 'valid': None,
 'response': 'Now let me calculate the percentage of female speakers over time by grouping plays by year:\n\n**ItaDraCor: Female Speaker Percentage Over Time**\n\nThe data shows a general **increase in female speaker representation from early periods to the 18th century**:\n\n- **1475-1500**: ~30-40% female (early humanist plays with balanced gender)\n- **1500-1550**: ~25-35% female (Renaissance comedies and tragedies vary widely)\n- **1550-1600**: ~25-45% female (humanist tragedies show diverse representation)\n- **1600-1650**: ~20-35% female (comedies of masks lower representation)\n- **1650-1750**: ~25-40% female (mixed, with opera librettos showing higher rates)\n- **1750-1800**: **40-55% female** (peak representation in Goldoni comedies and Metastasio librettos)\n- **1800+**: ~15-25% female (sharp decline with Romantic tragedies)\n\n**Key finding**: Female speaker percentage peaks in the **mid-to-late 18th century** (Goldoni/Metastasio period at ~

In [99]:
# Count the evaluated haiku runs whose overall tool error rate is 1 or higher.
i = 0
for entry in haiku_evaluated:
    if entry['tool_error_rate']['overall_error_rate'] >=1:
        i +=1

In [101]:
len(haiku_evaluated)

129

In [97]:
baseline_tool_chain['1-5']

[['get_corpora', 'get_plays_in_corpus_by_title_helper']]

## Write validation results 

In [102]:
output_dir = Path("results_validated/")
output_dir_sonnet = output_dir / "sonnet-4"
output_dir_haiku = output_dir / "haiku-4-5"


In [103]:
def write_validated_results(result_path: Path, validated_runs: list[dict]):
    """Write every validated run as its own JSON file into *result_path*.

    The directory is created together with any missing parent directories;
    an already existing directory is reused.

    Args:
        result_path: Target directory for the output files.
        validated_runs: Result dicts; each needs the keys ``"id"`` and
            ``"run"``, which form the file name
            ``<id>_<run>_validated-tools.json``.
    """
    # parents=True: the model directory sits below an output directory that
    # may not exist yet; exist_ok=True replaces the racy exists() check.
    result_path.mkdir(parents=True, exist_ok=True)
    for entry in validated_runs:
        filename = result_path / f"{entry['id']}_{entry['run']}_validated-tools.json"
        with filename.open("w", encoding="utf-8") as file_out:
            json.dump(entry, file_out)

In [104]:
write_validated_results(output_dir_sonnet, sonnet_evaluated)

In [105]:
write_validated_results(output_dir_haiku, haiku_evaluated)
