# Evaluate tool chains
* length for estimating complexity
* check tool against possible list of tools (absurdity of approach) 
* Word Error Rate (WER): substitutions, deletions, insertions
  * WER = (S + D + I) / N_reference
  * an extra tool in an otherwise correct chain costs the same as one wrong tool in a chain of the correct length

In [None]:
# https://github.com/analyticsinmotion/werx
import werx
from pathlib import Path
import json
import pandas as pd

In [None]:
# Toy tool chains for trying out the word error rate.
# The hypothesis equals the reference plus one extra tool at the end.
hypothesis = [
    "get_corpora",
    "get_plays_in_corpus_by_title_helper",
    "get_play_characters",
    "some_other",
]
reference = [
    "get_corpora",
    "get_plays_in_corpus_by_title_helper",
    "get_play_characters",
]
# Two alternative reference chains for the same question.
reference_multiple = [
    [
        "get_corpora",
        "get_plays_in_corpus_by_title_helper",
        "get_play_characters",
    ],
    [
        "get_corpora",
        "get_plays_in_corpus_by_title_helper",
        "get_play_characters",
        "some_other",
    ],
]

## Read data 

#### Manually created tables to compare the LLM's answer to:

In [None]:
# possible tools
prossible_tool_table_path = "preliminary_work/DraCor MCP Tools - Tabellenblatt1.csv"
possible_tools_df = pd.read_csv(prossible_tool_table_path)

# optimal length
optimal_lengths_json_path = Path("preliminary_work/expected_tool_chains_light.json")
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with optimal_lengths_json_path.open("r", encoding="utf-8") as optimal_length_json:
    optimal_lengths = json.load(optimal_length_json)

# baseline chain
base_toolchain_json_path = Path("preliminary_work/mcp-evaluation-baseline-tool-chains.json")
with base_toolchain_json_path.open("r", encoding="utf-8") as base_toolchain_json:
    baseline_tool_chain_raw = json.load(base_toolchain_json)

# Map every question ID to its baseline tool chains.
baseline_tool_chain = {
    entry["ID"]: entry["baseline_tool_chains"] for entry in baseline_tool_chain_raw
}

In [None]:
# Give the "Tool/Question" column the shorter name "tool", then preview the table.
possible_tools_df = possible_tools_df.rename(columns={"Tool/Question": "tool"})
possible_tools_df.head()

In [None]:
def strip_starter(tool_path):
    """Return the tool name without its leading ``prefix:`` part.

    Args:
        tool_path: Tool identifier of the form ``"<prefix>:<tool_name>"``.

    Returns:
        The segment directly after the first colon. A value that contains no
        colon is returned unchanged instead of raising ``IndexError``.
    """
    parts = tool_path.split(":")
    # Without a separator there is no prefix to strip.
    if len(parts) < 2:
        return tool_path
    return parts[1]

    
# Drop the prefix from every entry of the "tool" column.
possible_tools_df["tool"] = possible_tools_df["tool"].apply(strip_starter)

In [None]:
# Show the first rows again to check the stripped tool names.
possible_tools_df.head()

Transpose the optimal-length records into a DataFrame with the question IDs as column names

In [None]:
# Index the records by "ID", then flip the table so each question ID becomes a column.
optimal_lengths_df = pd.DataFrame(optimal_lengths).set_index("ID").T
optimal_lengths_df.head()

#### LLM's answer:

In [None]:
# Directories that hold the extracted answers of each model.
sonnet_path = Path("results") / "sonnet-4" / "extracted"
haiku_path = Path("results") / "haiku-4-5" / "extracted"

In [None]:
# as the results also include metadata to the question and the run, we don't need the file name for mapping
def read_results(result_dir:Path, runs_to_analyse:list[int]) -> list[dict]:
    """Load the successful result files of the selected runs.

    The run number is read from the second underscore-separated field of the
    file name. Directories and files without such a numeric field (for
    example ``.ipynb_checkpoints``) are skipped instead of raising.

    Args:
        result_dir: Directory containing the JSON result files.
        runs_to_analyse: Run numbers whose files should be loaded.

    Returns:
        The parsed JSON documents whose ``"success"`` entry is truthy, ordered
        by file path.
    """
    wanted_runs = set(runs_to_analyse)
    results = []
    # Sort so the result order does not depend on the file system.
    for filepath in sorted(result_dir.iterdir()):
        if not filepath.is_file():
            continue
        try:
            run = int(filepath.name.split("_")[1])
        except (IndexError, ValueError):
            # No run number in the name, so this is not a result file.
            continue
        if run not in wanted_runs:
            continue
        with filepath.open("r", encoding="utf-8") as json_in:
            result = json.load(json_in)
        if result["success"]:
            results.append(result)
    return results

In [None]:
# Runs 11-20 are analysed for Sonnet and runs 1-10 for Haiku (range ends are exclusive).
runs_to_analyse_sonnet = list(range(11,21))
runs_to_analyse_haiku = list(range(1,11))

# read_results keeps only the results whose "success" entry is truthy.
sonnet_runs = read_results(sonnet_path, runs_to_analyse_sonnet)
haiku_runs = read_results(haiku_path, runs_to_analyse_haiku)

In [None]:
# Number of successful Sonnet results that were loaded.
len(sonnet_runs)

In [None]:
# Number of successful Haiku results that were loaded.
len(haiku_runs)

In [None]:
# Look at the first loaded Sonnet result to see its structure.
sonnet_runs[0]

## Analysis

### Length Difference
* relate this to the length of the reference chain? (should the same length difference weigh less when the reference chain is longer?)
* what if the hypothesis chain is shorter? (e.g. reference is "get corpus name" & "get corpus", hypothesis is "get corpus") --> at the moment distance = 0 

In [None]:
# Absolute length difference between the example hypothesis chain and the
# example reference (optimal) chain.
length_difference = abs(len(hypothesis) - len(reference))

In [None]:
# calculate with manually set optimal length
def calculate_length_difference(hypothesis_length, reference_length):
    """Return how far the hypothesis length is from the reference length.

    The result is the absolute difference, so a chain that is too short and a
    chain that is too long by the same amount get the same value.
    """
    difference = hypothesis_length - reference_length
    return abs(difference)

### Ratio of absurd tools used

In [None]:
# Example set of tools that are plausible for the toy question.
possible_tools = {
    "get_corpora",
    "get_plays_in_corpus_by_title_helper",
    "get_play_characters",
}

In [None]:
# Distinct tools of the example hypothesis; order and repetitions are dropped.
hypothesis_set = set(hypothesis)

In [None]:
def get_absurd_tool_ratio(hypothesis_set, possible_tools):
    """Return the share of used tools that are not plausible for the question.

    Args:
        hypothesis_set: Set of distinct tool names that were used.
        possible_tools: Set of tool names considered plausible.

    Returns:
        The number of tools in ``hypothesis_set`` that are missing from
        ``possible_tools`` divided by the size of ``hypothesis_set``, or
        ``0.0`` when no tool was used at all.
    """
    # No tools used means no absurd tools; this also avoids a ZeroDivisionError.
    if not hypothesis_set:
        return 0.0
    absurd_tools = hypothesis_set.difference(possible_tools)
    return len(absurd_tools) / len(hypothesis_set)

### WER 
* add function to get optimal tool chain from df 

In [None]:
def get_wer_info(hypothesis_tools: list[str], reference_tools: list[list[str]]):
    """Score a tool chain against its closest reference chain.

    Every chain is joined into a space-separated string so that each tool name
    counts as one "word" for the word error rate.

    Args:
        hypothesis_tools: Tool names of the chain to evaluate.
        reference_tools: One or more reference chains to compare against.

    Returns:
        A dict for the reference with the lowest error rate (the first one on
        ties) with the keys ``"overall_error_rate"``, ``"insertions"``,
        ``"deletions"`` and ``"substitutions"``.

    Raises:
        ValueError: If ``reference_tools`` is empty or no reference yields a
            comparable error rate.
    """
    if not reference_tools:
        raise ValueError("reference_tools must contain at least one reference chain")

    hypothesis_str = " ".join(hypothesis_tools)
    # Infinity is a safe start value: the rate can exceed 1 when tools are inserted.
    best_score = float("inf")
    best_reference_str = None
    for reference in reference_tools:
        reference_str = " ".join(reference)
        error_rate = werx.wer(reference_str, hypothesis_str)
        # Strict comparison keeps the first reference on ties.
        if error_rate < best_score:
            best_score = error_rate
            best_reference_str = reference_str

    if best_reference_str is None:
        raise ValueError("no reference chain produced a comparable error rate")

    # Run the detailed analysis once, only for the winning reference.
    result = werx.analysis(best_reference_str, hypothesis_str)[0]
    return {
        "overall_error_rate": best_score,
        "insertions": result.inserted_words,
        "deletions": result.deleted_words,
        "substitutions": result.substituted_words,
    }

### Calculate Validation Metrics 

In [None]:
def get_possible_tools(possible_tools_df, question_id):
    """Return the set of tools marked as possible for a question.

    A tool counts as possible when its value in the question's column is
    greater than zero.
    """
    is_possible = possible_tools_df[question_id] > 0
    tool_names = possible_tools_df.loc[is_possible, "tool"]
    return set(tool_names)

In [None]:
def get_optimal_length(optimal_lengths_df, question_id):
    """Return the expected number of tools for a question.

    Reads the ``"number_of_tools_expected"`` entry from the column named
    after the question ID.
    """
    question_column = optimal_lengths_df[question_id]
    return question_column["number_of_tools_expected"]

In [None]:
def validate_tool_chains(results, possible_tools_df, optimal_lengths_df, baseline_tool_chains):
    """Attach the three tool-chain metrics to every result.

    Each result dict is updated in place with the keys
    ``"tool_path_length_difference"``, ``"absurd_tool_ratio"`` and
    ``"tool_error_rate"``; the list that was passed in is returned.
    """
    for run_result in results:
        qid, used_tools = run_result["id"], run_result["tool_chain"]

        # Distance between the used chain length and the manually set optimum.
        expected_length = get_optimal_length(optimal_lengths_df, qid)
        run_result["tool_path_length_difference"] = calculate_length_difference(
            len(used_tools), expected_length
        )

        # Share of distinct tools that are not plausible for this question.
        plausible_tools = get_possible_tools(possible_tools_df, qid)
        run_result["absurd_tool_ratio"] = get_absurd_tool_ratio(
            set(used_tools), plausible_tools
        )

        # Tool path error rate (WER) against the baseline chains of the question.
        run_result["tool_error_rate"] = get_wer_info(used_tools, baseline_tool_chains[qid])
    return results

In [None]:
# validate_tool_chains adds the metrics to the result dicts in place and returns
# the list it was given, so sonnet_evaluated / haiku_evaluated are the same
# objects as sonnet_runs / haiku_runs.
sonnet_evaluated = validate_tool_chains(sonnet_runs, possible_tools_df, optimal_lengths_df, baseline_tool_chain)
haiku_evaluated = validate_tool_chains(haiku_runs, possible_tools_df, optimal_lengths_df, baseline_tool_chain)

In [None]:
# Sonnet tool path investigation: for one question, print the tool chain and
# the length difference of every evaluated Sonnet result.

question_id = "3-2"

for i, entry in enumerate(sonnet_evaluated):
    if entry['id'] == question_id:
        print(entry['id'])
        print(f"{i}: Tools chain: {entry['tool_chain']}")
        print(entry['tool_path_length_difference'])
        print("-"*80)

In [None]:
# Haiku tool chain analysis: for one question, print the tool chain and the
# error-rate details of every evaluated Haiku result.
question_id = "3-2"

for i, entry in enumerate(haiku_evaluated):
    if entry['id'] == question_id:
        print(f"{i}: Tools chain: {entry['tool_chain']}")
        print(entry['tool_error_rate'])
        print("-"*80)

## Write validation results 

In [None]:
# Validated results are written below one root directory, with one
# sub-directory per model.
output_dir = Path("results_validated/")
output_dir_sonnet = output_dir / "sonnet-4"
output_dir_haiku = output_dir / "haiku-4-5"


In [None]:
def write_validated_results(result_path:Path, validated_runs:list[dict]):
    """Write every validated result to its own JSON file.

    Files are named ``<id>_<run>_validated-tools.json`` and placed in
    ``result_path``, which is created together with any missing parent
    directories.

    Args:
        result_path: Output directory.
        validated_runs: Result dicts; each needs the keys ``"id"`` and ``"run"``.
    """
    # parents=True: a nested output directory can be created even when its
    # parent does not exist yet.
    result_path.mkdir(parents=True, exist_ok=True)
    for entry in validated_runs:
        filename = result_path / f"{entry['id']}_{entry['run']}_validated-tools.json"
        with filename.open("w", encoding="utf-8") as file_out:
            json.dump(entry, file_out)

In [None]:
# Write one JSON file per validated Sonnet result.
write_validated_results(output_dir_sonnet, sonnet_evaluated)

In [None]:
# Write one JSON file per validated Haiku result.
write_validated_results(output_dir_haiku, haiku_evaluated)
