In [1145]:
import ast
import pandas as pd
import numpy as np
import json
import glob
import os
import re
from pathlib import Path
from pydracor import DraCorAPI
import plotly.express as px

## 1. Loading experiments results from JSON files to a single dataframe

In [1146]:
EXPERIMENT_PREFIXES = [
    "1-1",
    "1-2",
    "1-3",
    "1-4",
    "1-5",
    "3-1",
    "4-1",
    "4-2",
    "4-3",
    "4-4",
    "5-1",
    "5-2",
    "5-3",
    "5-4",
]

In [1147]:
model = 'haiku-4-5' #choose this for haiku-4-5
#model = 'sonnet-4' # choose this for sonnet-4

In [1148]:
# Path to the uploaded files
path = f"results/{model}/extracted/*.json" 

rows = []

for file in glob.glob(path):
    with open(file, "r") as f:
        data = json.load(f)

    filename = os.path.basename(file)
    
    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Extract the `response` field (if missing, set to None)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "success": success,
        "response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df = pd.DataFrame(rows)
df


Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain
0,3-2_3_extracted-info.json,3-2,False,,False,[]
1,2-1_3_extracted-info.json,2-1,False,,False,[]
2,4-4_6_extracted-info.json,4-4,True,Now let me calculate the percentage of female ...,,[get_corpus_metadata]
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play..."
4,4-3_10_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays sp...,,[get_corpus_metadata]
...,...,...,...,...,...,...
155,4-3_5_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays fr...,,[get_corpus_metadata]
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata]
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora]
158,4-3_4_extracted-info.json,4-3,True,Based on the metadata from the Swedish Drama C...,,"[get_corpus, get_corpus_metadata]"


In [1149]:
df['experiment_id'].value_counts()

experiment_id
3-2    10
2-1    10
4-4    10
1-5    10
4-3    10
1-3    10
5-2    10
4-2    10
5-1    10
1-2    10
1-1    10
5-3    10
4-1    10
1-4    10
5-4    10
3-1    10
Name: count, dtype: int64

### Basic stats on how many successful / failed runs 

(testing for 'request failure', step 1 in Henny's diagram)

In [1150]:
total_attempts = df.shape[0]

In [1151]:
df['success'].value_counts()

success
True     129
False     31
Name: count, dtype: int64

In [1152]:
total_suscesses = df['success'].sum()

In [1153]:
df[df['tool_chain'].str.len()>0].shape[0]

129

In [1154]:
total_tool_chains = df[df['tool_chain'].str.len()>0].shape[0]

In [1155]:
# valid True or null
df[df['valid']!=False].shape[0]

126

In [1156]:
not_invalid = df[(df['valid']!=False) & (df['success']==True)].shape[0]

In [1157]:
## no color settings
data = dict(
    number=[total_attempts, total_suscesses, total_tool_chains, not_invalid],
    stage=["Total attempts", "Total success (got response)", "Total Tool Chain Uses", "Valid Responses (or open questions)"])

fig = px.funnel(data, x='number', y='stage', title=model.title())
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.show()

In [1158]:
## no color settings
data = dict(
    number=[total_attempts, total_suscesses, total_tool_chains, not_invalid],
    stage=["Total attempts", "Total success (got response)", "Total Tool Chain Uses", "Valid Responses (or open questions)"])

fig = px.funnel(data, x='number', y='stage', title=model.title(),
                color_discrete_sequence=["#1f2448"])
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.show()

In [1159]:
# with color settings
# data = dict(
#     number=[total_attempts, total_suscesses, total_tool_chains, not_invalid],
#     stage=["Total attempts", "Total success (got response)", "Total Tool Chain Uses", "Valid Responses (or open questions)"])

# color_discrete_map={
        
#          "Total attempts": "#1f2448",
#          "Total success (got response)": "#fc9432",
#          "Total Tool Chain Uses": "#1f2448",
#          "Valid Responses (or open questions)": "#008a0e"
         
#      }

# fig = px.funnel(data, x='number', y='stage', title=model.title(), 
#                 color="stage", 
#                 color_discrete_map=color_discrete_map
#                 )
# fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
# fig.show()

In [1160]:
df[df['success']==True]['valid'].value_counts()

valid
True     96
False     3
Name: count, dtype: int64

## 2. Post-processing LLM responses for better automatic evaluation:

In [1161]:
def extract_last_number(s):
    if s is None:
        return None
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    if not nums:
        return None
    return int(nums[-1])  # take the last one

In [1162]:
df["numeric_response"] = df["response"].apply(extract_last_number)

In [1163]:
df

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response
0,3-2_3_extracted-info.json,3-2,False,,False,[],
1,2-1_3_extracted-info.json,2-1,False,,False,[],
2,4-4_6_extracted-info.json,4-4,True,Now let me calculate the percentage of female ...,,[get_corpus_metadata],19.0
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0
4,4-3_10_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays sp...,,[get_corpus_metadata],40.0
...,...,...,...,...,...,...,...
155,4-3_5_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays fr...,,[get_corpus_metadata],1890.0
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0
158,4-3_4_extracted-info.json,4-3,True,Based on the metadata from the Swedish Drama C...,,"[get_corpus, get_corpus_metadata]",4.0


In [1164]:
def extract_all_numbers(s):
    if s is None:
        return []
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    return [int(n) for n in nums]  # convert to ints

df["all_numbers"] = df["response"].apply(extract_all_numbers)

In [1165]:
df[df['experiment_id']=='1-5'][['filename', 'response', 'numeric_response', 'all_numbers']]

Unnamed: 0,filename,response,numeric_response,all_numbers
3,1-5_6_extracted-info.json,14,14.0,[14]
7,1-5_10_extracted-info.json,14,14.0,[14]
8,1-5_7_extracted-info.json,14,14.0,[14]
11,1-5_4_extracted-info.json,14,14.0,[14]
19,1-5_5_extracted-info.json,14,14.0,[14]
26,1-5_3_extracted-info.json,14,14.0,[14]
30,1-5_2_extracted-info.json,14,14.0,[14]
39,1-5_1_extracted-info.json,14,14.0,[14]
73,1-5_8_extracted-info.json,14,14.0,[14]
79,1-5_9_extracted-info.json,14,14.0,[14]


In [1166]:
df[df['experiment_id']=='1-5'][['response', 'numeric_response', 'all_numbers']]

Unnamed: 0,response,numeric_response,all_numbers
3,14,14.0,[14]
7,14,14.0,[14]
8,14,14.0,[14]
11,14,14.0,[14]
19,14,14.0,[14]
26,14,14.0,[14]
30,14,14.0,[14]
39,14,14.0,[14]
73,14,14.0,[14]
79,14,14.0,[14]


In [1167]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   filename          160 non-null    object 
 1   experiment_id     160 non-null    object 
 2   success           160 non-null    bool   
 3   response          160 non-null    object 
 4   valid             130 non-null    object 
 5   tool_chain        160 non-null    object 
 6   numeric_response  98 non-null     float64
 7   all_numbers       160 non-null    object 
dtypes: bool(1), float64(1), object(6)
memory usage: 9.0+ KB


In [1168]:
# stats = (
#     df_filtered.groupby("experiment_id")["numeric_response"]
#       .agg(["count", "mean", "std", "var", "min", "max"])
# )

# # add range as max-min
# stats["range"] = stats["max"] - stats["min"]

# stats

In [1169]:
df.groupby("experiment_id").size()

experiment_id
1-1    10
1-2    10
1-3    10
1-4    10
1-5    10
2-1    10
3-1    10
3-2    10
4-1    10
4-2    10
4-3    10
4-4    10
5-1    10
5-2    10
5-3    10
5-4    10
dtype: int64

In [1170]:
df.groupby("experiment_id")["numeric_response"].std()

experiment_id
1-1      0.000000
1-2      2.213594
1-3      7.774603
1-4     14.267290
1-5      0.000000
2-1           NaN
3-1      0.000000
3-2           NaN
4-1           NaN
4-2    905.015654
4-3    957.099606
4-4    151.638642
5-1    237.384884
5-2           NaN
5-3           NaN
5-4    433.390124
Name: numeric_response, dtype: float64

### Normalise responses to select-the-corpus questions (3-1, 3-2)

In [1171]:
# normalised response will contain the same as numeric_response for numeric questions 
# but also corpus slugs for 'which corpus' questions
df['normalised_response'] = df['numeric_response'].astype('string')
df['normalised_response'] = df['normalised_response'].str.replace('.0$', '', regex=True)

In [1172]:
df['normalised_response']

0      <NA>
1      <NA>
2        19
3        14
4        40
       ... 
155    1890
156     111
157      39
158       4
159     103
Name: normalised_response, Length: 160, dtype: string

In [1173]:
# this should all be replaced by the corpus slugs 
df[df['experiment_id'].isin(['3-1', '3-2'])]['normalised_response']


0      <NA>
6      <NA>
18     <NA>
29     <NA>
33     <NA>
35     <NA>
42     <NA>
48     <NA>
53     <NA>
96       39
102      39
103      39
108      39
116      39
119      39
123      39
125    <NA>
150      39
154      39
157      39
Name: normalised_response, dtype: string

In [1174]:
crpra = DraCorAPI().get_corpora()

In [1175]:
slugs = [corpus.name for corpus in crpra]

In [1176]:
_pattern = re.compile(r'\b(?:' + '|'.join(slugs) + r')\b', flags=re.IGNORECASE)

def find_last_corpus_slug(text: str) -> str | None:
    """Return the last DraCor slug mentioned as a whole word, or None."""
    last = None
    for match in _pattern.finditer(text):
        last = match.group(0).lower()  # normalize to lowercase slug
    return last

In [1177]:
mask = df['experiment_id'].isin(['3-1', '3-2'])
df.loc[mask, 'normalised_response'] = df.loc[mask, 'response'].apply(find_last_corpus_slug)

In [1178]:
df[df['experiment_id'].isin(['3-1', '3-2'])][['success','response','normalised_response']]

Unnamed: 0,success,response,normalised_response
0,False,,
6,False,,
18,False,,
29,False,,
33,False,,
35,False,,
42,True,Now I need to get metadata for each corpus to ...,
48,False,,
53,False,,
96,True,Now I'll calculate the mean number of characte...,gersh


In [1179]:
df[(df['experiment_id'].isin(['3-1', '3-2']) & df['success']==True)][['experiment_id','success','response','normalised_response']]

Unnamed: 0,experiment_id,success,response,normalised_response
42,3-2,True,Now I need to get metadata for each corpus to ...,
96,3-1,True,Now I'll calculate the mean number of characte...,gersh
102,3-1,True,Now I'll calculate the mean number of characte...,gersh
103,3-1,True,Now I'll calculate the mean number of characte...,gersh
108,3-1,True,Now I'll calculate the mean number of characte...,gersh
116,3-1,True,Now I'll calculate the mean number of characte...,gersh
119,3-1,True,Now I'll calculate the mean number of characte...,gersh
123,3-1,True,Now I'll calculate the mean number of characte...,gersh
150,3-1,True,Now I need to calculate the mean number of cha...,gersh
154,3-1,True,Now I'll calculate the mean number of characte...,gersh


## 3. Loading manually-defined correct responses

In [1180]:
correct = pd.read_csv("curated_data/autoEva_correct-answers.csv")

In [1181]:
correct

Unnamed: 0,ID,Correct Answer
0,1-1,103
1,1-2,103
2,1-3,103
3,1-4,103
4,1-5,14
5,2-1,9.19
6,3-1,gersh
7,3-2,fre
8,4-1,Open question
9,4-2,Open question


In [1182]:
print(correct)

     ID                                     Correct Answer
0   1-1                                                103
1   1-2                                                103
2   1-3                                                103
3   1-4                                                103
4   1-5                                                 14
5   2-1                                               9.19
6   3-1                                              gersh
7   3-2                                                fre
8   4-1                                      Open question
9   4-2                                      Open question
10  4-3                                      Open question
11  4-4                                      Open question
12  5-1  ["der kammerdiener", "camillo_rota", "angelo",...
13  5-2  ["der kammerdiener", "camillo_rota", "angelo",...
14  5-3                                          marinelli
15  5-4  ["dratmann", "foppendorf", "christinchen", "ch.

In [1183]:
correct_dict = dict(zip(correct["ID"], correct["Correct Answer"]))

In [1184]:
df['correct_answer'] = df['experiment_id'].map(correct_dict)

In [1185]:
df.head()

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19
2,4-4_6_extracted-info.json,4-4,True,Now let me calculate the percentage of female ...,,[get_corpus_metadata],19.0,"[18, 1475, 1500, 30, 40, 1500, 1550, 25, 35, 1...",19.0,Open question
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14.0,14
4,4-3_10_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays sp...,,[get_corpus_metadata],40.0,"[67, 1880, 1900, 1880, 1883, 4, 3, 1884, 8, 7,...",40.0,Open question


In [1186]:
print(df[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

  experiment_id  numeric_response correct_answer
0           3-2               NaN            fre
1           2-1               NaN           9.19
2           4-4              19.0  Open question
3           1-5              14.0             14
4           4-3              40.0  Open question
5           2-1               NaN           9.19
6           3-2               NaN            fre
7           1-5              14.0             14
8           1-5              14.0             14
9           4-4              18.0  Open question


In [1187]:
df_strictly_numeric = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') ]

In [1188]:
df_strictly_numeric.shape

(60, 10)

In [1189]:
print(df_strictly_numeric[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

   experiment_id  numeric_response correct_answer
1            2-1               NaN           9.19
3            1-5              14.0             14
5            2-1               NaN           9.19
7            1-5              14.0             14
8            1-5              14.0             14
11           1-5              14.0             14
12           1-3             127.0            103
15           2-1               NaN           9.19
19           1-5              14.0             14
21           1-2             103.0            103


In [1190]:
df_strictly_numeric[df_strictly_numeric['experiment_id'] == '1-3']

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127,103
24,1-3_8_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
56,1-3_1_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
58,1-3_2_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
63,1-3_3_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
68,1-3_10_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
69,1-3_5_extracted-info.json,1-3,True,111,True,[get_play_metadata],111.0,[111],111,103
77,1-3_4_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
84,1-3_7_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
93,1-3_6_extracted-info.json,1-3,True,111,True,[get_play_metadata],111.0,[111],111,103


## 4. Evaluating correctness of the LLM response (hit & miss table)

In [1191]:
def hit_miss(df, with_emojis=True):
    df = df.copy()
    df["is_correct"] = df["normalised_response"] == df["correct_answer"]
    df["iteration"] = df.groupby("experiment_id").cumcount() + 1
    df["question_id"] = df["experiment_id"]

    if with_emojis:
        df["emoji"] = df["is_correct"].map({1: "✅", 0: "❌"})
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="emoji")
            .sort_index()
            .sort_index(axis=1)
        )
    else:
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="is_correct")
            .sort_index()
            .sort_index(axis=1)
            .astype("Int64")
        )

    summary = (
        df.groupby("question_id")["is_correct"]
        .agg(["sum", "count"])
        .assign(
            label=lambda s: s.apply(
                lambda r: f"{r['sum']} correct answers of {r['count']} total answers",
                axis=1,
            )
        )
    )
    hit_table["Summary"] = summary.loc[hit_table.index, "label"]

    overall = summary[["sum", "count"]].sum()
    hit_table.loc["All experiments", :] = None
    hit_table.loc["All experiments", "Summary"] = (
        f"{overall['sum']} correct answers of {overall['count']} total answers"
    )

    return hit_table


In [1192]:
## Revised hit_miss function to handle multiple acceptable answers

import ast
import pandas as pd

def hit_miss(df, with_emojis=True):
    df = df.copy()

    def _to_answer_set(x):
        if pd.isna(x):
            return set()

        if isinstance(x, (list, tuple, set)):
            return {str(v).strip() for v in x if not pd.isna(v)}

        if isinstance(x, str):
            s = x.strip()
            if s.startswith("[") and s.endswith("]"):
                try:
                    parsed = ast.literal_eval(s)
                    if isinstance(parsed, (list, tuple, set)):
                        return {str(v).strip() for v in parsed if not pd.isna(v)}
                except (ValueError, SyntaxError):
                    pass
            return {s}

        return {str(x).strip()}

    # IDs
    df["question_id"] = df["experiment_id"]

    # Define what counts as an "answered" run:
    # - if a boolean 'success' exists, use it
    # - else infer from normalised_response being non-missing
    if "success" in df.columns:
        df["answered"] = df["success"].astype(bool)
    else:
        df["answered"] = ~pd.isna(df["normalised_response"])

    # Precompute acceptable answers per question
    acceptable = (
        df.groupby("question_id")["correct_answer"]
          .first()
          .apply(_to_answer_set)
          .to_dict()
    )

    def _is_correct_row(r):
        if not r["answered"]:
            return pd.NA  # <-- key change: non-answer stays NA (blank), not False
        ans = r["normalised_response"]
        return str(ans).strip() in acceptable.get(r["question_id"], set())

    df["is_correct"] = df.apply(_is_correct_row, axis=1)

    # Iteration numbering stays based on experiment_id (same as before)
    df["iteration"] = df.groupby("experiment_id").cumcount() + 1

    # Build table
    if with_emojis:
        df["emoji"] = df["is_correct"].map({True: "✅", False: "❌"})
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="emoji")
              .sort_index()
              .sort_index(axis=1)
        )
    else:
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="is_correct")
              .sort_index()
              .sort_index(axis=1)
              .astype("Int64")  # keeps <NA> as blank in CSV
        )

    # Summary: denominator should be ANSWERED runs only (i.e., is_correct not NA)
    summary = (
        df.groupby("question_id")["is_correct"]
          .agg(
              n_correct=lambda s: (s == True).sum(),
              n_answered=lambda s: s.notna().sum(),
          )
          .assign(
              label=lambda s: s.apply(
                  lambda r: f"{int(r['n_correct'])} correct answers of {int(r['n_answered'])} total answers",
                  axis=1,
              )
          )
    )

    hit_table["Summary"] = summary.loc[hit_table.index, "label"]

    # Overall: same denominator logic
    overall_correct = int(summary["n_correct"].sum())
    overall_answered = int(summary["n_answered"].sum())

    hit_table.loc["All experiments", :] = None
    hit_table.loc["All experiments", "Summary"] = (
        f"{overall_correct} correct answers of {overall_answered} total answers"
    )

    return hit_table


The version with "✅" and "❌" emojis:

In [1193]:
hit_miss(df_strictly_numeric)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
1-2,✅,✅,❌,✅,✅,✅,✅,✅,✅,✅,9 correct answers of 10 total answers
1-3,❌,✅,✅,✅,✅,✅,❌,✅,✅,❌,7 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,36 correct answers of 50 total answers


The version with 0 and 1

In [1194]:
#hit_table = hit_miss(df_strictly_numeric, with_emojis=False)
#hit_table.to_csv("hit_miss_table.csv")

What's up with 1-4? 

In [1195]:
df[df['experiment_id']=='1-4']

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
82,1-4_10_extracted-info.json,1-4,True,130,True,[get_play_metadata],130.0,[130],130,103
97,1-4_4_extracted-info.json,1-4,True,129,True,[get_play_metadata],129.0,[129],129,103
101,1-4_5_extracted-info.json,1-4,True,101,True,[get_play_metadata],101.0,[101],101,103
104,1-4_6_extracted-info.json,1-4,True,136,True,[get_play_metadata],136.0,[136],136,103
107,1-4_7_extracted-info.json,1-4,True,130,True,[get_play_metadata],130.0,[130],130,103
111,1-4_1_extracted-info.json,1-4,True,141,True,[get_play_metadata],141.0,[141],141,103
120,1-4_3_extracted-info.json,1-4,True,105,True,[get_play_metadata],105.0,[105],105,103
122,1-4_2_extracted-info.json,1-4,True,106,True,[get_play_metadata],106.0,[106],106,103
151,1-4_8_extracted-info.json,1-4,True,"Looking at the metadata, the characters array ...",False,[get_play_metadata],121.0,[121],121,103
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0,[111],111,103


## 6. Extend evaluation to 3-1, 3-2

In [1196]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') ]

In [1197]:
df_precise_answers

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19
6,3-2_2_extracted-info.json,3-2,False,,False,[],,[],,fre
...,...,...,...,...,...,...,...,...,...,...
153,1-2_5_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103
154,3-1_10_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0,[111],111,103
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh


In [1198]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
1-2,✅,✅,❌,✅,✅,✅,✅,✅,✅,✅,9 correct answers of 10 total answers
1-3,❌,✅,✅,✅,✅,✅,❌,✅,✅,❌,7 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
3-2,,,,,,,❌,,,,0 correct answers of 1 total answers
All experiments,,,,,,,,,,,46 correct answers of 61 total answers


In [1199]:
df01 = hit_miss(df_precise_answers, with_emojis=False)
df01

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 correct answers of 10 total answers
1-2,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9 correct answers of 10 total answers
1-3,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,7 correct answers of 10 total answers
1-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 correct answers of 10 total answers
1-5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 correct answers of 10 total answers
3-2,,,,,,,0.0,,,,0 correct answers of 1 total answers
All experiments,,,,,,,,,,,46 correct answers of 61 total answers


In [1200]:
#df01.to_csv("results/hit_miss_table.csv")

In [1201]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
1-2,✅,✅,❌,✅,✅,✅,✅,✅,✅,✅,9 correct answers of 10 total answers
1-3,❌,✅,✅,✅,✅,✅,❌,✅,✅,❌,7 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
3-2,,,,,,,❌,,,,0 correct answers of 1 total answers
All experiments,,,,,,,,,,,46 correct answers of 61 total answers


In [1202]:
df_precise_answers.query('success == True and normalised_response != correct_answer')[['filename','normalised_response', 
                                                                                      'correct_answer']]

Unnamed: 0,filename,normalised_response,correct_answer
12,1-3_9_extracted-info.json,127,103
69,1-3_5_extracted-info.json,111,103
82,1-4_10_extracted-info.json,130,103
93,1-3_6_extracted-info.json,111,103
97,1-4_4_extracted-info.json,129,103
100,1-2_8_extracted-info.json,96,103
101,1-4_5_extracted-info.json,101,103
104,1-4_6_extracted-info.json,136,103
107,1-4_7_extracted-info.json,130,103
111,1-4_1_extracted-info.json,141,103


## 7. Extend evaluation to 5- questions

In [1203]:
def get_last_token_as_response(somestring):
    if not isinstance(somestring, str):
        return None
    tokens = somestring.strip().split()
    if not tokens:
        return None
    return tokens[-1]

In [1204]:
mask = df['experiment_id'].str.startswith('5-')

df.loc[mask, 'normalised_response'] = (
    df.loc[mask, 'response']
      .apply(get_last_token_as_response)
      .str.lower()
)

In [1205]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
13,5-2_10_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],emilia,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
17,5-1_9_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
22,5-1_8_extracted-info.json,5-1,True,"Based on the network metrics, the most importa...",True,"[get_play_metadata, get_play_metrics]",9.0,"[9, 9]",marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
27,5-1_10_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
37,5-3_8_extracted-info.json,5-3,False,,False,[],,[],,marinelli
44,5-3_9_extracted-info.json,5-3,True,"Looking at the character data, I need to count...",True,[get_spoken_text_by_characters],,[],prinz,marinelli
46,5-3_4_extracted-info.json,5-3,True,Marinelli,True,[get_play_characters],,[],marinelli,marinelli
49,5-3_5_extracted-info.json,5-3,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],marinelli,marinelli
51,5-1_1_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_characters, get_play_metrics]",6.0,"[9, 0, 449, 9, 0, 449, 0, 247, 8, 0, 467, 0, 7...",marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
57,5-1_2_extracted-info.json,5-1,True,Marinelli,True,"[get_play_characters, get_play_metrics]",,[],marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."


In [1206]:
## hardcoded fix for now
#mask = (df["normalised_response"] == "Prinz") & (df["experiment_id"] != "5-3")
#df.loc[mask, 'correct_answer'] = 'Prinz'

#mask = (df['normalised_response']=='Der_Prinz') & (df["experiment_id"] != "5-3")
#df.loc[mask, 'correct_answer'] = 'Der_Prinz'

In [1207]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
13,5-2_10_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],emilia,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
17,5-1_9_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
22,5-1_8_extracted-info.json,5-1,True,"Based on the network metrics, the most importa...",True,"[get_play_metadata, get_play_metrics]",9.0,"[9, 9]",marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
27,5-1_10_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
37,5-3_8_extracted-info.json,5-3,False,,False,[],,[],,marinelli
44,5-3_9_extracted-info.json,5-3,True,"Looking at the character data, I need to count...",True,[get_spoken_text_by_characters],,[],prinz,marinelli
46,5-3_4_extracted-info.json,5-3,True,Marinelli,True,[get_play_characters],,[],marinelli,marinelli
49,5-3_5_extracted-info.json,5-3,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],marinelli,marinelli
51,5-1_1_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_characters, get_play_metrics]",6.0,"[9, 0, 449, 9, 0, 449, 0, 247, 8, 0, 467, 0, 7...",marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
57,5-1_2_extracted-info.json,5-1,True,Marinelli,True,"[get_play_characters, get_play_metrics]",,[],marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."


In [1208]:
df.columns 

Index(['filename', 'experiment_id', 'success', 'response', 'valid',
       'tool_chain', 'numeric_response', 'all_numbers', 'normalised_response',
       'correct_answer'],
      dtype='object')

Output format for saving to csv (put 'response' as the last column because they are very long)

In [1209]:
df[['filename', 'experiment_id', 'success', 'valid',
       'tool_chain', 'normalised_response', 
       'correct_answer', 'numeric_response', 'all_numbers', 'response']]

Unnamed: 0,filename,experiment_id,success,valid,tool_chain,normalised_response,correct_answer,numeric_response,all_numbers,response
0,3-2_3_extracted-info.json,3-2,False,False,[],,fre,,[],
1,2-1_3_extracted-info.json,2-1,False,False,[],,9.19,,[],
2,4-4_6_extracted-info.json,4-4,True,,[get_corpus_metadata],19,Open question,19.0,"[18, 1475, 1500, 30, 40, 1500, 1550, 25, 35, 1...",Now let me calculate the percentage of female ...
3,1-5_6_extracted-info.json,1-5,True,True,"[get_plays_in_corpus_by_title_helper, get_play...",14,14,14.0,[14],14
4,4-3_10_extracted-info.json,4-3,True,,[get_corpus_metadata],40,Open question,40.0,"[67, 1880, 1900, 1880, 1883, 4, 3, 1884, 8, 7,...",Based on the Swedish drama corpus (67 plays sp...
...,...,...,...,...,...,...,...,...,...,...
155,4-3_5_extracted-info.json,4-3,True,,[get_corpus_metadata],1890,Open question,1890.0,"[67, 1880, 1900, 1880, 1889, 60, 65, 35, 40, 1...",Based on the Swedish drama corpus (67 plays fr...
156,1-4_9_extracted-info.json,1-4,True,True,[get_play_metadata],111,103,111.0,[111],111
157,3-1_8_extracted-info.json,3-1,True,True,[get_corpora],gersh,gersh,39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",Now I'll calculate the mean number of characte...
158,4-3_4_extracted-info.json,4-3,True,,"[get_corpus, get_corpus_metadata]",4,Open question,4.0,"[68, 1880, 1900, 1880, 1886, 2, 1, 1887, 1893,...",Based on the metadata from the Swedish Drama C...


In [1210]:
df[['filename', 'experiment_id', 'success', 'valid',
       'tool_chain', 'normalised_response', 
       'correct_answer', 'numeric_response', 'all_numbers', 'response']].to_csv(f"results/compiled_responses_{model}.csv", index=False)

Select only questions with non-open answers

In [1211]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') |
                         df['experiment_id'].str.startswith('5-')
                         ]

In [1212]:
df_precise_answers = df_precise_answers.copy()

In [1213]:
## how many questions do we cover here? should be 12
df_precise_answers['experiment_id'].unique().shape[0]

12

### get stats for the funnel

In [1214]:
## to handle answers that have multiple acceptable options
def _to_answer_set(x):
    if pd.isna(x):
        return set()

    if isinstance(x, (list, tuple, set)):
        return {str(v).strip() for v in x if not pd.isna(v)}

    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple, set)):
                    return {str(v).strip() for v in parsed if not pd.isna(v)}
            except (ValueError, SyntaxError):
                pass
        return {s}

    return {str(x).strip()}

In [1215]:
# Build acceptable-answer sets per question
answer_sets = (
    df_precise_answers
        .groupby("experiment_id")["correct_answer"]
        .first()
        .apply(_to_answer_set)
)

In [1216]:
total_non_open = df_precise_answers.shape[0]
total_non_open

120

In [1217]:
total_non_open

120

In [1218]:
non_open_success = df_precise_answers['success'].sum()
non_open_success

np.int64(99)

In [1219]:
non_open_tool_chains = df_precise_answers[df_precise_answers['tool_chain'].str.len()>0].shape[0]
non_open_tool_chains

99

In [1220]:
non_open_suc_valid = df_precise_answers[(df_precise_answers['valid']!=False) 
                                          & (df_precise_answers['success']==True)].shape[0]
non_open_suc_valid

96

In [1221]:
df_precise_answers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 159
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   filename             120 non-null    object 
 1   experiment_id        120 non-null    object 
 2   success              120 non-null    bool   
 3   response             120 non-null    object 
 4   valid                120 non-null    object 
 5   tool_chain           120 non-null    object 
 6   numeric_response     68 non-null     float64
 7   all_numbers          120 non-null    object 
 8   normalised_response  98 non-null     string 
 9   correct_answer       120 non-null    object 
dtypes: bool(1), float64(1), object(7), string(1)
memory usage: 9.5+ KB


In [1222]:
# Apply membership test
df_precise_answers["is_correct_raw"] = df_precise_answers.apply(
    lambda r: str(r["response"]).strip().lower()
              in answer_sets.get(r["experiment_id"], set()),
    axis=1
)

In [1223]:
## basic comparison
#df_precise_answers['is_correct_raw'] = df_precise_answers['response'].astype(str) == df_precise_answers['correct_answer'].astype(str)

In [1224]:
non_open_correct_raw = df_precise_answers['is_correct_raw'].sum()
non_open_correct_raw

np.int64(62)

In [1225]:
# correct ones
df_precise_answers[df_precise_answers['is_correct_raw']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
7,1-5_10_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
8,1-5_7_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
11,1-5_4_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
13,5-2_10_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],emilia,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",...",True
...,...,...,...,...,...,...,...,...,...,...,...
146,1-2_6_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,True
147,5-2_2_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],emilia,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",...",True
149,5-2_1_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],emilia,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",...",True
153,1-2_5_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,True


In [1226]:
# wrong ones
df_precise_answers[~df_precise_answers['is_correct_raw'] & df_precise_answers['success']==True]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127,103,False
22,5-1_8_extracted-info.json,5-1,True,"Based on the network metrics, the most importa...",True,"[get_play_metadata, get_play_metrics]",9.0,"[9, 9]",marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",...",False
42,3-2_5_extracted-info.json,3-2,True,Now I need to get metadata for each corpus to ...,True,"[get_corpora, get_corpus_metadata, get_corpus_...",,[],,fre,False
44,5-3_9_extracted-info.json,5-3,True,"Looking at the character data, I need to count...",True,[get_spoken_text_by_characters],,[],prinz,marinelli,False
51,5-1_1_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_characters, get_play_metrics]",6.0,"[9, 0, 449, 9, 0, 449, 0, 247, 8, 0, 467, 0, 7...",marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",...",False
64,5-1_3_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_metadata, get_play_metrics]",383.0,"[9, 0, 247, 9, 0, 247, 0, 449, 8, 0, 467, 0, 3...",marinelli,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",...",False
69,1-3_5_extracted-info.json,1-3,True,111,True,[get_play_metadata],111.0,[111],111,103,False
75,5-3_10_extracted-info.json,5-3,True,"Looking at the spoken text data, I need to cou...",True,"[get_play_metadata, get_spoken_text_by_charact...",,[],prinz,marinelli,False
82,1-4_10_extracted-info.json,1-4,True,130,True,[get_play_metadata],130.0,[130],130,103,False
83,5-3_3_extracted-info.json,5-3,True,Der Prinz,True,"[get_play_metadata, get_play_metrics]",,[],prinz,marinelli,False


In [1227]:
# wrong ones
#df_precise_answers[~df_precise_answers['is_correct_raw'] & df_precise_answers['success']==True][['filename','response','normalised_response', 'correct_answer']].to_csv(f"results/wrong_responses_{model}.csv", index=False)

In [1228]:

# Apply membership test
df_precise_answers["is_correct_norm"] = df_precise_answers.apply(
    lambda r: str(r["normalised_response"]).strip().lower()
              in answer_sets.get(r["experiment_id"], set()),
    axis=1
)

In [1229]:
#check_norm = df_precise_answers['normalised_response'].astype(str) == df_precise_answers['correct_answer'].astype(str)
#df_precise_answers['is_correct_norm'] = check_norm
non_open_correct_norm = df_precise_answers['is_correct_norm'].sum()
non_open_correct_norm

np.int64(81)

In [1230]:
# mismatch of the normalised answer with the correct on (so, REALLY wrong)
df_precise_answers[~df_precise_answers['is_correct_norm']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
6,3-2_2_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127,103,False,False
15,2-1_1_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
18,3-2_1_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
28,2-1_6_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
29,3-2_6_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
33,3-2_7_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False


In [1231]:
from IPython.display import HTML

HTML("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');
</style>
""")

fig.update_layout(
    font=dict(family="Inter, sans-serif", size=12, color="#1f2444")
)



In [1232]:
data = dict(
    number=[total_non_open, non_open_success, 
            non_open_tool_chains, non_open_suc_valid, 
            non_open_correct_raw, non_open_correct_norm
            ],
    stage=["Total attempts (non-open questions)", "Total success (got response)", 
           "Total Tool Chain Uses", "Valid Responses",
           "Correct answers (direct match)", "Correct answers (direct + normalised match)"
           ])

fig = px.funnel(data, x='number', y='stage', title=model.title(),
                color_discrete_sequence=["#1f2448"])

fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks


fig.update_layout(
    font=dict(family="Inter, sans-serif", size=14, color="#1f2444")
)

fig.write_image(f"results/images/{model}_results_funnel.png", scale=300/96)
fig.show()

## 8. Add toolchain evaluation

Get toolchain validation data into a separate df

In [1233]:
# Path to the uploaded files
path = f"results_validated/{model}/*.json" 

rows = []

for file in glob.glob(path):
    with open(file, "r") as f:
        data = json.load(f)

    filename = os.path.basename(file)
    
    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Run ID is always the first part before the first "_"
    run_id = filename.split("_validated")[0]  # e.g. "1-1_17"

    # Extract the `response` field (if missing, set to None)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)
    absurd_tool_ratio = data.get("absurd_tool_ratio", None)
    tool_path_length_difference = data.get("tool_path_length_difference", None)
    tool_error_rate = data.get("tool_error_rate", None)
    overall_error_rate = tool_error_rate.get("overall_error_rate")

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "run_id": run_id,
        "absurd_tool_ratio": absurd_tool_ratio,
        "overall_error_rate": overall_error_rate,
        "tool_path_length_difference": tool_path_length_difference,
        "success": success,
        #"response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df_tool_chains = pd.DataFrame(rows)
df_tool_chains

Unnamed: 0,filename,experiment_id,run_id,absurd_tool_ratio,overall_error_rate,tool_path_length_difference,success,valid,tool_chain
0,3-1_9_validated-tools.json,3-1,3-1_9,0.0,0.0,0,True,True,[get_corpora]
1,5-1_10_validated-tools.json,5-1,5-1_10,0.0,0.5,1,True,True,"[get_play_metadata, get_play_metrics]"
2,5-2_5_validated-tools.json,5-2,5-2_5,0.0,0.0,0,True,True,[get_play_characters]
3,4-4_5_validated-tools.json,4-4,4-4_5,0.0,0.0,0,True,,[get_corpus_metadata]
4,1-2_5_validated-tools.json,1-2,1-2_5,0.0,1.0,1,True,True,"[get_plays_in_corpus_by_title_helper, get_play..."
...,...,...,...,...,...,...,...,...,...
124,5-2_3_validated-tools.json,5-2,5-2_3,0.0,0.0,0,True,True,[get_play_characters]
125,1-5_9_validated-tools.json,1-5,1-5_9,0.0,1.0,1,True,True,"[get_plays_in_corpus_by_title_helper, get_play..."
126,5-2_6_validated-tools.json,5-2,5-2_6,0.0,0.0,0,True,True,[get_play_characters]
127,4-4_6_validated-tools.json,4-4,4-4_6,0.0,0.0,0,True,,[get_corpus_metadata]


### Tool efficiency averages for the model

In [1234]:
df_tool_chains['absurd_tool_ratio'].mean()

np.float64(0.033591731266149866)

In [1235]:
df_tool_chains['absurd_tool_ratio'].value_counts()

absurd_tool_ratio
0.000000    121
0.500000      6
1.000000      1
0.333333      1
Name: count, dtype: int64

In [1236]:
df_tool_chains['overall_error_rate'].mean()

np.float64(0.6293425208153891)

In [1237]:
df_tool_chains['overall_error_rate'].value_counts()

overall_error_rate
1.000000    56
0.000000    49
0.500000    14
2.000000     9
0.185185     1
Name: count, dtype: int64

In [1238]:
df_tool_chains['tool_path_length_difference'].mean()

np.float64(0.689922480620155)

In [1239]:
df_tool_chains['tool_path_length_difference'].value_counts()

tool_path_length_difference
1    59
0    56
2    13
4     1
Name: count, dtype: int64

In [1240]:
df_tool_chains['tool_path_length_difference']

0      0
1      1
2      0
3      0
4      1
      ..
124    0
125    1
126    0
127    0
128    2
Name: tool_path_length_difference, Length: 129, dtype: int64

In [1241]:
df_tool_chains.groupby('experiment_id')['overall_error_rate'].mean()

experiment_id
1-1    1.000000
1-2    1.000000
1-3    1.700000
1-4    1.000000
1-5    1.000000
3-1    0.000000
3-2    0.185185
4-2    0.700000
4-3    0.400000
4-4    0.400000
5-1    0.400000
5-2    0.000000
5-3    0.437500
5-4    0.150000
Name: overall_error_rate, dtype: float64

In [1242]:
df

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19
2,4-4_6_extracted-info.json,4-4,True,Now let me calculate the percentage of female ...,,[get_corpus_metadata],19.0,"[18, 1475, 1500, 30, 40, 1500, 1550, 25, 35, 1...",19,Open question
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14
4,4-3_10_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays sp...,,[get_corpus_metadata],40.0,"[67, 1880, 1900, 1880, 1883, 4, 3, 1884, 8, 7,...",40,Open question
...,...,...,...,...,...,...,...,...,...,...
155,4-3_5_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays fr...,,[get_corpus_metadata],1890.0,"[67, 1880, 1900, 1880, 1889, 60, 65, 35, 40, 1...",1890,Open question
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0,[111],111,103
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh
158,4-3_4_extracted-info.json,4-3,True,Based on the metadata from the Swedish Drama C...,,"[get_corpus, get_corpus_metadata]",4.0,"[68, 1880, 1900, 1880, 1886, 2, 1, 1887, 1893,...",4,Open question


### Combine with correctness info and analyse correlation

In [1243]:
df_precise_answers['run_id'] = df_precise_answers['filename'].apply(lambda x: x.split("_extracted")[0])
df_precise_answers['run_id']

0       3-2_3
1       2-1_3
3       1-5_6
5       2-1_2
6       3-2_2
        ...  
153     1-2_5
154    3-1_10
156     1-4_9
157     3-1_8
159     1-2_4
Name: run_id, Length: 120, dtype: object

In [1244]:
to_merge = df_precise_answers[['run_id', 'is_correct_norm', 'is_correct_raw']]
to_merge = to_merge.rename(columns={"is_correct_raw": "is_correct_raw"})
to_merge

Unnamed: 0,run_id,is_correct_norm,is_correct_raw
0,3-2_3,False,False
1,2-1_3,False,False
3,1-5_6,True,True
5,2-1_2,False,False
6,3-2_2,False,False
...,...,...,...
153,1-2_5,True,True
154,3-1_10,True,False
156,1-4_9,False,False
157,3-1_8,True,False


In [1245]:
merged = (
    df_tool_chains.merge(
        to_merge,  
        on="run_id",
        how="left",  
        validate="one_to_one"
    )
)

merged.head()

Unnamed: 0,filename,experiment_id,run_id,absurd_tool_ratio,overall_error_rate,tool_path_length_difference,success,valid,tool_chain,is_correct_norm,is_correct_raw
0,3-1_9_validated-tools.json,3-1,3-1_9,0.0,0.0,0,True,True,[get_corpora],True,False
1,5-1_10_validated-tools.json,5-1,5-1_10,0.0,0.5,1,True,True,"[get_play_metadata, get_play_metrics]",True,True
2,5-2_5_validated-tools.json,5-2,5-2_5,0.0,0.0,0,True,True,[get_play_characters],True,True
3,4-4_5_validated-tools.json,4-4,4-4_5,0.0,0.0,0,True,,[get_corpus_metadata],,
4,1-2_5_validated-tools.json,1-2,1-2_5,0.0,1.0,1,True,True,"[get_plays_in_corpus_by_title_helper, get_play...",True,True


In [1246]:
tool_use_per_exp_ID = merged.groupby('experiment_id')[['tool_path_length_difference', 'absurd_tool_ratio', 'overall_error_rate']].mean()

In [1247]:
tool_use_per_exp_ID.to_csv(f"results/{model}_tool_use_per_experiment_id.csv")

In [1248]:
corr = merged["overall_error_rate"].corr(merged["is_correct_norm"])
print(corr)

-0.21331392291153847


In [1249]:
corr = merged["tool_path_length_difference"].corr(merged["is_correct_norm"])
print(corr)

0.23171454978810405


In [1250]:
corr = merged["absurd_tool_ratio"].corr(merged["is_correct_norm"])
print(corr)

-0.24509803921568624


In [1251]:
df_precise_answers.groupby('experiment_id')['is_correct_norm'].mean()

experiment_id
1-1    1.0
1-2    0.9
1-3    0.7
1-4    0.0
1-5    1.0
2-1    0.0
3-1    1.0
3-2    0.0
5-1    1.0
5-2    1.0
5-3    0.5
5-4    1.0
Name: is_correct_norm, dtype: float64

In [1252]:
corr = merged["overall_error_rate"].corr(merged["tool_path_length_difference"])
print(corr)

0.5969207786641286


In [1253]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] != True)].shape

(17, 11)

In [1254]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] == True)].shape

(32, 11)

In [1255]:
merged[(merged['overall_error_rate'] == 0)].shape

(49, 11)

In [1256]:
df.query('experiment_id == "4-1"')

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
52,4-1_10_extracted-info.json,4-1,False,,False,[],,[],,Open question
112,4-1_8_extracted-info.json,4-1,False,,False,[],,[],,Open question
114,4-1_9_extracted-info.json,4-1,False,,False,[],,[],,Open question
126,4-1_6_extracted-info.json,4-1,False,,False,[],,[],,Open question
129,4-1_7_extracted-info.json,4-1,False,,False,[],,[],,Open question
134,4-1_4_extracted-info.json,4-1,False,,False,[],,[],,Open question
140,4-1_5_extracted-info.json,4-1,False,,False,[],,[],,Open question
143,4-1_3_extracted-info.json,4-1,False,,False,[],,[],,Open question
148,4-1_2_extracted-info.json,4-1,False,,False,[],,[],,Open question
152,4-1_1_extracted-info.json,4-1,False,,False,[],,[],,Open question


### 2026-01-08 Variance analysis

In [1257]:
df_precise_answers.query('experiment_id == "5-2" and success == True')[['filename','response','normalised_response', 'numeric_response', 'correct_answer']]

Unnamed: 0,filename,response,normalised_response,numeric_response,correct_answer
13,5-2_10_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
110,5-2_8_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
115,5-2_9_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
124,5-2_6_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
130,5-2_7_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
135,5-2_4_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
139,5-2_5_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
144,5-2_3_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
147,5-2_2_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
149,5-2_1_extracted-info.json,Emilia,emilia,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."


In [1258]:
df_precise_answers.query('experiment_id == "5-1" and success == True')[['filename','response','normalised_response', 'numeric_response', 'correct_answer']]

Unnamed: 0,filename,response,normalised_response,numeric_response,correct_answer
17,5-1_9_extracted-info.json,Marinelli,marinelli,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
22,5-1_8_extracted-info.json,"Based on the network metrics, the most importa...",marinelli,9.0,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
27,5-1_10_extracted-info.json,Marinelli,marinelli,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
51,5-1_1_extracted-info.json,"Based on the network metrics, Marinelli has th...",marinelli,6.0,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
57,5-1_2_extracted-info.json,Marinelli,marinelli,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
64,5-1_3_extracted-info.json,"Based on the network metrics, Marinelli has th...",marinelli,383.0,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
67,5-1_5_extracted-info.json,Der Prinz,prinz,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
78,5-1_4_extracted-info.json,der_prinz,der_prinz,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
86,5-1_7_extracted-info.json,Der Prinz,prinz,,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."
91,5-1_6_extracted-info.json,"Based on the network metrics, Marinelli has th...",marinelli,449.0,"[""der kammerdiener"", ""camillo_rota"", ""angelo"",..."


In [1259]:
print(df_precise_answers.query('experiment_id == "4-1" and success == True')['normalised_response'])

Series([], Name: normalised_response, dtype: string)


In [1260]:
df_precise_answers.query('experiment_id == "4-1"')

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm,run_id


In [1261]:
p = df_precise_answers.query('experiment_id == "5-1" and success == True')['normalised_response'].value_counts(normalize=True)
gini = 1 - np.sum(p**2)
print(gini)

0.4600000000000001


In [1262]:
p = df_precise_answers.query('experiment_id == "5-2" and success == True')['normalised_response'].value_counts(normalize=True)
gini = 1 - np.sum(p**2)
print(gini)

0.0


In [1263]:
print(df_precise_answers.query('success == True')[['experiment_id', 'normalised_response']].head(10))

   experiment_id normalised_response
3            1-5                  14
7            1-5                  14
8            1-5                  14
11           1-5                  14
12           1-3                 127
13           5-2              emilia
17           5-1           marinelli
19           1-5                  14
21           1-2                 103
22           5-1           marinelli


In [1264]:
gini_impurity = (
    df.groupby("experiment_id")["normalised_response"]
      .apply(lambda s: 1 - np.sum(s.value_counts(normalize=True).to_numpy() ** 2))
      .rename("gini_impurity")
      .reset_index()
)

gini_impurity

Unnamed: 0,experiment_id,gini_impurity
0,1-1,0.0
1,1-2,0.18
2,1-3,0.46
3,1-4,0.88
4,1-5,0.0
5,2-1,1.0
6,3-1,0.0
7,3-2,1.0
8,4-1,1.0
9,4-2,0.86


In [1265]:
gini_impurity.to_csv(f"results/{model}_gini_impurity.csv", index=False)

In [1266]:
df_precise_answers

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm,run_id
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False,3-2_3
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_3
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True,True,1-5_6
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_2
6,3-2_2_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False,3-2_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,1-2_5_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,True,True,1-2_5
154,3-1_10_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh,False,True,3-1_10
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0,[111],111,103,False,False,1-4_9
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh,False,True,3-1_8


In [1267]:
df_precise_answers.query('experiment_id == "2-1"')

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm,run_id
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_3
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_2
15,2-1_1_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_1
28,2-1_6_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_6
34,2-1_7_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_7
36,2-1_4_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_4
41,2-1_5_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_5
47,2-1_8_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_8
55,2-1_9_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_9
106,2-1_10_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_10


In [1268]:
summary = (
    df_precise_answers.groupby("experiment_id")
    .agg(
        n_success=("success", "sum"),
        n_correct=("is_correct_norm", "sum"),
        n_unique=("normalised_response", "nunique"),
        gini_impurity=(
            "normalised_response",
            lambda s: 1 - np.sum(s.value_counts(normalize=True).to_numpy() ** 2)
        )
    )
    .reset_index()
)
summary

Unnamed: 0,experiment_id,n_success,n_correct,n_unique,gini_impurity
0,1-1,10,10,1,0.0
1,1-2,10,9,2,0.18
2,1-3,10,7,3,0.46
3,1-4,10,0,9,0.88
4,1-5,10,10,1,0.0
5,2-1,0,0,0,1.0
6,3-1,10,10,1,0.0
7,3-2,1,0,0,1.0
8,5-1,10,10,3,0.46
9,5-2,10,10,1,0.0


In [1269]:
summary.to_csv(f"results/{model}_response_diversity_summary.csv", index=False)

### Create updated hit and miss tables

In [1270]:
df_precise_answers

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm,run_id
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False,3-2_3
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_3
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True,True,1-5_6
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False,2-1_2
6,3-2_2_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False,3-2_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,1-2_5_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,True,True,1-2_5
154,3-1_10_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh,False,True,3-1_10
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0,[111],111,103,False,False,1-4_9
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh,False,True,3-1_8


In [1271]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
1-2,✅,✅,❌,✅,✅,✅,✅,✅,✅,✅,9 correct answers of 10 total answers
1-3,❌,✅,✅,✅,✅,✅,❌,✅,✅,❌,7 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
3-2,,,,,,,❌,,,,0 correct answers of 1 total answers
5-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
5-2,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers


In [1272]:
hit_miss(df_precise_answers, with_emojis=False).to_csv(f"results/hit_miss_table_{model}.csv")