In [286]:
import pandas as pd
import numpy as np
import json
import glob
import os
import re
from pathlib import Path
from pydracor import DraCorAPI
import plotly.express as px

## 1. Loading experiments results from JSON files to a single dataframe

In [287]:
EXPERIMENT_PREFIXES = [
    "1-1",
    "1-2",
    "1-3",
    "1-4",
    "1-5",
    "3-1",
    "4-1",
    "4-2",
    "4-3",
    "4-4",
    "5-1",
    "5-2",
    "5-3",
    "5-4",
]

In [288]:
#model = 'haiku-4-5' #choose this for haiku-4-5
model = 'sonnet-4' # choose this for sonnet-4

In [289]:
# Path to the uploaded files
path = f"results/{model}/extracted/*.json" 

rows = []

for file in glob.glob(path):
    with open(file, "r") as f:
        data = json.load(f)

    filename = os.path.basename(file)
    
    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Extract the `response` field (if missing, set to None)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "success": success,
        "response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df = pd.DataFrame(rows)
df


Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain
0,4-4_12_extracted-info.json,4-4,True,"Based on the data from ItaDraCor, I can now an...",,"[get_corpus, get_corpus_metadata_csv]"
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play..."
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters]
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters]
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel..."
...,...,...,...,...,...,...
155,3-2_17_extracted-info.json,3-2,False,,False,[]
156,1-5_20_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play..."
157,3-2_16_extracted-info.json,3-2,False,,False,[]
158,3-1_11_extracted-info.json,3-1,True,"Based on the corpora information, I need to ca...",True,[get_corpora]


In [290]:
df['experiment_id'].value_counts()

experiment_id
4-4    10
1-2    10
5-2    10
5-1    10
1-5    10
1-1    10
4-3    10
3-1    10
4-1    10
4-2    10
1-4    10
5-4    10
1-3    10
5-3    10
3-2    10
2-1    10
Name: count, dtype: int64

### Basic stats on how many successful / failed runs 

(testing for 'request failure', step 1 in Henny's diagram)

In [291]:
total_attempts = df.shape[0]

In [292]:
df['success'].value_counts()

success
True     130
False     30
Name: count, dtype: int64

In [293]:
total_suscesses = df['success'].sum()

In [294]:
df[df['tool_chain'].str.len()>0].shape[0]

130

In [295]:
total_tool_chains = df[df['tool_chain'].str.len()>0].shape[0]

In [296]:
# valid True or null
df[df['valid']!=False].shape[0]

123

In [297]:
not_invalid = df[(df['valid']!=False) & (df['success']==True)].shape[0]

In [298]:
data = dict(
    number=[total_attempts, total_suscesses, total_tool_chains, not_invalid],
    stage=["Total attempts", "Total success (got response)", "Total Tool Chain Uses", "Valid Responses (or open questions)"])

# color_discrete_map={
#         "Total attempts": "#636EFA",
#         "Total success (got response)": "#00CC96",
#         "Total Tool Chain Uses": "#AB63FA",
#         "Valid Responses (or open questions)": "#FFA15A"
#     }

fig = px.funnel(data, x='number', y='stage', title=model.title(), 
                #color="stage", 
                #color_discrete_map=color_discrete_map
                )
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.show()

In [299]:
df[df['success']==True]['valid'].value_counts()

valid
True     91
False     7
Name: count, dtype: int64

## 2. Post-processing LLM responses for better automatic evaluation:

In [300]:
def extract_last_number(s):
    if s is None:
        return None
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    if not nums:
        return None
    return int(nums[-1])  # take the last one

In [301]:
df["numeric_response"] = df["response"].apply(extract_last_number)

In [302]:
df

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response
0,4-4_12_extracted-info.json,4-4,True,"Based on the data from ItaDraCor, I can now an...",,"[get_corpus, get_corpus_metadata_csv]",25.0
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0
...,...,...,...,...,...,...,...
155,3-2_17_extracted-info.json,3-2,False,,False,[],
156,1-5_20_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0
157,3-2_16_extracted-info.json,3-2,False,,False,[],
158,3-1_11_extracted-info.json,3-1,True,"Based on the corpora information, I need to ca...",True,[get_corpora],39.0


In [303]:
def extract_all_numbers(s):
    if s is None:
        return []
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    return [int(n) for n in nums]  # convert to ints

df["all_numbers"] = df["response"].apply(extract_all_numbers)

In [304]:
df[df['experiment_id']=='1-5'][['filename', 'response', 'numeric_response', 'all_numbers']]

Unnamed: 0,filename,response,numeric_response,all_numbers
4,1-5_11_extracted-info.json,14,14.0,[14]
15,1-5_13_extracted-info.json,14,14.0,[14]
20,1-5_12_extracted-info.json,"Perfect! I can see that ""Der Nollhart"" has 14 ...",14.0,"[14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
29,1-5_14_extracted-info.json,14,14.0,[14]
37,1-5_15_extracted-info.json,"Perfect! I can see from the metadata that ""Der...",14.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
51,1-5_16_extracted-info.json,14,14.0,[14]
60,1-5_17_extracted-info.json,14,14.0,[14]
87,1-5_18_extracted-info.json,Perfect! I can count the characters from the r...,14.0,"[14, 14]"
92,1-5_19_extracted-info.json,,,[]
156,1-5_20_extracted-info.json,14,14.0,[14]


In [305]:
df[df['experiment_id']=='1-5'][['response', 'numeric_response', 'all_numbers']]

Unnamed: 0,response,numeric_response,all_numbers
4,14,14.0,[14]
15,14,14.0,[14]
20,"Perfect! I can see that ""Der Nollhart"" has 14 ...",14.0,"[14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
29,14,14.0,[14]
37,"Perfect! I can see from the metadata that ""Der...",14.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
51,14,14.0,[14]
60,14,14.0,[14]
87,Perfect! I can count the characters from the r...,14.0,"[14, 14]"
92,,,[]
156,14,14.0,[14]


In [306]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   filename          160 non-null    object 
 1   experiment_id     160 non-null    object 
 2   success           160 non-null    bool   
 3   response          160 non-null    object 
 4   valid             128 non-null    object 
 5   tool_chain        160 non-null    object 
 6   numeric_response  97 non-null     float64
 7   all_numbers       160 non-null    object 
dtypes: bool(1), float64(1), object(6)
memory usage: 9.0+ KB


In [307]:
# stats = (
#     df_filtered.groupby("experiment_id")["numeric_response"]
#       .agg(["count", "mean", "std", "var", "min", "max"])
# )

# # add range as max-min
# stats["range"] = stats["max"] - stats["min"]

# stats

In [308]:
df.groupby("experiment_id").size()

experiment_id
1-1    10
1-2    10
1-3    10
1-4    10
1-5    10
2-1    10
3-1    10
3-2    10
4-1    10
4-2    10
4-3    10
4-4    10
5-1    10
5-2    10
5-3    10
5-4    10
dtype: int64

In [309]:
df.groupby("experiment_id")["numeric_response"].std()

experiment_id
1-1    3.783018e+01
1-2    2.366667e+01
1-3    3.522562e+01
1-4    5.621388e+00
1-5    0.000000e+00
2-1             NaN
3-1    1.732412e+01
3-2             NaN
4-1    0.000000e+00
4-2    9.600307e+02
4-3    9.798425e+02
4-4    5.355579e+02
5-1             NaN
5-2             NaN
5-3             NaN
5-4    1.745211e+14
Name: numeric_response, dtype: float64

### Normalise responses to select-the-corpus questions (3-1, 3-2)

In [310]:
# normalised response will contain the same as numeric_response for numeric questions 
# but also corpus slugs for 'which corpus' questions
df['normalised_response'] = df['numeric_response'].astype('string')
df['normalised_response'] = df['normalised_response'].str.replace('.0$', '', regex=True)

In [311]:
df['normalised_response']

0        25
1       103
2      <NA>
3      <NA>
4        14
       ... 
155    <NA>
156      14
157    <NA>
158      39
159      20
Name: normalised_response, Length: 160, dtype: string

In [312]:
# this should all be replaced by the corpus slugs 
df[df['experiment_id'].isin(['3-1', '3-2'])]['normalised_response']


12       88
55     <NA>
125    <NA>
129    <NA>
131      39
133      39
138    <NA>
140    <NA>
141      39
142      39
144    <NA>
146      39
147    <NA>
148    <NA>
150    <NA>
151      39
153    <NA>
155    <NA>
157    <NA>
158      39
Name: normalised_response, dtype: string

In [313]:
crpra = DraCorAPI().get_corpora()

In [314]:
slugs = [corpus.name for corpus in crpra]

In [315]:
_pattern = re.compile(r'\b(?:' + '|'.join(slugs) + r')\b', flags=re.IGNORECASE)

def find_last_corpus_slug(text: str) -> str | None:
    """Return the last DraCor slug mentioned as a whole word, or None."""
    last = None
    for match in _pattern.finditer(text):
        last = match.group(0).lower()  # normalize to lowercase slug
    return last

In [316]:
mask = df['experiment_id'].isin(['3-1', '3-2'])
df.loc[mask, 'normalised_response'] = df.loc[mask, 'response'].apply(find_last_corpus_slug)

In [317]:
df[df['experiment_id'].isin(['3-1', '3-2'])][['success','response','normalised_response']]

Unnamed: 0,success,response,normalised_response
12,True,"Based on the corpus data, I need to calculate ...",gersh
55,False,,
125,False,,
129,False,,
131,True,"Looking at the corpus data, I need to calculat...",gersh
133,True,"Looking at the corpora data, I need to calcula...",gersh
138,True,rom,rom
140,False,,
141,True,"Looking at the corpus data, I need to calculat...",gersh
142,True,"Based on the corpus data, I need to calculate ...",gersh


In [318]:
df[(df['experiment_id'].isin(['3-1', '3-2']) & df['success']==True)][['experiment_id','success','response','normalised_response']]

Unnamed: 0,experiment_id,success,response,normalised_response
12,3-1,True,"Based on the corpus data, I need to calculate ...",gersh
131,3-1,True,"Looking at the corpus data, I need to calculat...",gersh
133,3-1,True,"Looking at the corpora data, I need to calcula...",gersh
138,3-1,True,rom,rom
141,3-1,True,"Looking at the corpus data, I need to calculat...",gersh
142,3-1,True,"Based on the corpus data, I need to calculate ...",gersh
146,3-1,True,"Looking at the corpus data, I need to calculat...",gersh
148,3-1,True,tat,tat
151,3-1,True,"Looking at the corpus data, I need to calculat...",gersh
158,3-1,True,"Based on the corpora information, I need to ca...",gersh


In [319]:
#df.to_csv("results/compiled_responses.csv", index=False)

## 3. Loading manually-defined correct responses

In [320]:
correct = pd.read_csv("preliminary_work/compiled_manual_answers.csv")

In [321]:
correct

Unnamed: 0,ID,Correct Answer
0,1-1,103
1,1-2,103
2,1-3,103
3,1-4,103
4,1-5,14
5,2-1,9.19
6,3-1,gersh
7,3-2,fre
8,4-1,Open question
9,4-2,Open question


In [322]:
correct_dict = dict(zip(correct["ID"], correct["Correct Answer"]))

In [323]:
df['correct_answer'] = df['experiment_id'].map(correct_dict)

In [324]:
df.head()

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
0,4-4_12_extracted-info.json,4-4,True,"Based on the data from ItaDraCor, I can now an...",,"[get_corpus, get_corpus_metadata_csv]",25.0,"[10, 30, 15, 20, 15, 16, 20, 25, 17, 18, 20, 3...",25.0,Open question
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103.0,103
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],,Emilia
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],,Marinelli
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,[14],14.0,14


In [325]:
print(df[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

  experiment_id  numeric_response correct_answer
0           4-4              25.0  Open question
1           1-2             103.0            103
2           5-2               NaN         Emilia
3           5-1               NaN      Marinelli
4           1-5              14.0             14
5           1-1             103.0            103
6           5-2               NaN         Emilia
7           1-2             103.0            103
8           4-4               8.0  Open question
9           1-1              29.0            103


In [326]:
df_strictly_numeric = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') ]

In [327]:
df_strictly_numeric.shape

(60, 10)

In [328]:
print(df_strictly_numeric[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

   experiment_id  numeric_response correct_answer
1            1-2             103.0            103
4            1-5              14.0             14
5            1-1             103.0            103
7            1-2             103.0            103
9            1-1              29.0            103
14           1-1             103.0            103
15           1-5              14.0             14
18           1-2             103.0            103
20           1-5              14.0             14
21           1-1              29.0            103


In [329]:
df_strictly_numeric[df_strictly_numeric['experiment_id'] == '1-3']

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
46,1-3_19_extracted-info.json,1-3,True,35,True,"[get_plays_in_corpus_by_title_helper, get_play...",35.0,[35],35,103
57,1-3_18_extracted-info.json,1-3,True,39,True,"[get_plays_in_corpus_by_title_helper, get_play...",39.0,[39],39,103
70,1-3_15_extracted-info.json,1-3,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103
75,1-3_14_extracted-info.json,1-3,True,38,True,"[get_plays_in_corpus_by_title_helper, get_play...",38.0,[38],38,103
81,1-3_17_extracted-info.json,1-3,True,30,True,"[get_plays_in_corpus_by_title_helper, get_play...",30.0,[30],30,103
89,1-3_16_extracted-info.json,1-3,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103
102,1-3_11_extracted-info.json,1-3,True,31,True,"[get_plays_in_corpus_by_title_helper, get_play...",31.0,[31],31,103
113,1-3_12_extracted-info.json,1-3,True,37,True,"[get_plays_in_corpus_by_title_helper, get_play...",37.0,[37],37,103
117,1-3_13_extracted-info.json,1-3,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103
128,1-3_20_extracted-info.json,1-3,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103


## 4. Evaluating correctness of the LLM response (hit & miss table)

In [330]:
def hit_miss(df, with_emojis=True):
    df = df.copy()
    df["is_correct"] = df["normalised_response"] == df["correct_answer"]
    df["iteration"] = df.groupby("experiment_id").cumcount() + 1
    df["question_id"] = df["experiment_id"]

    if with_emojis:
        df["emoji"] = df["is_correct"].map({1: "✅", 0: "❌"})
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="emoji")
            .sort_index()
            .sort_index(axis=1)
        )
    else:
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="is_correct")
            .sort_index()
            .sort_index(axis=1)
            .astype("Int64")
        )

    summary = (
        df.groupby("question_id")["is_correct"]
        .agg(["sum", "count"])
        .assign(
            label=lambda s: s.apply(
                lambda r: f"{r['sum']} correct answers of {r['count']} total answers",
                axis=1,
            )
        )
    )
    hit_table["Summary"] = summary.loc[hit_table.index, "label"]

    overall = summary[["sum", "count"]].sum()
    hit_table.loc["All experiments", :] = None
    hit_table.loc["All experiments", "Summary"] = (
        f"{overall['sum']} correct answers of {overall['count']} total answers"
    )

    return hit_table


The version with "✅" and "❌" emojis:

In [331]:
hit_miss(df_strictly_numeric)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,❌,✅,❌,❌,❌,✅,✅,✅,✅,6 correct answers of 10 total answers
1-2,✅,✅,✅,✅,✅,✅,✅,❌,✅,,8 correct answers of 9 total answers
1-3,❌,❌,✅,❌,❌,✅,❌,❌,✅,✅,4 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,,✅,9 correct answers of 9 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,27 correct answers of 48 total answers


The version with 0 and 1

In [332]:
#hit_table = hit_miss(df_strictly_numeric, with_emojis=False)
#hit_table.to_csv("hit_miss_table.csv")

What's up with 1-4? 

In [333]:
df[df['experiment_id']=='1-4']

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
30,1-4_19_extracted-info.json,1-4,True,97,True,[get_play_metadata],97.0,[97],97,103
38,1-4_18_extracted-info.json,1-4,True,95,True,[get_play_metadata],95.0,[95],95,103
64,1-4_17_extracted-info.json,1-4,True,106,True,[get_play_metadata],106.0,[106],106,103
73,1-4_16_extracted-info.json,1-4,True,91,True,[get_play_metadata],91.0,[91],91,103
86,1-4_15_extracted-info.json,1-4,True,101,True,[get_play_metadata],101.0,[101],101,103
91,1-4_14_extracted-info.json,1-4,True,100,True,[get_play_metadata],100.0,[100],100,103
101,1-4_12_extracted-info.json,1-4,True,94,True,[get_play_metadata],94.0,[94],94,103
104,1-4_13_extracted-info.json,1-4,True,91,True,[get_play_metadata],91.0,[91],91,103
116,1-4_11_extracted-info.json,1-4,True,104,True,[get_play_metadata],104.0,[104],104,103
123,1-4_20_extracted-info.json,1-4,True,105,True,[get_play_metadata],105.0,[105],105,103


## 6. Extend evaluation to 3-1, 3-2

In [334]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') ]

In [335]:
df_precise_answers

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,[14],14,14
5,1-1_14_extracted-info.json,1-1,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103
7,1-2_12_extracted-info.json,1-2,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103
9,1-1_15_extracted-info.json,1-1,True,29,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",29.0,[29],29,103
...,...,...,...,...,...,...,...,...,...,...
153,3-2_14_extracted-info.json,3-2,False,,False,[],,[],,fre
155,3-2_17_extracted-info.json,3-2,False,,False,[],,[],,fre
156,1-5_20_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14
157,3-2_16_extracted-info.json,3-2,False,,False,[],,[],,fre


In [336]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,❌,✅,❌,❌,❌,✅,✅,✅,✅,6 correct answers of 10 total answers
1-2,✅,✅,✅,✅,✅,✅,✅,❌,✅,,8 correct answers of 9 total answers
1-3,❌,❌,✅,❌,❌,✅,❌,❌,✅,✅,4 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,,✅,9 correct answers of 9 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,❌,✅,✅,✅,❌,✅,✅,8 correct answers of 10 total answers
3-2,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,35 correct answers of 58 total answers


In [337]:
df01 = hit_miss(df_precise_answers, with_emojis=False)
df01

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,6 correct answers of 10 total answers
1-2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,,8 correct answers of 9 total answers
1-3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,4 correct answers of 10 total answers
1-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 correct answers of 10 total answers
1-5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,9 correct answers of 9 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,8 correct answers of 10 total answers
3-2,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,35 correct answers of 58 total answers


In [338]:
#df01.to_csv("results/hit_miss_table.csv")

In [339]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,❌,✅,❌,❌,❌,✅,✅,✅,✅,6 correct answers of 10 total answers
1-2,✅,✅,✅,✅,✅,✅,✅,❌,✅,,8 correct answers of 9 total answers
1-3,❌,❌,✅,❌,❌,✅,❌,❌,✅,✅,4 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,,✅,9 correct answers of 9 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,❌,✅,✅,✅,❌,✅,✅,8 correct answers of 10 total answers
3-2,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,35 correct answers of 58 total answers


In [340]:
df_precise_answers.query('success == True and normalised_response != correct_answer')[['filename','normalised_response', 
                                                                                      'correct_answer']]

Unnamed: 0,filename,normalised_response,correct_answer
9,1-1_15_extracted-info.json,29,103
21,1-1_17_extracted-info.json,29,103
28,1-1_11_extracted-info.json,31,103
30,1-4_19_extracted-info.json,97,103
38,1-4_18_extracted-info.json,95,103
46,1-3_19_extracted-info.json,35,103
52,1-1_13_extracted-info.json,30,103
57,1-3_18_extracted-info.json,39,103
64,1-4_17_extracted-info.json,106,103
71,1-2_18_extracted-info.json,32,103


## 7. Extend evaluation to 5- questions

In [341]:
def get_last_token_as_response(somestring):
    if not isinstance(somestring, str):
        return None
    tokens = somestring.strip().split()
    if not tokens:
        return None
    return tokens[-1]

In [342]:
mask = df['experiment_id'].str.startswith('5-')

df.loc[mask, 'normalised_response'] = (
    df.loc[mask, 'response']
      .apply(get_last_token_as_response)
      .str.title()
)

In [343]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
6,5-2_13_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
11,5-1_14_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
16,5-1_17_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
19,5-1_16_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
25,5-2_11_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
31,5-4_18_extracted-info.json,5-4,True,Foppendorf,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",,[],Foppendorf,Foppendorf
32,5-2_17_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
35,5-4_19_extracted-info.json,5-4,True,"Based on the character information, I can see ...",True,"[get_corpora, get_plays_in_corpus_by_title_hel...",523.0,"[3, 3, 895, 42, 4, 0, 5, 0, 523]",Foppendorf,Foppendorf


In [344]:
## hardcoded fix for now
mask = (df["normalised_response"] == "Prinz") & (df["experiment_id"] != "5-3")
df.loc[mask, 'correct_answer'] = 'Prinz'

mask = (df['normalised_response']=='Der_Prinz') & (df["experiment_id"] != "5-3")
df.loc[mask, 'correct_answer'] = 'Der_Prinz'

In [345]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
6,5-2_13_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
11,5-1_14_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
16,5-1_17_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
19,5-1_16_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
25,5-2_11_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
31,5-4_18_extracted-info.json,5-4,True,Foppendorf,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",,[],Foppendorf,Foppendorf
32,5-2_17_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
35,5-4_19_extracted-info.json,5-4,True,"Based on the character information, I can see ...",True,"[get_corpora, get_plays_in_corpus_by_title_hel...",523.0,"[3, 3, 895, 42, 4, 0, 5, 0, 523]",Foppendorf,Foppendorf


In [346]:
df.columns 

Index(['filename', 'experiment_id', 'success', 'response', 'valid',
       'tool_chain', 'numeric_response', 'all_numbers', 'normalised_response',
       'correct_answer'],
      dtype='object')

Output format for saving to csv (put 'response' as the last column because they are very long)

In [347]:
df[['filename', 'experiment_id', 'success', 'valid',
       'tool_chain', 'normalised_response', 
       'correct_answer', 'numeric_response', 'all_numbers', 'response']]

Unnamed: 0,filename,experiment_id,success,valid,tool_chain,normalised_response,correct_answer,numeric_response,all_numbers,response
0,4-4_12_extracted-info.json,4-4,True,,"[get_corpus, get_corpus_metadata_csv]",25,Open question,25.0,"[10, 30, 15, 20, 15, 16, 20, 25, 17, 18, 20, 3...","Based on the data from ItaDraCor, I can now an..."
1,1-2_13_extracted-info.json,1-2,True,False,"[get_plays_in_corpus_by_title_helper, get_play...",103,103,103.0,[103],"Based on the metadata I retrieved, I can count..."
2,5-2_12_extracted-info.json,5-2,True,True,[get_play_characters],Emilia,Emilia,,[],Emilia
3,5-1_15_extracted-info.json,5-1,True,True,[get_play_characters],Marinelli,Marinelli,,[],Marinelli
4,1-5_11_extracted-info.json,1-5,True,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14,14,14.0,[14],14
...,...,...,...,...,...,...,...,...,...,...
155,3-2_17_extracted-info.json,3-2,False,False,[],,fre,,[],
156,1-5_20_extracted-info.json,1-5,True,True,"[get_plays_in_corpus_by_title_helper, get_play...",14,14,14.0,[14],14
157,3-2_16_extracted-info.json,3-2,False,False,[],,fre,,[],
158,3-1_11_extracted-info.json,3-1,True,True,[get_corpora],gersh,gersh,39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...","Based on the corpora information, I need to ca..."


In [348]:
df[['filename', 'experiment_id', 'success', 'valid',
       'tool_chain', 'normalised_response', 
       'correct_answer', 'numeric_response', 'all_numbers', 'response']].to_csv(f"results/compiled_responses_{model}.csv", index=False)

Select only questions with non-open answers

In [349]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') |
                         df['experiment_id'].str.startswith('5-')
                         ]

In [350]:
df_precise_answers = df_precise_answers.copy()

In [351]:
## how many questions do we cover here? should be 12
df_precise_answers['experiment_id'].unique().shape[0]

12

In [352]:
total_non_open = df_precise_answers.shape[0]
total_non_open

120

In [353]:
total_non_open

120

In [354]:
non_open_success = df_precise_answers['success'].sum()
non_open_success

np.int64(98)

In [355]:
non_open_tool_chains = df_precise_answers[df_precise_answers['tool_chain'].str.len()>0].shape[0]
non_open_tool_chains

98

In [356]:
non_open_suc_valid = df_precise_answers[(df_precise_answers['valid']!=False) 
                                          & (df_precise_answers['success']==True)].shape[0]
non_open_suc_valid

91

In [357]:
df_precise_answers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 1 to 158
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   filename             120 non-null    object 
 1   experiment_id        120 non-null    object 
 2   success              120 non-null    bool   
 3   response             120 non-null    object 
 4   valid                120 non-null    object 
 5   tool_chain           120 non-null    object 
 6   numeric_response     65 non-null     float64
 7   all_numbers          120 non-null    object 
 8   normalised_response  98 non-null     string 
 9   correct_answer       120 non-null    object 
dtypes: bool(1), float64(1), object(7), string(1)
memory usage: 9.5+ KB


In [358]:
## basic comparison
df_precise_answers['is_correct_raw'] = df_precise_answers['response'].astype(str) == df_precise_answers['correct_answer'].astype(str)

In [359]:
non_open_correct_raw = df_precise_answers['is_correct_raw'].sum()
non_open_correct_raw

np.int64(43)

In [398]:
# correct ones
df_precise_answers[df_precise_answers['is_correct_raw']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm,run_id
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia,True,True,5-2_12
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli,True,True,5-1_15
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,[14],14,14,True,True,1-5_11
5,1-1_14_extracted-info.json,1-1,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103,True,True,1-1_14
6,5-2_13_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia,True,True,5-2_13
7,1-2_12_extracted-info.json,1-2,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103,True,True,1-2_12
11,5-1_14_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli,True,True,5-1_14
14,1-1_16_extracted-info.json,1-1,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103,True,True,1-1_16
15,1-5_13_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,[14],14,14,True,True,1-5_13
16,5-1_17_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli,True,True,5-1_17


In [361]:
# wrong ones
df_precise_answers[~df_precise_answers['is_correct_raw'] & df_precise_answers['success']==True]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,False
9,1-1_15_extracted-info.json,1-1,True,29,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",29.0,[29],29,103,False
12,3-1_20_extracted-info.json,3-1,True,"Based on the corpus data, I need to calculate ...",True,[get_corpora],88.0,"[375, 30, 12, 50, 762, 40, 19, 5, 140, 8, 17, ...",gersh,gersh,False
20,1-5_12_extracted-info.json,1-5,True,"Perfect! I can see that ""Der Nollhart"" has 14 ...",False,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,"[14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",14,14,False
21,1-1_17_extracted-info.json,1-1,True,29,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",29.0,[29],29,103,False
28,1-1_11_extracted-info.json,1-1,True,31,True,"[get_corpora, get_minimal_data_of_plays_of_cor...",31.0,[31],31,103,False
30,1-4_19_extracted-info.json,1-4,True,97,True,[get_play_metadata],97.0,[97],97,103,False
34,1-2_16_extracted-info.json,1-2,True,"Perfect! I have the character data for ""Danton...",False,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103,False
35,5-4_19_extracted-info.json,5-4,True,"Based on the character information, I can see ...",True,"[get_corpora, get_plays_in_corpus_by_title_hel...",523.0,"[3, 3, 895, 42, 4, 0, 5, 0, 523]",Foppendorf,Foppendorf,False
37,1-5_15_extracted-info.json,1-5,True,"Perfect! I can see from the metadata that ""Der...",False,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",14,14,False


In [362]:
check_norm = df_precise_answers['normalised_response'].astype(str) == df_precise_answers['correct_answer'].astype(str)
df_precise_answers['is_correct_norm'] = check_norm
non_open_correct_norm = df_precise_answers['is_correct_norm'].sum()
non_open_correct_norm

np.int64(75)

In [363]:
# mismatch of the normalised answer with the correct on (so, REALLY wrong)
df_precise_answers[~df_precise_answers['is_correct_norm']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm
9,1-1_15_extracted-info.json,1-1,True,29,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",29.0,[29],29,103,False,False
21,1-1_17_extracted-info.json,1-1,True,29,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",29.0,[29],29,103,False,False
28,1-1_11_extracted-info.json,1-1,True,31,True,"[get_corpora, get_minimal_data_of_plays_of_cor...",31.0,[31],31,103,False,False
30,1-4_19_extracted-info.json,1-4,True,97,True,[get_play_metadata],97.0,[97],97,103,False,False
38,1-4_18_extracted-info.json,1-4,True,95,True,[get_play_metadata],95.0,[95],95,103,False,False
46,1-3_19_extracted-info.json,1-3,True,35,True,"[get_plays_in_corpus_by_title_helper, get_play...",35.0,[35],35,103,False,False
52,1-1_13_extracted-info.json,1-1,True,30,True,"[get_plays_in_corpus_by_title_helper, get_play...",30.0,[30],30,103,False,False
55,3-2_20_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
57,1-3_18_extracted-info.json,1-3,True,39,True,"[get_plays_in_corpus_by_title_helper, get_play...",39.0,[39],39,103,False,False
64,1-4_17_extracted-info.json,1-4,True,106,True,[get_play_metadata],106.0,[106],106,103,False,False


In [364]:
data = dict(
    number=[total_non_open, non_open_success, 
            non_open_tool_chains, non_open_suc_valid, 
            non_open_correct_raw, non_open_correct_norm
            ],
    stage=["Total attempts (non-open Q)", "Total success (got response)", 
           "Total Tool Chain Uses", "Valid Responses",
           "Correct answers (direct match)", "Correct answers (normalised match)"
           ])

# color_discrete_map={
#         "Total attempts": "#636EFA",
#         "Total success (got response)": "#00CC96",
#         "Total Tool Chain Uses": "#AB63FA",
#         "Valid Responses (or open questions)": "#FFA15A"
#     }

fig = px.funnel(data, x='number', y='stage', title=model.title(), 
                #color="stage", 
                #color_discrete_map=color_discrete_map
                )
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.write_image(f"results/{model}_results_funnel.png")
fig.show()

## 8. Add toolchain evaluation

Get toolchain validation data into a separate df

In [365]:
# Path to the uploaded files
path = f"results_validated/{model}/*.json" 

rows = []

for file in glob.glob(path):
    with open(file, "r") as f:
        data = json.load(f)

    filename = os.path.basename(file)
    
    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Run ID is always the first part before the first "_"
    run_id = filename.split("_validated")[0]  # e.g. "1-1_17"

    # Extract the `response` field (if missing, set to None)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)
    absurd_tool_ratio = data.get("absurd_tool_ratio", None)
    tool_path_length_difference = data.get("tool_path_length_difference", None)
    tool_error_rate = data.get("tool_error_rate", None)
    overall_error_rate = tool_error_rate.get("overall_error_rate")

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "run_id": run_id,
        "absurd_tool_ratio": absurd_tool_ratio,
        "overall_error_rate": overall_error_rate,
        "tool_path_length_difference": tool_path_length_difference,
        "success": success,
        #"response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df_tool_chains = pd.DataFrame(rows)
df_tool_chains

Unnamed: 0,filename,experiment_id,run_id,absurd_tool_ratio,overall_error_rate,tool_path_length_difference,success,valid,tool_chain
0,1-2_16_validated-tools.json,1-2,1-2_16,0.0,0.5,2,True,False,"[get_corpora, get_plays_in_corpus_by_title_hel..."
1,1-5_12_validated-tools.json,1-5,1-5_12,0.0,0.5,2,True,False,"[get_corpora, get_plays_in_corpus_by_title_hel..."
2,1-2_13_validated-tools.json,1-2,1-2_13,0.0,1.0,1,True,False,"[get_plays_in_corpus_by_title_helper, get_play..."
3,5-1_15_validated-tools.json,5-1,5-1_15,0.0,0.0,0,True,True,[get_play_characters]
4,1-5_17_validated-tools.json,1-5,1-5_17,0.0,0.5,2,True,True,"[get_corpora, get_plays_in_corpus_by_title_hel..."
...,...,...,...,...,...,...,...,...,...
125,1-5_14_validated-tools.json,1-5,1-5_14,0.0,1.0,1,True,True,"[get_plays_in_corpus_by_title_helper, get_play..."
126,1-2_15_validated-tools.json,1-2,1-2_15,0.0,0.5,2,True,True,"[get_corpora, get_plays_in_corpus_by_title_hel..."
127,1-4_19_validated-tools.json,1-4,1-4_19,0.0,1.0,0,True,True,[get_play_metadata]
128,5-1_13_validated-tools.json,5-1,5-1_13,0.0,0.0,0,True,True,[get_play_characters]


### Tool efficiency averages for the model

In [366]:
df_tool_chains['absurd_tool_ratio'].mean()

np.float64(0.0)

In [367]:
df_tool_chains['absurd_tool_ratio'].value_counts()

absurd_tool_ratio
0.0    130
Name: count, dtype: int64

In [368]:
df_tool_chains['overall_error_rate'].mean()

np.float64(0.4115384615384615)

In [369]:
df_tool_chains['overall_error_rate'].value_counts()

overall_error_rate
0.0    62
1.0    36
0.5    31
2.0     1
Name: count, dtype: int64

In [370]:
df_tool_chains['tool_path_length_difference'].mean()

np.float64(0.6538461538461539)

In [371]:
df_tool_chains['tool_path_length_difference'].value_counts()

tool_path_length_difference
0    77
2    32
1    21
Name: count, dtype: int64

In [372]:
df_tool_chains['tool_path_length_difference']

0      2
1      2
2      1
3      0
4      2
      ..
125    1
126    2
127    0
128    0
129    2
Name: tool_path_length_difference, Length: 130, dtype: int64

In [373]:
df_tool_chains.groupby('experiment_id')['overall_error_rate'].mean()

experiment_id
1-1    0.600000
1-2    0.555556
1-3    1.000000
1-4    1.000000
1-5    0.611111
3-1    0.000000
4-1    1.000000
4-2    0.400000
4-3    0.400000
4-4    0.300000
5-1    0.000000
5-2    0.000000
5-3    0.000000
5-4    0.400000
Name: overall_error_rate, dtype: float64

### Combine with correctness info and analyse correlation

In [374]:
df_precise_answers['run_id'] = df_precise_answers['filename'].apply(lambda x: x.split("_extracted")[0])
df_precise_answers['run_id']

1      1-2_13
2      5-2_12
3      5-1_15
4      1-5_11
5      1-1_14
        ...  
153    3-2_14
155    3-2_17
156    1-5_20
157    3-2_16
158    3-1_11
Name: run_id, Length: 120, dtype: object

In [375]:
to_merge = df_precise_answers[['run_id', 'is_correct_norm', 'is_correct_raw']]
to_merge = to_merge.rename(columns={"is_correct_raw": "is_correct_raw"})
to_merge

Unnamed: 0,run_id,is_correct_norm,is_correct_raw
1,1-2_13,True,False
2,5-2_12,True,True
3,5-1_15,True,True
4,1-5_11,True,True
5,1-1_14,True,True
...,...,...,...
153,3-2_14,False,False
155,3-2_17,False,False
156,1-5_20,True,True
157,3-2_16,False,False


In [376]:
merged = (
    df_tool_chains.merge(
        to_merge,  
        on="run_id",
        how="left",  
        validate="one_to_one"
    )
)

merged.head()

Unnamed: 0,filename,experiment_id,run_id,absurd_tool_ratio,overall_error_rate,tool_path_length_difference,success,valid,tool_chain,is_correct_norm,is_correct_raw
0,1-2_16_validated-tools.json,1-2,1-2_16,0.0,0.5,2,True,False,"[get_corpora, get_plays_in_corpus_by_title_hel...",True,False
1,1-5_12_validated-tools.json,1-5,1-5_12,0.0,0.5,2,True,False,"[get_corpora, get_plays_in_corpus_by_title_hel...",True,False
2,1-2_13_validated-tools.json,1-2,1-2_13,0.0,1.0,1,True,False,"[get_plays_in_corpus_by_title_helper, get_play...",True,False
3,5-1_15_validated-tools.json,5-1,5-1_15,0.0,0.0,0,True,True,[get_play_characters],True,True
4,1-5_17_validated-tools.json,1-5,1-5_17,0.0,0.5,2,True,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",True,True


In [377]:
corr = merged["overall_error_rate"].corr(merged["is_correct_norm"])
print(corr)

-0.5953382985929021


In [378]:
corr = merged["tool_path_length_difference"].corr(merged["is_correct_norm"])
print(corr)

0.10144336898521764


In [379]:
corr = merged["absurd_tool_ratio"].corr(merged["is_correct_norm"])
print(corr)

nan



invalid value encountered in divide


invalid value encountered in divide



In [380]:
df_precise_answers.groupby('experiment_id')['is_correct_norm'].mean()

experiment_id
1-1    0.6
1-2    0.8
1-3    0.4
1-4    0.0
1-5    0.9
2-1    0.0
3-1    0.8
3-2    0.0
5-1    1.0
5-2    1.0
5-3    1.0
5-4    1.0
Name: is_correct_norm, dtype: float64

In [381]:
corr = merged["overall_error_rate"].corr(merged["tool_path_length_difference"])
print(corr)

0.36750779094702535


In [382]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] != True)].shape

(22, 11)

In [383]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] == True)].shape

(40, 11)

In [384]:
merged[(merged['overall_error_rate'] == 0)].shape

(62, 11)

### 2026-01-08 Variance analysis

In [385]:
df_precise_answers.query('experiment_id == "5-2" and success == True')[['filename','response','normalised_response', 'numeric_response', 'correct_answer']]

Unnamed: 0,filename,response,normalised_response,numeric_response,correct_answer
2,5-2_12_extracted-info.json,Emilia,Emilia,,Emilia
6,5-2_13_extracted-info.json,Emilia,Emilia,,Emilia
25,5-2_11_extracted-info.json,Emilia,Emilia,,Emilia
32,5-2_17_extracted-info.json,Emilia,Emilia,,Emilia
43,5-2_16_extracted-info.json,Emilia,Emilia,,Emilia
48,5-2_15_extracted-info.json,Emilia,Emilia,,Emilia
54,5-2_14_extracted-info.json,Emilia,Emilia,,Emilia
66,5-2_19_extracted-info.json,Emilia,Emilia,,Emilia
78,5-2_18_extracted-info.json,Emilia,Emilia,,Emilia
152,5-2_20_extracted-info.json,Emilia,Emilia,,Emilia


In [386]:
print(df_precise_answers.query('experiment_id == "5-1" and success == True')['normalised_response'])

3      Marinelli
11     Marinelli
16     Marinelli
19     Marinelli
36     Marinelli
49     Marinelli
62     Marinelli
110        Prinz
114    Marinelli
145    Marinelli
Name: normalised_response, dtype: string


In [387]:
p = df_precise_answers.query('experiment_id == "5-1" and success == True')['normalised_response'].value_counts(normalize=True)
gini = 1 - np.sum(p**2)
print(gini)

0.17999999999999994


In [388]:
p = df_precise_answers.query('experiment_id == "5-2" and success == True')['normalised_response'].value_counts(normalize=True)
gini = 1 - np.sum(p**2)
print(gini)

0.0


In [389]:
print(df_precise_answers.query('success == True')[['experiment_id', 'normalised_response']].head(10))

   experiment_id normalised_response
1            1-2                 103
2            5-2              Emilia
3            5-1           Marinelli
4            1-5                  14
5            1-1                 103
6            5-2              Emilia
7            1-2                 103
9            1-1                  29
11           5-1           Marinelli
12           3-1               gersh


In [390]:
gini_impurity = (
    df.groupby("experiment_id")["normalised_response"]
      .apply(lambda s: 1 - np.sum(s.value_counts(normalize=True).to_numpy() ** 2))
      .rename("gini_impurity")
      .reset_index()
)

gini_impurity

Unnamed: 0,experiment_id,gini_impurity
0,1-1,0.58
1,1-2,0.197531
2,1-3,0.78
3,1-4,0.88
4,1-5,0.0
5,2-1,1.0
6,3-1,0.34
7,3-2,1.0
8,4-1,0.0
9,4-2,0.84


In [391]:
gini_impurity.to_csv(f"results/{model}_gini_impurity.csv", index=False)

In [392]:
summary = (
    df[df["success"]]
    .groupby("experiment_id")
    .agg(
        n_success=("success", "size"),
        n_unique=("normalised_response", "nunique"),
        gini_impurity=(
            "normalised_response",
            lambda s: 1 - np.sum(s.value_counts(normalize=True).to_numpy() ** 2)
        )
    )
    .reset_index()
)
summary

Unnamed: 0,experiment_id,n_success,n_unique,gini_impurity
0,1-1,10,4,0.58
1,1-2,9,2,0.197531
2,1-3,10,7,0.78
3,1-4,10,9,0.88
4,1-5,9,1,0.0
5,3-1,10,3,0.34
6,4-1,2,1,0.0
7,4-2,10,8,0.84
8,4-3,10,6,0.78
9,4-4,10,7,0.82


In [393]:
summary.to_csv(f"results/{model}_response_diversity_summary.csv", index=False)

### Create updated hit and miss tables

In [394]:
df_precise_answers

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm,run_id
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,False,True,1-2_13
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia,True,True,5-2_12
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli,True,True,5-1_15
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,[14],14,14,True,True,1-5_11
5,1-1_14_extracted-info.json,1-1,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103,True,True,1-1_14
...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,3-2_14_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False,3-2_14
155,3-2_17_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False,3-2_17
156,1-5_20_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True,True,1-5_20
157,3-2_16_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False,3-2_16


In [395]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,❌,✅,❌,❌,❌,✅,✅,✅,✅,6 correct answers of 10 total answers
1-2,✅,✅,✅,✅,✅,✅,✅,❌,✅,,8 correct answers of 9 total answers
1-3,❌,❌,✅,❌,❌,✅,❌,❌,✅,✅,4 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,,✅,9 correct answers of 9 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,❌,✅,✅,✅,❌,✅,✅,8 correct answers of 10 total answers
3-2,,,,,,,,,,,0 correct answers of 0 total answers
5-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
5-2,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers


In [396]:
hit_miss(df_precise_answers, with_emojis=False).to_csv(f"results/hit_miss_table_{model}.csv")