In [852]:
import pandas as pd
import json
import glob
import os
import re
from pathlib import Path
from pydracor import DraCorAPI
import plotly.express as px

## 1. Loading experiments results from JSON files to a single dataframe

In [853]:
EXPERIMENT_PREFIXES = [
    "1-1",
    "1-2",
    "1-3",
    "1-4",
    "1-5",
    "3-1",
    "4-1",
    "4-2",
    "4-3",
    "4-4",
    "5-1",
    "5-2",
    "5-3",
    "5-4",
]

In [854]:
model = 'haiku-4-5' #choose this for haiku-4-5
#model = 'sonnet-4' # choose this for sonnet-4

In [855]:
# Path to the uploaded files
path = f"results/{model}/extracted/*.json" 

rows = []

for file in glob.glob(path):
    with open(file, "r") as f:
        data = json.load(f)

    filename = os.path.basename(file)
    
    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Extract the `response` field (if missing, set to None)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "success": success,
        "response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df = pd.DataFrame(rows)
df


Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain
0,3-2_3_extracted-info.json,3-2,False,,False,[]
1,2-1_3_extracted-info.json,2-1,False,,False,[]
2,4-4_6_extracted-info.json,4-4,True,Now let me calculate the percentage of female ...,,[get_corpus_metadata]
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play..."
4,4-3_10_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays sp...,,[get_corpus_metadata]
...,...,...,...,...,...,...
155,4-3_5_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays fr...,,[get_corpus_metadata]
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata]
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora]
158,4-3_4_extracted-info.json,4-3,True,Based on the metadata from the Swedish Drama C...,,"[get_corpus, get_corpus_metadata]"


In [856]:
df['experiment_id'].value_counts()

experiment_id
3-2    10
2-1    10
4-4    10
1-5    10
4-3    10
1-3    10
5-2    10
4-2    10
5-1    10
1-2    10
1-1    10
5-3    10
4-1    10
1-4    10
5-4    10
3-1    10
Name: count, dtype: int64

### Basic stats on how many successful / failed runs 

(testing for 'request failure', step 1 in Henny's diagram)

In [857]:
total_attempts = df.shape[0]

In [858]:
df['success'].value_counts()

success
True     129
False     31
Name: count, dtype: int64

In [859]:
total_suscesses = df['success'].sum()

In [860]:
df[df['tool_chain'].str.len()>0].shape[0]

129

In [861]:
total_tool_chains = df[df['tool_chain'].str.len()>0].shape[0]

In [862]:
# valid True or null
df[df['valid']!=False].shape[0]

126

In [863]:
not_invalid = df[(df['valid']!=False) & (df['success']==True)].shape[0]

In [864]:
data = dict(
    number=[total_attempts, total_suscesses, total_tool_chains, not_invalid],
    stage=["Total attempts", "Total success (got response)", "Total Tool Chain Uses", "Valid Responses (or open questions)"])

# color_discrete_map={
#         "Total attempts": "#636EFA",
#         "Total success (got response)": "#00CC96",
#         "Total Tool Chain Uses": "#AB63FA",
#         "Valid Responses (or open questions)": "#FFA15A"
#     }

fig = px.funnel(data, x='number', y='stage', title=model.title(), 
                #color="stage", 
                #color_discrete_map=color_discrete_map
                )
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.show()

In [865]:
df[df['success']==True]['valid'].value_counts()

valid
True     96
False     3
Name: count, dtype: int64

## 2. Post-processing LLM responses for better automatic evaluation:

In [866]:
def extract_last_number(s):
    if s is None:
        return None
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    if not nums:
        return None
    return int(nums[-1])  # take the last one

In [867]:
df["numeric_response"] = df["response"].apply(extract_last_number)

In [868]:
df

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response
0,3-2_3_extracted-info.json,3-2,False,,False,[],
1,2-1_3_extracted-info.json,2-1,False,,False,[],
2,4-4_6_extracted-info.json,4-4,True,Now let me calculate the percentage of female ...,,[get_corpus_metadata],19.0
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0
4,4-3_10_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays sp...,,[get_corpus_metadata],40.0
...,...,...,...,...,...,...,...
155,4-3_5_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays fr...,,[get_corpus_metadata],1890.0
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0
158,4-3_4_extracted-info.json,4-3,True,Based on the metadata from the Swedish Drama C...,,"[get_corpus, get_corpus_metadata]",4.0


In [869]:
def extract_all_numbers(s):
    if s is None:
        return []
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    return [int(n) for n in nums]  # convert to ints

df["all_numbers"] = df["response"].apply(extract_all_numbers)

In [870]:
df[df['experiment_id']=='1-5'][['filename', 'response', 'numeric_response', 'all_numbers']]

Unnamed: 0,filename,response,numeric_response,all_numbers
3,1-5_6_extracted-info.json,14,14.0,[14]
7,1-5_10_extracted-info.json,14,14.0,[14]
8,1-5_7_extracted-info.json,14,14.0,[14]
11,1-5_4_extracted-info.json,14,14.0,[14]
19,1-5_5_extracted-info.json,14,14.0,[14]
26,1-5_3_extracted-info.json,14,14.0,[14]
30,1-5_2_extracted-info.json,14,14.0,[14]
39,1-5_1_extracted-info.json,14,14.0,[14]
73,1-5_8_extracted-info.json,14,14.0,[14]
79,1-5_9_extracted-info.json,14,14.0,[14]


In [871]:
df[df['experiment_id']=='1-5'][['response', 'numeric_response', 'all_numbers']]

Unnamed: 0,response,numeric_response,all_numbers
3,14,14.0,[14]
7,14,14.0,[14]
8,14,14.0,[14]
11,14,14.0,[14]
19,14,14.0,[14]
26,14,14.0,[14]
30,14,14.0,[14]
39,14,14.0,[14]
73,14,14.0,[14]
79,14,14.0,[14]


In [872]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   filename          160 non-null    object 
 1   experiment_id     160 non-null    object 
 2   success           160 non-null    bool   
 3   response          160 non-null    object 
 4   valid             130 non-null    object 
 5   tool_chain        160 non-null    object 
 6   numeric_response  98 non-null     float64
 7   all_numbers       160 non-null    object 
dtypes: bool(1), float64(1), object(6)
memory usage: 9.0+ KB


In [873]:
# stats = (
#     df_filtered.groupby("experiment_id")["numeric_response"]
#       .agg(["count", "mean", "std", "var", "min", "max"])
# )

# # add range as max-min
# stats["range"] = stats["max"] - stats["min"]

# stats

In [874]:
df.groupby("experiment_id").size()

experiment_id
1-1    10
1-2    10
1-3    10
1-4    10
1-5    10
2-1    10
3-1    10
3-2    10
4-1    10
4-2    10
4-3    10
4-4    10
5-1    10
5-2    10
5-3    10
5-4    10
dtype: int64

In [875]:
df.groupby("experiment_id")["numeric_response"].std()

experiment_id
1-1      0.000000
1-2      2.213594
1-3      7.774603
1-4     14.267290
1-5      0.000000
2-1           NaN
3-1      0.000000
3-2           NaN
4-1           NaN
4-2    905.015654
4-3    957.099606
4-4    151.638642
5-1    237.384884
5-2           NaN
5-3           NaN
5-4    433.390124
Name: numeric_response, dtype: float64

### Normalise responses to select-the-corpus questions (3-1, 3-2)

In [876]:
# normalised response will contain the same as numeric_response for numeric questions 
# but also corpus slugs for 'which corpus' questions
df['normalised_response'] = df['numeric_response'].astype('string')
df['normalised_response'] = df['normalised_response'].str.replace('.0$', '', regex=True)

In [877]:
df['normalised_response']

0      <NA>
1      <NA>
2        19
3        14
4        40
       ... 
155    1890
156     111
157      39
158       4
159     103
Name: normalised_response, Length: 160, dtype: string

In [878]:
# this should all be replaced by the corpus slugs 
df[df['experiment_id'].isin(['3-1', '3-2'])]['normalised_response']


0      <NA>
6      <NA>
18     <NA>
29     <NA>
33     <NA>
35     <NA>
42     <NA>
48     <NA>
53     <NA>
96       39
102      39
103      39
108      39
116      39
119      39
123      39
125    <NA>
150      39
154      39
157      39
Name: normalised_response, dtype: string

In [879]:
crpra = DraCorAPI().get_corpora()

In [880]:
slugs = [corpus.name for corpus in crpra]

In [881]:
_pattern = re.compile(r'\b(?:' + '|'.join(slugs) + r')\b', flags=re.IGNORECASE)

def find_last_corpus_slug(text: str) -> str | None:
    """Return the last DraCor slug mentioned as a whole word, or None."""
    last = None
    for match in _pattern.finditer(text):
        last = match.group(0).lower()  # normalize to lowercase slug
    return last

In [882]:
mask = df['experiment_id'].isin(['3-1', '3-2'])
df.loc[mask, 'normalised_response'] = df.loc[mask, 'response'].apply(find_last_corpus_slug)

In [883]:
df[df['experiment_id'].isin(['3-1', '3-2'])][['success','response','normalised_response']]

Unnamed: 0,success,response,normalised_response
0,False,,
6,False,,
18,False,,
29,False,,
33,False,,
35,False,,
42,True,Now I need to get metadata for each corpus to ...,
48,False,,
53,False,,
96,True,Now I'll calculate the mean number of characte...,gersh


In [884]:
df[(df['experiment_id'].isin(['3-1', '3-2']) & df['success']==True)][['experiment_id','success','response','normalised_response']]

Unnamed: 0,experiment_id,success,response,normalised_response
42,3-2,True,Now I need to get metadata for each corpus to ...,
96,3-1,True,Now I'll calculate the mean number of characte...,gersh
102,3-1,True,Now I'll calculate the mean number of characte...,gersh
103,3-1,True,Now I'll calculate the mean number of characte...,gersh
108,3-1,True,Now I'll calculate the mean number of characte...,gersh
116,3-1,True,Now I'll calculate the mean number of characte...,gersh
119,3-1,True,Now I'll calculate the mean number of characte...,gersh
123,3-1,True,Now I'll calculate the mean number of characte...,gersh
150,3-1,True,Now I need to calculate the mean number of cha...,gersh
154,3-1,True,Now I'll calculate the mean number of characte...,gersh


In [885]:
#df.to_csv("results/compiled_responses.csv", index=False)

## 3. Loading manually-defined correct responses

In [886]:
correct = pd.read_csv("preliminary_work/compiled_manual_answers.csv")

In [887]:
correct

Unnamed: 0,ID,Correct Answer
0,1-1,103
1,1-2,103
2,1-3,103
3,1-4,103
4,1-5,14
5,2-1,9.19
6,3-1,gersh
7,3-2,fre
8,4-1,Open question
9,4-2,Open question


In [888]:
correct_dict = dict(zip(correct["ID"], correct["Correct Answer"]))

In [889]:
df['correct_answer'] = df['experiment_id'].map(correct_dict)

In [890]:
df.head()

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19
2,4-4_6_extracted-info.json,4-4,True,Now let me calculate the percentage of female ...,,[get_corpus_metadata],19.0,"[18, 1475, 1500, 30, 40, 1500, 1550, 25, 35, 1...",19.0,Open question
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14.0,14
4,4-3_10_extracted-info.json,4-3,True,Based on the Swedish drama corpus (67 plays sp...,,[get_corpus_metadata],40.0,"[67, 1880, 1900, 1880, 1883, 4, 3, 1884, 8, 7,...",40.0,Open question


In [891]:
print(df[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

  experiment_id  numeric_response correct_answer
0           3-2               NaN            fre
1           2-1               NaN           9.19
2           4-4              19.0  Open question
3           1-5              14.0             14
4           4-3              40.0  Open question
5           2-1               NaN           9.19
6           3-2               NaN            fre
7           1-5              14.0             14
8           1-5              14.0             14
9           4-4              18.0  Open question


In [892]:
df_strictly_numeric = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') ]

In [893]:
df_strictly_numeric.shape

(60, 10)

In [894]:
print(df_strictly_numeric[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

   experiment_id  numeric_response correct_answer
1            2-1               NaN           9.19
3            1-5              14.0             14
5            2-1               NaN           9.19
7            1-5              14.0             14
8            1-5              14.0             14
11           1-5              14.0             14
12           1-3             127.0            103
15           2-1               NaN           9.19
19           1-5              14.0             14
21           1-2             103.0            103


In [895]:
df_strictly_numeric[df_strictly_numeric['experiment_id'] == '1-3']

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127,103
24,1-3_8_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
56,1-3_1_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
58,1-3_2_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
63,1-3_3_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
68,1-3_10_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
69,1-3_5_extracted-info.json,1-3,True,111,True,[get_play_metadata],111.0,[111],111,103
77,1-3_4_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
84,1-3_7_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103
93,1-3_6_extracted-info.json,1-3,True,111,True,[get_play_metadata],111.0,[111],111,103


## 4. Evaluating correctness of the LLM response (hit & miss table)

In [896]:
def hit_miss(df, with_emojis=True):
    df = df.copy()
    df["is_correct"] = df["normalised_response"] == df["correct_answer"]
    df["iteration"] = df.groupby("experiment_id").cumcount() + 1

    if with_emojis:
        df["emoji"] = df["is_correct"].map({1: "✅", 0: "❌"})
        hit_table = (
            df.pivot(index="experiment_id", columns="iteration", values="emoji")
            .sort_index()
            .sort_index(axis=1)
        )
    else:
        hit_table = (
            df.pivot(index="experiment_id", columns="iteration", values="is_correct")
            .sort_index()
            .sort_index(axis=1)
            .astype("Int64")
        )

    summary = (
        df.groupby("experiment_id")["is_correct"]
        .agg(["sum", "count"])
        .assign(
            label=lambda s: s.apply(
                lambda r: f"{r['sum']} correct answers of {r['count']} total answers",
                axis=1,
            )
        )
    )
    hit_table["Summary"] = summary.loc[hit_table.index, "label"]

    overall = summary[["sum", "count"]].sum()
    hit_table.loc["All experiments", :] = None
    hit_table.loc["All experiments", "Summary"] = (
        f"{overall['sum']} correct answers of {overall['count']} total answers"
    )

    return hit_table


The version with "✅" and "❌" emojis:

In [897]:
hit_miss(df_strictly_numeric)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
1-2,✅,✅,❌,✅,✅,✅,✅,✅,✅,✅,9 correct answers of 10 total answers
1-3,❌,✅,✅,✅,✅,✅,❌,✅,✅,❌,7 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,36 correct answers of 50 total answers


The version with 0 and 1

In [898]:
#hit_table = hit_miss(df_strictly_numeric, with_emojis=False)
#hit_table.to_csv("hit_miss_table.csv")

What's up with 1-4? 

In [899]:
df[df['experiment_id']=='1-4']

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
82,1-4_10_extracted-info.json,1-4,True,130,True,[get_play_metadata],130.0,[130],130,103
97,1-4_4_extracted-info.json,1-4,True,129,True,[get_play_metadata],129.0,[129],129,103
101,1-4_5_extracted-info.json,1-4,True,101,True,[get_play_metadata],101.0,[101],101,103
104,1-4_6_extracted-info.json,1-4,True,136,True,[get_play_metadata],136.0,[136],136,103
107,1-4_7_extracted-info.json,1-4,True,130,True,[get_play_metadata],130.0,[130],130,103
111,1-4_1_extracted-info.json,1-4,True,141,True,[get_play_metadata],141.0,[141],141,103
120,1-4_3_extracted-info.json,1-4,True,105,True,[get_play_metadata],105.0,[105],105,103
122,1-4_2_extracted-info.json,1-4,True,106,True,[get_play_metadata],106.0,[106],106,103
151,1-4_8_extracted-info.json,1-4,True,"Looking at the metadata, the characters array ...",False,[get_play_metadata],121.0,[121],121,103
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0,[111],111,103


## 6. Extend evaluation to 3-1, 3-2

In [900]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') ]

In [901]:
df_precise_answers

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19
6,3-2_2_extracted-info.json,3-2,False,,False,[],,[],,fre
...,...,...,...,...,...,...,...,...,...,...
153,1-2_5_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103
154,3-1_10_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh
156,1-4_9_extracted-info.json,1-4,True,111,True,[get_play_metadata],111.0,[111],111,103
157,3-1_8_extracted-info.json,3-1,True,Now I'll calculate the mean number of characte...,True,[get_corpora],39.0,"[375, 30, 12, 5, 762, 40, 19, 5, 140, 8, 17, 5...",gersh,gersh


In [902]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
1-2,✅,✅,❌,✅,✅,✅,✅,✅,✅,✅,9 correct answers of 10 total answers
1-3,❌,✅,✅,✅,✅,✅,❌,✅,✅,❌,7 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
3-2,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,46 correct answers of 60 total answers


In [903]:
df01 = hit_miss(df_precise_answers, with_emojis=False)
df01

iteration,1,2,3,4,5,6,7,8,9,10,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 correct answers of 10 total answers
1-2,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9 correct answers of 10 total answers
1-3,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,7 correct answers of 10 total answers
1-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 correct answers of 10 total answers
1-5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 correct answers of 10 total answers
3-2,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,46 correct answers of 60 total answers


In [904]:
#df01.to_csv("results/hit_miss_table.csv")

In [905]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
1-2,✅,✅,❌,✅,✅,✅,✅,✅,✅,✅,9 correct answers of 10 total answers
1-3,❌,✅,✅,✅,✅,✅,❌,✅,✅,❌,7 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,0 correct answers of 10 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
2-1,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,10 correct answers of 10 total answers
3-2,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,46 correct answers of 60 total answers


In [906]:
df_precise_answers.query('success == True and normalised_response != correct_answer')[['filename','normalised_response', 
                                                                                      'correct_answer']]

Unnamed: 0,filename,normalised_response,correct_answer
12,1-3_9_extracted-info.json,127,103
69,1-3_5_extracted-info.json,111,103
82,1-4_10_extracted-info.json,130,103
93,1-3_6_extracted-info.json,111,103
97,1-4_4_extracted-info.json,129,103
100,1-2_8_extracted-info.json,96,103
101,1-4_5_extracted-info.json,101,103
104,1-4_6_extracted-info.json,136,103
107,1-4_7_extracted-info.json,130,103
111,1-4_1_extracted-info.json,141,103


## 7. Extend evaluation to 5- questions

In [907]:
def get_last_token_as_response(somestring):
    if not isinstance(somestring, str):
        return None
    tokens = somestring.strip().split()
    if not tokens:
        return None
    return tokens[-1]

In [908]:
mask = df['experiment_id'].str.startswith('5-')

df.loc[mask, 'normalised_response'] = (
    df.loc[mask, 'response']
      .apply(get_last_token_as_response)
      .str.title()
)

In [909]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
13,5-2_10_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
17,5-1_9_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli
22,5-1_8_extracted-info.json,5-1,True,"Based on the network metrics, the most importa...",True,"[get_play_metadata, get_play_metrics]",9.0,"[9, 9]",Marinelli,Marinelli
27,5-1_10_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli
37,5-3_8_extracted-info.json,5-3,False,,False,[],,[],,Marinelli
44,5-3_9_extracted-info.json,5-3,True,"Looking at the character data, I need to count...",True,[get_spoken_text_by_characters],,[],Prinz,Marinelli
46,5-3_4_extracted-info.json,5-3,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
49,5-3_5_extracted-info.json,5-3,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli
51,5-1_1_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_characters, get_play_metrics]",6.0,"[9, 0, 449, 9, 0, 449, 0, 247, 8, 0, 467, 0, 7...",Marinelli,Marinelli
57,5-1_2_extracted-info.json,5-1,True,Marinelli,True,"[get_play_characters, get_play_metrics]",,[],Marinelli,Marinelli


In [910]:
## hardcoded fix for now
mask = (df["normalised_response"] == "Prinz") & (df["experiment_id"] != "5-3")
df.loc[mask, 'correct_answer'] = 'Prinz'

mask = (df['normalised_response']=='Der_Prinz') & (df["experiment_id"] != "5-3")
df.loc[mask, 'correct_answer'] = 'Der_Prinz'

In [911]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
13,5-2_10_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
17,5-1_9_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli
22,5-1_8_extracted-info.json,5-1,True,"Based on the network metrics, the most importa...",True,"[get_play_metadata, get_play_metrics]",9.0,"[9, 9]",Marinelli,Marinelli
27,5-1_10_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli
37,5-3_8_extracted-info.json,5-3,False,,False,[],,[],,Marinelli
44,5-3_9_extracted-info.json,5-3,True,"Looking at the character data, I need to count...",True,[get_spoken_text_by_characters],,[],Prinz,Marinelli
46,5-3_4_extracted-info.json,5-3,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
49,5-3_5_extracted-info.json,5-3,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli
51,5-1_1_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_characters, get_play_metrics]",6.0,"[9, 0, 449, 9, 0, 449, 0, 247, 8, 0, 467, 0, 7...",Marinelli,Marinelli
57,5-1_2_extracted-info.json,5-1,True,Marinelli,True,"[get_play_characters, get_play_metrics]",,[],Marinelli,Marinelli


In [912]:
#df.to_csv("results/compiled_responses.csv", index=False)

In [913]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') |
                         df['experiment_id'].str.startswith('5-')
                         ]

In [914]:
df_precise_answers = df_precise_answers.copy()

In [915]:
## how many questions do we cover here? should be 12
df_precise_answers['experiment_id'].unique().shape[0]

12

In [916]:
total_non_open = df_precise_answers.shape[0]
total_non_open

120

In [917]:
total_non_open

120

In [918]:
non_open_success = df_precise_answers['success'].sum()
non_open_success

99

In [919]:
non_open_tool_chains = df_precise_answers[df_precise_answers['tool_chain'].str.len()>0].shape[0]
non_open_tool_chains

99

In [920]:
non_open_suc_valid = df_precise_answers[(df_precise_answers['valid']!=False) 
                                          & (df_precise_answers['success']==True)].shape[0]
non_open_suc_valid

96

In [921]:
df_precise_answers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 159
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   filename             120 non-null    object 
 1   experiment_id        120 non-null    object 
 2   success              120 non-null    bool   
 3   response             120 non-null    object 
 4   valid                120 non-null    object 
 5   tool_chain           120 non-null    object 
 6   numeric_response     68 non-null     float64
 7   all_numbers          120 non-null    object 
 8   normalised_response  98 non-null     string 
 9   correct_answer       120 non-null    object 
dtypes: bool(1), float64(1), object(7), string(1)
memory usage: 9.5+ KB


In [922]:
## basic comparison
df_precise_answers['is_correct_raw'] = df_precise_answers['response'].astype(str) == df_precise_answers['correct_answer'].astype(str)

In [923]:
non_open_correct_raw = df_precise_answers['is_correct_raw'].sum()
non_open_correct_raw

58

In [924]:
# correct ones
df_precise_answers[df_precise_answers['is_correct_raw']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
7,1-5_10_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
8,1-5_7_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
11,1-5_4_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
13,5-2_10_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia,True
17,5-1_9_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli,True
19,1-5_5_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
21,1-2_10_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,True
24,1-3_8_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103,True
26,1-5_3_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True


In [925]:
# wrong ones
df_precise_answers[~df_precise_answers['is_correct_raw'] & df_precise_answers['success']==True]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127,103,False
22,5-1_8_extracted-info.json,5-1,True,"Based on the network metrics, the most importa...",True,"[get_play_metadata, get_play_metrics]",9.0,"[9, 9]",Marinelli,Marinelli,False
42,3-2_5_extracted-info.json,3-2,True,Now I need to get metadata for each corpus to ...,True,"[get_corpora, get_corpus_metadata, get_corpus_...",,[],,fre,False
44,5-3_9_extracted-info.json,5-3,True,"Looking at the character data, I need to count...",True,[get_spoken_text_by_characters],,[],Prinz,Marinelli,False
51,5-1_1_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_characters, get_play_metrics]",6.0,"[9, 0, 449, 9, 0, 449, 0, 247, 8, 0, 467, 0, 7...",Marinelli,Marinelli,False
60,5-3_6_extracted-info.json,5-3,True,marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli,False
64,5-1_3_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_metadata, get_play_metrics]",383.0,"[9, 0, 247, 9, 0, 247, 0, 449, 8, 0, 467, 0, 3...",Marinelli,Marinelli,False
67,5-1_5_extracted-info.json,5-1,True,Der Prinz,True,[get_play_characters],,[],Prinz,Prinz,False
69,1-3_5_extracted-info.json,1-3,True,111,True,[get_play_metadata],111.0,[111],111,103,False
75,5-3_10_extracted-info.json,5-3,True,"Looking at the spoken text data, I need to cou...",True,"[get_play_metadata, get_spoken_text_by_charact...",,[],Prinz,Marinelli,False


In [926]:
check_norm = df_precise_answers['normalised_response'].astype(str) == df_precise_answers['correct_answer'].astype(str)
df_precise_answers['is_correct_norm'] = check_norm
non_open_correct_norm = df_precise_answers['is_correct_norm'].sum()
non_open_correct_norm

81

In [927]:
# mismatch of the normalised answer with the correct on (so, REALLY wrong)
df_precise_answers[~df_precise_answers['is_correct_norm']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is_correct_raw,is_correct_norm
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
6,3-2_2_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127,103,False,False
15,2-1_1_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
18,3-2_1_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
28,2-1_6_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
29,3-2_6_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
33,3-2_7_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False


In [928]:
data = dict(
    number=[total_non_open, non_open_success, 
            non_open_tool_chains, non_open_suc_valid, 
            non_open_correct_raw, non_open_correct_norm
            ],
    stage=["Total attempts (non-open Q)", "Total success (got response)", 
           "Total Tool Chain Uses", "Valid Responses",
           "Correct answers (direct match)", "Correct answers (normalised match)"
           ])

# color_discrete_map={
#         "Total attempts": "#636EFA",
#         "Total success (got response)": "#00CC96",
#         "Total Tool Chain Uses": "#AB63FA",
#         "Valid Responses (or open questions)": "#FFA15A"
#     }

fig = px.funnel(data, x='number', y='stage', title=model.title(), 
                #color="stage", 
                #color_discrete_map=color_discrete_map
                )
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.write_image(f"results/{model}_results_funnel.png")
fig.show()

## 8. Add toolchain evaluation

Get toolchain validation data into a separate df

In [929]:
# Path to the uploaded files
path = f"results_validated/{model}/*.json" 

rows = []

for file in glob.glob(path):
    with open(file, "r") as f:
        data = json.load(f)

    filename = os.path.basename(file)
    
    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Run ID is always the first part before the first "_"
    run_id = filename.split("_validated")[0]  # e.g. "1-1_17"

    # Extract the `response` field (if missing, set to None)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)
    absurd_tool_ratio = data.get("absurd_tool_ratio", None)
    tool_path_length_difference = data.get("tool_path_length_difference", None)
    tool_error_rate = data.get("tool_error_rate", None)
    overall_error_rate = tool_error_rate.get("overall_error_rate")

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "run_id": run_id,
        "absurd_tool_ratio": absurd_tool_ratio,
        "overall_error_rate": overall_error_rate,
        "tool_path_length_difference": tool_path_length_difference,
        "success": success,
        #"response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df_tool_chains = pd.DataFrame(rows)
df_tool_chains

Unnamed: 0,filename,experiment_id,run_id,absurd_tool_ratio,overall_error_rate,tool_path_length_difference,success,valid,tool_chain
0,3-1_9_validated-tools.json,3-1,3-1_9,0.0,0.0,0,True,True,[get_corpora]
1,5-1_10_validated-tools.json,5-1,5-1_10,0.0,0.5,1,True,True,"[get_play_metadata, get_play_metrics]"
2,5-2_5_validated-tools.json,5-2,5-2_5,0.0,0.0,0,True,True,[get_play_characters]
3,4-4_5_validated-tools.json,4-4,4-4_5,0.0,0.0,0,True,,[get_corpus_metadata]
4,1-2_5_validated-tools.json,1-2,1-2_5,0.0,1.0,1,True,True,"[get_plays_in_corpus_by_title_helper, get_play..."
...,...,...,...,...,...,...,...,...,...
124,5-2_3_validated-tools.json,5-2,5-2_3,0.0,0.0,0,True,True,[get_play_characters]
125,1-5_9_validated-tools.json,1-5,1-5_9,0.0,1.0,1,True,True,"[get_plays_in_corpus_by_title_helper, get_play..."
126,5-2_6_validated-tools.json,5-2,5-2_6,0.0,0.0,0,True,True,[get_play_characters]
127,4-4_6_validated-tools.json,4-4,4-4_6,0.0,0.0,0,True,,[get_corpus_metadata]


### Tool efficiency averages for the model

In [930]:
df_tool_chains['absurd_tool_ratio'].mean()

0.033591731266149866

In [931]:
df_tool_chains['absurd_tool_ratio'].value_counts()

absurd_tool_ratio
0.000000    121
0.500000      6
1.000000      1
0.333333      1
Name: count, dtype: int64

In [932]:
df_tool_chains['overall_error_rate'].mean()

0.6293425208153891

In [933]:
df_tool_chains['overall_error_rate'].value_counts()

overall_error_rate
1.000000    56
0.000000    49
0.500000    14
2.000000     9
0.185185     1
Name: count, dtype: int64

In [934]:
df_tool_chains['tool_path_length_difference'].mean()

0.689922480620155

In [935]:
df_tool_chains['tool_path_length_difference'].value_counts()

tool_path_length_difference
1    59
0    56
2    13
4     1
Name: count, dtype: int64

In [936]:
df_tool_chains['tool_path_length_difference']

0      0
1      1
2      0
3      0
4      1
      ..
124    0
125    1
126    0
127    0
128    2
Name: tool_path_length_difference, Length: 129, dtype: int64

In [937]:
df_tool_chains.groupby('experiment_id')['overall_error_rate'].mean()

experiment_id
1-1    1.000000
1-2    1.000000
1-3    1.700000
1-4    1.000000
1-5    1.000000
3-1    0.000000
3-2    0.185185
4-2    0.700000
4-3    0.400000
4-4    0.400000
5-1    0.400000
5-2    0.000000
5-3    0.437500
5-4    0.150000
Name: overall_error_rate, dtype: float64

### Combine with correctness info and analyse correlation

In [938]:
df_precise_answers['run_id'] = df_precise_answers['filename'].apply(lambda x: x.split("_extracted")[0])
df_precise_answers['run_id']

0       3-2_3
1       2-1_3
3       1-5_6
5       2-1_2
6       3-2_2
        ...  
153     1-2_5
154    3-1_10
156     1-4_9
157     3-1_8
159     1-2_4
Name: run_id, Length: 120, dtype: object

In [939]:
to_merge = df_precise_answers[['run_id', 'is_correct_norm', 'is_correct_raw']]
to_merge = to_merge.rename(columns={"is_correct_raw": "is_correct_raw"})
to_merge

Unnamed: 0,run_id,is_correct_norm,is_correct_raw
0,3-2_3,False,False
1,2-1_3,False,False
3,1-5_6,True,True
5,2-1_2,False,False
6,3-2_2,False,False
...,...,...,...
153,1-2_5,True,True
154,3-1_10,True,False
156,1-4_9,False,False
157,3-1_8,True,False


In [940]:
merged = (
    df_tool_chains.merge(
        to_merge,  
        on="run_id",
        how="left",  
        validate="one_to_one"
    )
)

merged.head()

Unnamed: 0,filename,experiment_id,run_id,absurd_tool_ratio,overall_error_rate,tool_path_length_difference,success,valid,tool_chain,is_correct_norm,is_correct_raw
0,3-1_9_validated-tools.json,3-1,3-1_9,0.0,0.0,0,True,True,[get_corpora],True,False
1,5-1_10_validated-tools.json,5-1,5-1_10,0.0,0.5,1,True,True,"[get_play_metadata, get_play_metrics]",True,True
2,5-2_5_validated-tools.json,5-2,5-2_5,0.0,0.0,0,True,True,[get_play_characters],True,True
3,4-4_5_validated-tools.json,4-4,4-4_5,0.0,0.0,0,True,,[get_corpus_metadata],,
4,1-2_5_validated-tools.json,1-2,1-2_5,0.0,1.0,1,True,True,"[get_plays_in_corpus_by_title_helper, get_play...",True,True


In [941]:
corr = merged["overall_error_rate"].corr(merged["is_correct_norm"])
print(corr)

-0.21331392291153842


In [942]:
corr = merged["tool_path_length_difference"].corr(merged["is_correct_norm"])
print(corr)

0.23171454978810413


In [943]:
corr = merged["absurd_tool_ratio"].corr(merged["is_correct_norm"])
print(corr)

-0.24509803921568618


In [944]:
df_precise_answers.groupby('experiment_id')['is_correct_norm'].mean()

experiment_id
1-1    1.0
1-2    0.9
1-3    0.7
1-4    0.0
1-5    1.0
2-1    0.0
3-1    1.0
3-2    0.0
5-1    1.0
5-2    1.0
5-3    0.5
5-4    1.0
Name: is_correct_norm, dtype: float64

In [945]:
corr = merged["overall_error_rate"].corr(merged["tool_path_length_difference"])
print(corr)

0.5969207786641283


In [946]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] != True)].shape

(17, 11)

In [947]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] == True)].shape

(32, 11)

In [948]:
merged[(merged['overall_error_rate'] == 0)].shape

(49, 11)