In [None]:
import pandas as pd
import json
import glob
import os
import re
from pathlib import Path
from pydracor import DraCorAPI
import plotly.express as px

## 1. Loading experiments results from JSON files to a single dataframe

In [None]:
EXPERIMENT_PREFIXES = [
    "1-1",
    "1-2",
    "1-3",
    "1-4",
    "1-5",
    "3-1",
    "4-1",
    "4-2",
    "4-3",
    "4-4",
    "5-1",
    "5-2",
    "5-3",
    "5-4",
]

In [279]:
# Path to the uploaded files
#path = "results/haiku-4-5/extracted/*.json" choose this for haiku-4-5
path = "results/sonnet-4/extracted/*.json"  # choose this for sonnet-4

rows = []

for file in glob.glob(path):
    with open(file, "r") as f:
        data = json.load(f)

    filename = os.path.basename(file)
    
    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Extract the `response` field (if missing, set to None)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "success": success,
        "response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df = pd.DataFrame(rows)
df


Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain
0,4-4_12_extracted-info.json,4-4,True,"Based on the data from ItaDraCor, I can now an...",,"[get_corpus, get_corpus_metadata_csv]"
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play..."
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters]
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters]
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel..."
...,...,...,...,...,...,...
172,1-5_20_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play..."
173,3-2_16_extracted-info.json,3-2,False,,False,[]
174,4-3_4_extracted-info.json,4-3,True,"Based on the Swedish drama corpus data, here's...",,[get_corpus_metadata_csv]
175,3-1_11_extracted-info.json,3-1,True,"Based on the corpora information, I need to ca...",True,[get_corpora]


In [280]:
df['experiment_id'].value_counts()

experiment_id
4-4    13
1-5    13
1-1    12
3-1    12
5-2    11
4-3    11
4-1    11
1-4    11
5-4    11
3-2    11
5-3    11
1-2    10
5-1    10
4-2    10
1-3    10
2-1    10
Name: count, dtype: int64

### Basic stats on how many successful / failed runs 

(testing for 'request failure', step 1 in Henny's diagram)

In [281]:
total_attempts = df.shape[0]

In [282]:
df['success'].value_counts()

success
True     147
False     30
Name: count, dtype: int64

In [283]:
total_suscesses = df['success'].sum()

In [284]:
df[df['tool_chain'].str.len()>0].shape[0]

147

In [285]:
total_tool_chains = df[df['tool_chain'].str.len()>0].shape[0]

In [286]:
# valid True or null
df[df['valid']!=False].shape[0]

138

In [287]:
not_invalid = df[(df['valid']!=False) & (df['success']==True)].shape[0]

In [288]:
data = dict(
    number=[total_attempts, total_suscesses, total_tool_chains, not_invalid],
    stage=["Total attempts", "Total success (got response)", "Total Tool Chain Uses", "Valid Responses (or open questions)"])

# color_discrete_map={
#         "Total attempts": "#636EFA",
#         "Total success (got response)": "#00CC96",
#         "Total Tool Chain Uses": "#AB63FA",
#         "Valid Responses (or open questions)": "#FFA15A"
#     }

fig = px.funnel(data, x='number', y='stage', title="Sonnet-4", 
                #color="stage", 
                #color_discrete_map=color_discrete_map
                )
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.show()

In [289]:
df[df['success']==True]['valid'].value_counts()

valid
True     101
False      9
Name: count, dtype: int64

## 2. Post-processing LLM responses for better automatic evaluation:

In [290]:
def extract_last_number(s):
    if s is None:
        return None
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    if not nums:
        return None
    return int(nums[-1])  # take the last one

In [291]:
df["numeric_response"] = df["response"].apply(extract_last_number)

In [292]:
df

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response
0,4-4_12_extracted-info.json,4-4,True,"Based on the data from ItaDraCor, I can now an...",,"[get_corpus, get_corpus_metadata_csv]",25.0
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0
...,...,...,...,...,...,...,...
172,1-5_20_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0
173,3-2_16_extracted-info.json,3-2,False,,False,[],
174,4-3_4_extracted-info.json,4-3,True,"Based on the Swedish drama corpus data, here's...",,[get_corpus_metadata_csv],55.0
175,3-1_11_extracted-info.json,3-1,True,"Based on the corpora information, I need to ca...",True,[get_corpora],39.0


In [293]:
def extract_all_numbers(s):
    if s is None:
        return []
    # find all groups of digits
    nums = re.findall(r"\d+", str(s))
    return [int(n) for n in nums]  # convert to ints

df["all_numbers"] = df["response"].apply(extract_all_numbers)

In [294]:
df[df['experiment_id']=='1-5'][['filename', 'response', 'numeric_response', 'all_numbers']]

Unnamed: 0,filename,response,numeric_response,all_numbers
4,1-5_11_extracted-info.json,14,14.0,[14]
15,1-5_13_extracted-info.json,14,14.0,[14]
17,1-5_4_extracted-info.json,"Perfect! I can count the characters in ""Der No...",14.0,"[14, 14]"
22,1-5_12_extracted-info.json,"Perfect! I can see that ""Der Nollhart"" has 14 ...",14.0,"[14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
31,1-5_14_extracted-info.json,14,14.0,[14]
34,1-5_3_extracted-info.json,14,14.0,[14]
41,1-5_15_extracted-info.json,"Perfect! I can see from the metadata that ""Der...",14.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
56,1-5_1_extracted-info.json,"Perfect! I can count the characters in ""Der No...",14.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
58,1-5_16_extracted-info.json,14,14.0,[14]
67,1-5_17_extracted-info.json,14,14.0,[14]


In [295]:
df[df['experiment_id']=='1-5'][['response', 'numeric_response', 'all_numbers']]

Unnamed: 0,response,numeric_response,all_numbers
4,14,14.0,[14]
15,14,14.0,[14]
17,"Perfect! I can count the characters in ""Der No...",14.0,"[14, 14]"
22,"Perfect! I can see that ""Der Nollhart"" has 14 ...",14.0,"[14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13..."
31,14,14.0,[14]
34,14,14.0,[14]
41,"Perfect! I can see from the metadata that ""Der...",14.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
56,"Perfect! I can count the characters in ""Der No...",14.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
58,14,14.0,[14]
67,14,14.0,[14]


In [296]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   filename          177 non-null    object 
 1   experiment_id     177 non-null    object 
 2   success           177 non-null    bool   
 3   response          177 non-null    object 
 4   valid             140 non-null    object 
 5   tool_chain        177 non-null    object 
 6   numeric_response  111 non-null    float64
 7   all_numbers       177 non-null    object 
dtypes: bool(1), float64(1), object(6)
memory usage: 10.0+ KB


In [297]:
# stats = (
#     df_filtered.groupby("experiment_id")["numeric_response"]
#       .agg(["count", "mean", "std", "var", "min", "max"])
# )

# # add range as max-min
# stats["range"] = stats["max"] - stats["min"]

# stats

In [298]:
df.groupby("experiment_id").size()

experiment_id
1-1    12
1-2    10
1-3    10
1-4    11
1-5    13
2-1    10
3-1    12
3-2    11
4-1    11
4-2    10
4-3    11
4-4    13
5-1    10
5-2    11
5-3    11
5-4    11
dtype: int64

In [299]:
df.groupby("experiment_id")["numeric_response"].std()

experiment_id
1-1    3.812778e+01
1-2    2.366667e+01
1-3    3.522562e+01
1-4    5.334280e+00
1-5    0.000000e+00
2-1             NaN
3-1    1.549516e+01
3-2             NaN
4-1    0.000000e+00
4-2    9.600307e+02
4-3    9.688247e+02
4-4    4.741526e+02
5-1             NaN
5-2             NaN
5-3             NaN
5-4    1.745211e+14
Name: numeric_response, dtype: float64

### Normalise responses to select-the-corpus questions (3-1, 3-2)

In [300]:
# normalised response will contain the same as numeric_response for numeric questions 
# but also corpus slugs for 'which corpus' questions
df['normalised_response'] = df['numeric_response'].astype('string')
df['normalised_response'] = df['normalised_response'].str.replace('.0$', '', regex=True)

In [301]:
df['normalised_response']

0        25
1       103
2      <NA>
3      <NA>
4        14
       ... 
172      14
173    <NA>
174      55
175      39
176      20
Name: normalised_response, Length: 177, dtype: string

In [302]:
# this should all be replaced by the corpus slugs 
df[df['experiment_id'].isin(['3-1', '3-2'])]['normalised_response']


12       88
47      424
62     <NA>
133      39
136      39
137    <NA>
141    <NA>
144      39
146      39
153    <NA>
155    <NA>
156      39
157      39
159    <NA>
161      39
162    <NA>
164    <NA>
166    <NA>
167      39
169    <NA>
171    <NA>
173    <NA>
175      39
Name: normalised_response, dtype: string

In [303]:
crpra = DraCorAPI().get_corpora()

In [304]:
slugs = [corpus.name for corpus in crpra]

In [305]:
_pattern = re.compile(r'\b(?:' + '|'.join(slugs) + r')\b', flags=re.IGNORECASE)

def find_last_corpus_slug(text: str) -> str | None:
    """Return the last DraCor slug mentioned as a whole word, or None."""
    last = None
    for match in _pattern.finditer(text):
        last = match.group(0).lower()  # normalize to lowercase slug
    return last

In [306]:
mask = df['experiment_id'].isin(['3-1', '3-2'])
df.loc[mask, 'normalised_response'] = df.loc[mask, 'response'].apply(find_last_corpus_slug)

In [307]:
df[df['experiment_id'].isin(['3-1', '3-2'])][['success','response','normalised_response']]

Unnamed: 0,success,response,normalised_response
12,True,"Based on the corpus data, I need to calculate ...",gersh
47,True,Based on the data I've collected from multiple...,ger
62,False,,
133,True,"Looking at the data, I need to calculate the m...",gersh
136,True,"Based on the corpus data, I need to calculate ...",gersh
137,False,,
141,False,,
144,True,"Looking at the corpus data, I need to calculat...",gersh
146,True,"Looking at the corpora data, I need to calcula...",gersh
153,True,rom,rom


In [308]:
df[(df['experiment_id'].isin(['3-1', '3-2']) & df['success']==True)][['experiment_id','success','response','normalised_response']]

Unnamed: 0,experiment_id,success,response,normalised_response
12,3-1,True,"Based on the corpus data, I need to calculate ...",gersh
47,3-2,True,Based on the data I've collected from multiple...,ger
133,3-1,True,"Looking at the data, I need to calculate the m...",gersh
136,3-1,True,"Based on the corpus data, I need to calculate ...",gersh
144,3-1,True,"Looking at the corpus data, I need to calculat...",gersh
146,3-1,True,"Looking at the corpora data, I need to calcula...",gersh
153,3-1,True,rom,rom
156,3-1,True,"Looking at the corpus data, I need to calculat...",gersh
157,3-1,True,"Based on the corpus data, I need to calculate ...",gersh
161,3-1,True,"Looking at the corpus data, I need to calculat...",gersh


In [309]:
#df.to_csv("results/compiled_responses.csv", index=False)

## 3. Loading manually-defined correct responses

In [310]:
correct = pd.read_csv("preliminary_work/compiled_manual_answers.csv")

In [311]:
correct

Unnamed: 0,ID,Correct Answer
0,1-1,103
1,1-2,103
2,1-3,103
3,1-4,103
4,1-5,14
5,2-1,9.19
6,3-1,gersh
7,3-2,fre
8,4-1,Open question
9,4-2,Open question


In [312]:
correct_dict = dict(zip(correct["ID"], correct["Correct Answer"]))

In [313]:
df['correct_answer'] = df['experiment_id'].map(correct_dict)

In [314]:
df.head()

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
0,4-4_12_extracted-info.json,4-4,True,"Based on the data from ItaDraCor, I can now an...",,"[get_corpus, get_corpus_metadata_csv]",25.0,"[10, 30, 15, 20, 15, 16, 20, 25, 17, 18, 20, 3...",25.0,Open question
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103.0,103
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],,Emilia
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],,Marinelli
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,[14],14.0,14


In [315]:
print(df[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

  experiment_id  numeric_response correct_answer
0           4-4              25.0  Open question
1           1-2             103.0            103
2           5-2               NaN         Emilia
3           5-1               NaN      Marinelli
4           1-5              14.0             14
5           1-1             103.0            103
6           5-2               NaN         Emilia
7           1-2             103.0            103
8           4-4               8.0  Open question
9           1-1              29.0            103


In [316]:
df_strictly_numeric = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') ]

In [317]:
df_strictly_numeric.shape

(66, 10)

In [318]:
print(df_strictly_numeric[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

   experiment_id  numeric_response correct_answer
1            1-2             103.0            103
4            1-5              14.0             14
5            1-1             103.0            103
7            1-2             103.0            103
9            1-1              29.0            103
14           1-1             103.0            103
15           1-5              14.0             14
17           1-5              14.0             14
20           1-2             103.0            103
22           1-5              14.0             14


## 4. Evaluating correctness of the LLM response (hit & miss table)

In [319]:
def hit_miss(df, with_emojis=True):
    df = df.copy()
    df["is_correct"] = df["normalised_response"] == df["correct_answer"]
    df["iteration"] = df.groupby("experiment_id").cumcount() + 1

    if with_emojis:
        df["emoji"] = df["is_correct"].map({1: "✅", 0: "❌"})
        hit_table = (
            df.pivot(index="experiment_id", columns="iteration", values="emoji")
            .sort_index()
            .sort_index(axis=1)
        )
    else:
        hit_table = (
            df.pivot(index="experiment_id", columns="iteration", values="is_correct")
            .sort_index()
            .sort_index(axis=1)
            .astype("Int64")
        )

    summary = (
        df.groupby("experiment_id")["is_correct"]
        .agg(["sum", "count"])
        .assign(
            label=lambda s: s.apply(
                lambda r: f"{r['sum']} correct answers of {r['count']} total answers",
                axis=1,
            )
        )
    )
    hit_table["Summary"] = summary.loc[hit_table.index, "label"]

    overall = summary[["sum", "count"]].sum()
    hit_table.loc["All experiments", :] = None
    hit_table.loc["All experiments", "Summary"] = (
        f"{overall['sum']} correct answers of {overall['count']} total answers"
    )

    return hit_table


The version with "✅" and "❌" emojis:

In [320]:
hit_miss(df_strictly_numeric)

iteration,1,2,3,4,5,6,7,8,9,10,11,12,13,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1-1,✅,❌,✅,❌,❌,❌,✅,❌,✅,✅,❌,✅,,6 correct answers of 12 total answers
1-2,✅,✅,✅,✅,✅,✅,✅,❌,✅,,,,,8 correct answers of 9 total answers
1-3,❌,❌,✅,❌,❌,✅,❌,❌,✅,✅,,,,4 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,,,0 correct answers of 11 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,,✅,12 correct answers of 12 total answers
2-1,,,,,,,,,,,,,,0 correct answers of 0 total answers
All experiments,,,,,,,,,,,,,,30 correct answers of 54 total answers


The version with 0 and 1

In [321]:
#hit_table = hit_miss(df_strictly_numeric, with_emojis=False)
#hit_table.to_csv("hit_miss_table.csv")

What's up with 1-4? 

In [322]:
df[df['experiment_id']=='1-4']

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
32,1-4_19_extracted-info.json,1-4,True,97,True,[get_play_metadata],97.0,[97],97,103
42,1-4_18_extracted-info.json,1-4,True,95,True,[get_play_metadata],95.0,[95],95,103
71,1-4_17_extracted-info.json,1-4,True,106,True,[get_play_metadata],106.0,[106],106,103
80,1-4_16_extracted-info.json,1-4,True,91,True,[get_play_metadata],91.0,[91],91,103
94,1-4_15_extracted-info.json,1-4,True,101,True,[get_play_metadata],101.0,[101],101,103
99,1-4_14_extracted-info.json,1-4,True,100,True,[get_play_metadata],100.0,[100],100,103
110,1-4_12_extracted-info.json,1-4,True,94,True,[get_play_metadata],94.0,[94],94,103
113,1-4_13_extracted-info.json,1-4,True,91,True,[get_play_metadata],91.0,[91],91,103
125,1-4_11_extracted-info.json,1-4,True,104,True,[get_play_metadata],104.0,[104],104,103
134,1-4_20_extracted-info.json,1-4,True,105,True,[get_play_metadata],105.0,[105],105,103


## 6. Extend evaluation to 3-1, 3-2

In [323]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') ]

In [324]:
df_precise_answers

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
1,1-2_13_extracted-info.json,1-2,True,"Based on the metadata I retrieved, I can count...",False,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103
4,1-5_11_extracted-info.json,1-5,True,14,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",14.0,[14],14,14
5,1-1_14_extracted-info.json,1-1,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103
7,1-2_12_extracted-info.json,1-2,True,103,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",103.0,[103],103,103
9,1-1_15_extracted-info.json,1-1,True,29,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",29.0,[29],29,103
...,...,...,...,...,...,...,...,...,...,...
169,3-2_14_extracted-info.json,3-2,False,,False,[],,[],,fre
171,3-2_17_extracted-info.json,3-2,False,,False,[],,[],,fre
172,1-5_20_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14
173,3-2_16_extracted-info.json,3-2,False,,False,[],,[],,fre


In [325]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,11,12,13,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1-1,✅,❌,✅,❌,❌,❌,✅,❌,✅,✅,❌,✅,,6 correct answers of 12 total answers
1-2,✅,✅,✅,✅,✅,✅,✅,❌,✅,,,,,8 correct answers of 9 total answers
1-3,❌,❌,✅,❌,❌,✅,❌,❌,✅,✅,,,,4 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,,,0 correct answers of 11 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,,✅,12 correct answers of 12 total answers
2-1,,,,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,✅,✅,❌,✅,✅,✅,❌,✅,✅,,10 correct answers of 12 total answers
3-2,❌,,,,,,,,,,,,,0 correct answers of 1 total answers
All experiments,,,,,,,,,,,,,,40 correct answers of 67 total answers


In [326]:
df01 = hit_miss(df_precise_answers, with_emojis=False)
df01

iteration,1,2,3,4,5,6,7,8,9,10,11,12,13,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1-1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,,6 correct answers of 12 total answers
1-2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,,,,,8 correct answers of 9 total answers
1-3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,,,,4 correct answers of 10 total answers
1-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0 correct answers of 11 total answers
1-5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,12 correct answers of 12 total answers
2-1,,,,,,,,,,,,,,0 correct answers of 0 total answers
3-1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,,10 correct answers of 12 total answers
3-2,0.0,,,,,,,,,,,,,0 correct answers of 1 total answers
All experiments,,,,,,,,,,,,,,40 correct answers of 67 total answers


In [327]:
#df01.to_csv("results/hit_miss_table.csv")

In [328]:
hit_miss(df_precise_answers)

iteration,1,2,3,4,5,6,7,8,9,10,11,12,13,Summary
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1-1,✅,❌,✅,❌,❌,❌,✅,❌,✅,✅,❌,✅,,6 correct answers of 12 total answers
1-2,✅,✅,✅,✅,✅,✅,✅,❌,✅,,,,,8 correct answers of 9 total answers
1-3,❌,❌,✅,❌,❌,✅,❌,❌,✅,✅,,,,4 correct answers of 10 total answers
1-4,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,❌,,,0 correct answers of 11 total answers
1-5,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,✅,,✅,12 correct answers of 12 total answers
2-1,,,,,,,,,,,,,,0 correct answers of 0 total answers
3-1,✅,✅,✅,✅,✅,❌,✅,✅,✅,❌,✅,✅,,10 correct answers of 12 total answers
3-2,❌,,,,,,,,,,,,,0 correct answers of 1 total answers
All experiments,,,,,,,,,,,,,,40 correct answers of 67 total answers


In [329]:
df_precise_answers.query('success == True and normalised_response != correct_answer')[['filename','normalised_response', 
                                                                                      'correct_answer']]

Unnamed: 0,filename,normalised_response,correct_answer
9,1-1_15_extracted-info.json,29,103
23,1-1_17_extracted-info.json,29,103
30,1-1_11_extracted-info.json,31,103
32,1-4_19_extracted-info.json,97,103
42,1-4_18_extracted-info.json,95,103
47,3-2_7_extracted-info.json,ger,fre
51,1-3_19_extracted-info.json,35,103
59,1-1_13_extracted-info.json,30,103
64,1-3_18_extracted-info.json,39,103
71,1-4_17_extracted-info.json,106,103


## 7. Extend evaluation to 5- questions

In [330]:
def get_last_token_as_response(somestring):
    if not isinstance(somestring, str):
        return None
    tokens = somestring.strip().split()
    if not tokens:
        return None
    return tokens[-1]

In [331]:
mask = df['experiment_id'].str.startswith('5-')

df.loc[mask, 'normalised_response'] = (
    df.loc[mask, 'response']
      .apply(get_last_token_as_response)
      .str.title()
)

In [332]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
6,5-2_13_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
11,5-1_14_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
18,5-1_17_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
21,5-1_16_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
27,5-2_11_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
35,5-4_18_extracted-info.json,5-4,True,Foppendorf,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",,[],Foppendorf,Foppendorf
36,5-2_17_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
39,5-4_19_extracted-info.json,5-4,True,"Based on the character information, I can see ...",True,"[get_corpora, get_plays_in_corpus_by_title_hel...",523.0,"[3, 3, 895, 42, 4, 0, 5, 0, 523]",Foppendorf,Foppendorf


In [333]:
## hardcoded fix
mask = df['normalised_response']=='Prinz'
df.loc[mask, 'correct_answer'] = 'Prinz'

mask = df['normalised_response']=='Der_Prinz'
df.loc[mask, 'correct_answer'] = 'Der_Prinz'

In [334]:
df[df['experiment_id'].str.startswith('5-')]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer
2,5-2_12_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
3,5-1_15_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
6,5-2_13_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
11,5-1_14_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
18,5-1_17_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
21,5-1_16_extracted-info.json,5-1,True,Marinelli,True,[get_play_characters],,[],Marinelli,Marinelli
27,5-2_11_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
35,5-4_18_extracted-info.json,5-4,True,Foppendorf,True,"[get_corpora, get_plays_in_corpus_by_title_hel...",,[],Foppendorf,Foppendorf
36,5-2_17_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia
39,5-4_19_extracted-info.json,5-4,True,"Based on the character information, I can see ...",True,"[get_corpora, get_plays_in_corpus_by_title_hel...",523.0,"[3, 3, 895, 42, 4, 0, 5, 0, 523]",Foppendorf,Foppendorf


In [340]:
#df.to_csv("results/compiled_responses.csv", index=False)

In [336]:
df_precise_answers = df[df['experiment_id'].str.startswith('1-') | 
                         df['experiment_id'].str.startswith('2-') |
                         df['experiment_id'].str.startswith('3-') |
                         df['experiment_id'].str.startswith('5-')
                         ]

In [337]:
df_precise_answers = df_precise_answers.copy()

In [338]:
## how many questions do we cover here? should be 12
df_precise_answers['experiment_id'].unique().shape[0]

12

In [264]:
total_non_open = df_precise_answers.shape[0]
total_non_open

120

In [265]:
total_non_open

120

In [266]:
non_open_success = df_precise_answers['success'].sum()
non_open_success

99

In [267]:
non_open_tool_chains = df_precise_answers[df_precise_answers['tool_chain'].str.len()>0].shape[0]
non_open_tool_chains

99

In [268]:
non_open_suc_valid = df_precise_answers[(df_precise_answers['valid']!=False) 
                                          & (df_precise_answers['success']==True)].shape[0]
non_open_suc_valid

96

In [269]:
df_precise_answers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 159
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   filename             120 non-null    object 
 1   experiment_id        120 non-null    object 
 2   success              120 non-null    bool   
 3   response             120 non-null    object 
 4   valid                120 non-null    object 
 5   tool_chain           120 non-null    object 
 6   numeric_response     68 non-null     float64
 7   all_numbers          120 non-null    object 
 8   normalised_response  98 non-null     string 
 9   correct_answer       120 non-null    object 
dtypes: bool(1), float64(1), object(7), string(1)
memory usage: 9.5+ KB


In [270]:
df_precise_answers['is correct raw'] = df_precise_answers['response'].astype(str) == df_precise_answers['correct_answer'].astype(str)
non_open_correct_raw = df_precise_answers['is correct raw'].sum()
non_open_correct_raw

58

In [271]:
# correct ones
df_precise_answers[df_precise_answers['is correct raw']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is correct raw
3,1-5_6_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
7,1-5_10_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
8,1-5_7_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
11,1-5_4_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
13,5-2_10_extracted-info.json,5-2,True,Emilia,True,[get_play_characters],,[],Emilia,Emilia,True
17,5-1_9_extracted-info.json,5-1,True,Marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli,True
19,1-5_5_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True
21,1-2_10_extracted-info.json,1-2,True,103,True,"[get_plays_in_corpus_by_title_helper, get_play...",103.0,[103],103,103,True
24,1-3_8_extracted-info.json,1-3,True,103,True,"[get_play_metadata, get_plays_in_corpus_by_tit...",103.0,[103],103,103,True
26,1-5_3_extracted-info.json,1-5,True,14,True,"[get_plays_in_corpus_by_title_helper, get_play...",14.0,[14],14,14,True


In [277]:
# wrong ones
df_precise_answers[~df_precise_answers['is correct raw'] & df_precise_answers['success']==True]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is correct raw,is_correct_norm
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127,103,False,False
22,5-1_8_extracted-info.json,5-1,True,"Based on the network metrics, the most importa...",True,"[get_play_metadata, get_play_metrics]",9.0,"[9, 9]",Marinelli,Marinelli,False,True
42,3-2_5_extracted-info.json,3-2,True,Now I need to get metadata for each corpus to ...,True,"[get_corpora, get_corpus_metadata, get_corpus_...",,[],,fre,False,False
44,5-3_9_extracted-info.json,5-3,True,"Looking at the character data, I need to count...",True,[get_spoken_text_by_characters],,[],Prinz,Prinz,False,True
51,5-1_1_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_characters, get_play_metrics]",6.0,"[9, 0, 449, 9, 0, 449, 0, 247, 8, 0, 467, 0, 7...",Marinelli,Marinelli,False,True
60,5-3_6_extracted-info.json,5-3,True,marinelli,True,"[get_play_metadata, get_play_metrics]",,[],Marinelli,Marinelli,False,True
64,5-1_3_extracted-info.json,5-1,True,"Based on the network metrics, Marinelli has th...",True,"[get_play_metadata, get_play_metrics]",383.0,"[9, 0, 247, 9, 0, 247, 0, 449, 8, 0, 467, 0, 3...",Marinelli,Marinelli,False,True
67,5-1_5_extracted-info.json,5-1,True,Der Prinz,True,[get_play_characters],,[],Prinz,Prinz,False,True
69,1-3_5_extracted-info.json,1-3,True,111,True,[get_play_metadata],111.0,[111],111,103,False,False
75,5-3_10_extracted-info.json,5-3,True,"Looking at the spoken text data, I need to cou...",True,"[get_play_metadata, get_spoken_text_by_charact...",,[],Prinz,Prinz,False,True


In [273]:
check_norm = df_precise_answers['normalised_response'].astype(str) == df_precise_answers['correct_answer'].astype(str)
df_precise_answers['is_correct_norm'] = check_norm
non_open_correct_norm = df_precise_answers['is_correct_norm'].sum()
non_open_correct_norm

84

In [274]:
# mismatch of the normalised answer with the correct on (so, REALLY wrong)
df_precise_answers[~df_precise_answers['is_correct_norm']]

Unnamed: 0,filename,experiment_id,success,response,valid,tool_chain,numeric_response,all_numbers,normalised_response,correct_answer,is correct raw,is_correct_norm
0,3-2_3_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
1,2-1_3_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
5,2-1_2_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
6,3-2_2_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
12,1-3_9_extracted-info.json,1-3,True,127,True,[get_play_metadata],127.0,[127],127.0,103,False,False
15,2-1_1_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
18,3-2_1_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
28,2-1_6_extracted-info.json,2-1,False,,False,[],,[],,9.19,False,False
29,3-2_6_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False
33,3-2_7_extracted-info.json,3-2,False,,False,[],,[],,fre,False,False


In [276]:
data = dict(
    number=[total_non_open, non_open_success, 
            non_open_tool_chains, non_open_suc_valid, 
            non_open_correct_raw, non_open_correct_norm
            ],
    stage=["Total attempts (non-open Q)", "Total success (got response)", 
           "Total Tool Chain Uses", "Valid Responses",
           "Correct answers (direct match)", "Correct answers (normalised match)"
           ])

# color_discrete_map={
#         "Total attempts": "#636EFA",
#         "Total success (got response)": "#00CC96",
#         "Total Tool Chain Uses": "#AB63FA",
#         "Valid Responses (or open questions)": "#FFA15A"
#     }

fig = px.funnel(data, x='number', y='stage', title="Haiku-4-5", 
                #color="stage", 
                #color_discrete_map=color_discrete_map
                )
fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
fig.show()