In [None]:
import ast
import pandas as pd
import numpy as np
import json
import glob
import os
import re
from pathlib import Path
from pydracor import DraCorAPI
import plotly.express as px

## 1. Loading experiment results from JSON files into a single dataframe

In [None]:
# Experiment ID prefixes, in the same "<group>-<question>" form as the IDs that are
# parsed from the result filenames further below.
# NOTE(review): this list is not referenced in the cells visible here, and "2-1" / "3-2",
# which later cells query, are absent. Confirm whether it should be used as a filter,
# extended, or removed.
EXPERIMENT_PREFIXES = [
    "1-1",
    "1-2",
    "1-3",
    "1-4",
    "1-5",
    "3-1",
    "4-1",
    "4-2",
    "4-3",
    "4-4",
    "5-1",
    "5-2",
    "5-3",
    "5-4",
]

In [None]:
#model = 'haiku-4-5' #choose this for haiku-4-5
model = 'sonnet-4' # choose this for sonnet-4

In [None]:
# Collect one row per extracted-result JSON file of the selected model.
path = f"../results/{model}/extracted/*.json"

# glob returns files in filesystem order. Sort them so that the row order (and the
# per-experiment iteration numbering derived from it later) is reproducible.
# The key splits the basename into text and digit runs so that run numbers sort
# numerically ("1-1_2" before "1-1_10").
files = sorted(
    glob.glob(path),
    key=lambda p: [
        int(tok) if tok.isdecimal() else tok
        for tok in re.split(r"(\d+)", os.path.basename(p))
    ],
)
if not files:
    # Fail here with a clear message instead of a KeyError on the empty frame later on.
    raise FileNotFoundError(
        f"No result files match {path!r}; check `model` and the working directory."
    )

rows = []

for file in files:
    # JSON is read as UTF-8 explicitly so the result does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    filename = os.path.basename(file)

    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Missing keys fall back to None (response, tool_chain) or False (success, valid)
    response = data.get("response", None)
    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "success": success,
        "response": response,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df = pd.DataFrame(rows)
df


In [None]:
df['experiment_id'].value_counts()

### Basic stats on how many successful / failed runs 

(testing for 'request failure', step 1 in Henny's diagram)

In [None]:
total_attempts = df.shape[0]

In [None]:
df['success'].value_counts()

In [None]:
total_suscesses = df['success'].sum()

In [None]:
df[df['tool_chain'].str.len()>0].shape[0]

In [None]:
total_tool_chains = df[df['tool_chain'].str.len()>0].shape[0]

In [None]:
# valid True or null
df[df['valid']!=False].shape[0]

In [None]:
not_invalid = df[(df['valid']!=False) & (df['success']==True)].shape[0]

In [None]:
# Funnel over all runs, drawn with plotly's default colours.
data = {
    "number": [total_attempts, total_suscesses, total_tool_chains, not_invalid],
    "stage": [
        "Total attempts",
        "Total success (got response)",
        "Total Tool Chain Uses",
        "Valid Responses (or open questions)",
    ],
}

fig = px.funnel(data, x="number", y="stage", title=model.title())
fig.update_layout(title_font_size=14, title_x=0.5)  # small, centred title
fig.show()

In [None]:
# Same funnel as above, drawn in a single fixed colour.
data = {
    "number": [total_attempts, total_suscesses, total_tool_chains, not_invalid],
    "stage": [
        "Total attempts",
        "Total success (got response)",
        "Total Tool Chain Uses",
        "Valid Responses (or open questions)",
    ],
}

fig = px.funnel(
    data,
    x="number",
    y="stage",
    title=model.title(),
    color_discrete_sequence=["#1f2448"],
)
fig.update_layout(title_font_size=14, title_x=0.5)  # small, centred title
fig.show()

In [None]:
# with color settings
# data = dict(
#     number=[total_attempts, total_suscesses, total_tool_chains, not_invalid],
#     stage=["Total attempts", "Total success (got response)", "Total Tool Chain Uses", "Valid Responses (or open questions)"])

# color_discrete_map={
        
#          "Total attempts": "#1f2448",
#          "Total success (got response)": "#fc9432",
#          "Total Tool Chain Uses": "#1f2448",
#          "Valid Responses (or open questions)": "#008a0e"
         
#      }

# fig = px.funnel(data, x='number', y='stage', title=model.title(), 
#                 color="stage", 
#                 color_discrete_map=color_discrete_map
#                 )
# fig.update_layout(title_font_size=14, title_x=0.5)  # optional tweaks
# fig.show()

In [None]:
df[df['success']==True]['valid'].value_counts()

## 2. Post-processing LLM responses for better automatic evaluation:

In [None]:
def extract_last_number(s):
    """Return the last integer that appears in ``s``, or None if there is none.

    ``s`` is converted with ``str()`` before searching. A digit group written with
    comma thousands separators ("1,234") is read as one number (1234); previously
    only its final group (234) was returned. Signs and decimal points are not
    interpreted.
    """
    if s is None:
        return None
    # First alternative: 1-3 digits followed by one or more ",ddd" groups that do not
    # run on into further digits. Second alternative: any plain run of digits.
    nums = re.findall(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+", str(s))
    if not nums:
        return None
    return int(nums[-1].replace(",", ""))  # take the last one

In [None]:
df["numeric_response"] = df["response"].apply(extract_last_number)

In [None]:
df

In [None]:
df

In [None]:
def extract_all_numbers(s):
    """Return every integer that appears in ``s``, in order of appearance.

    ``s`` is converted with ``str()`` before searching; None gives an empty list.
    A digit group written with comma thousands separators ("1,234") is read as one
    number (1234) rather than as two separate numbers. Signs and decimal points are
    not interpreted.
    """
    if s is None:
        return []
    # First alternative: 1-3 digits followed by one or more ",ddd" groups that do not
    # run on into further digits. Second alternative: any plain run of digits.
    nums = re.findall(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+", str(s))
    return [int(n.replace(",", "")) for n in nums]

df["all_numbers"] = df["response"].apply(extract_all_numbers)

In [None]:
df[df['experiment_id']=='1-5'][['filename', 'response', 'numeric_response', 'all_numbers']]

In [None]:
df[df['experiment_id']=='1-5'][['response', 'numeric_response', 'all_numbers']]

In [None]:
df.info()

In [None]:
# stats = (
#     df_filtered.groupby("experiment_id")["numeric_response"]
#       .agg(["count", "mean", "std", "var", "min", "max"])
# )

# # add range as max-min
# stats["range"] = stats["max"] - stats["min"]

# stats

In [None]:
df.groupby("experiment_id").size()

In [None]:
df.groupby("experiment_id")["numeric_response"].std()

### Normalise responses to select-the-corpus questions (3-1, 3-2)

In [None]:
# normalised_response holds the numeric answer as text for numeric questions;
# for 'which corpus' questions it is overwritten with corpus slugs further below.
df['normalised_response'] = df['numeric_response'].astype('string')
# When some rows have no number the column is float, so values render as "12.0".
# Strip only a literal trailing ".0". The previous unescaped pattern '.0$' also matched
# "any character followed by 0", turning e.g. "10" into "" whenever the column was
# integer-typed.
df['normalised_response'] = df['normalised_response'].str.replace(r'\.0$', '', regex=True)

In [None]:
df['normalised_response']

In [None]:
# this should all be replaced by the corpus slugs 
df[df['experiment_id'].isin(['3-1', '3-2'])]['normalised_response']


In [None]:
crpra = DraCorAPI().get_corpora()

In [None]:
slugs = [corpus.name for corpus in crpra]

In [None]:
# Alternation of all known corpus slugs, matched as whole words, case-insensitively.
# Each slug is escaped so that a regex metacharacter in a name is matched literally.
_pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(slug) for slug in slugs) + r')\b',
    flags=re.IGNORECASE,
)

def find_last_corpus_slug(text: str) -> str | None:
    """Return the last DraCor slug mentioned as a whole word, or None.

    The match is case-insensitive and the result is lower-cased. Anything that is
    not a string (e.g. the None stored for runs without a response) gives None
    instead of raising a TypeError.
    """
    if not isinstance(text, str):
        return None
    last = None
    for match in _pattern.finditer(text):
        last = match.group(0).lower()  # normalize to lowercase slug
    return last

In [None]:
mask = df['experiment_id'].isin(['3-1', '3-2'])
df.loc[mask, 'normalised_response'] = df.loc[mask, 'response'].apply(find_last_corpus_slug)

In [None]:
df[df['experiment_id'].isin(['3-1', '3-2'])][['success','response','normalised_response']]

In [None]:
# Each condition is parenthesised: `&` binds tighter than `==`, so the previous form
# evaluated `(isin(...) & success) == True` and only gave the intended rows by coincidence.
df[df['experiment_id'].isin(['3-1', '3-2']) & (df['success'] == True)][['experiment_id','success','response','normalised_response']]

## 3. Loading manually-defined correct responses

In [None]:
correct = pd.read_csv("../curated_data/autoEva_correct-answers.csv")

In [None]:
correct

In [None]:
print(correct)

In [None]:
correct_dict = dict(zip(correct["ID"], correct["Correct Answer"]))

In [None]:
df['correct_answer'] = df['experiment_id'].map(correct_dict)

In [None]:
df.head()

In [None]:
print(df[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

In [None]:
# Question groups 1-* and 2-*: keep the rows whose ID starts with one of these prefixes.
df_strictly_numeric = df[df['experiment_id'].str[:2].isin(['1-', '2-'])]

In [None]:
df_strictly_numeric.shape

In [None]:
print(df_strictly_numeric[['experiment_id', 'numeric_response', 'correct_answer']].head(10))

In [None]:
df_strictly_numeric[df_strictly_numeric['experiment_id'] == '1-3']

## 4. Evaluating correctness of the LLM response (hit & miss table)

In [None]:
def hit_miss(df, with_emojis=True):
    """Build a hit-and-miss table: one row per question, one column per run.

    NOTE: a revised ``hit_miss`` is defined in a later cell of this notebook and
    replaces this one as soon as that cell has been run.

    Parameters
    ----------
    df : DataFrame with ``experiment_id``, ``normalised_response`` and
        ``correct_answer`` columns. It is copied; the caller's frame is not modified.
    with_emojis : if True the cells hold "✅"/"❌", otherwise the ``is_correct``
        values cast to nullable integers.

    Returns
    -------
    DataFrame indexed by question id, with one column per iteration, a "Summary"
    column, and a final "All experiments" row that carries only the overall summary.
    """
    df = df.copy()
    # A run counts as correct only on exact equality of the two columns.
    df["is_correct"] = df["normalised_response"] == df["correct_answer"]
    # 1-based position of each run within its experiment, in current row order.
    df["iteration"] = df.groupby("experiment_id").cumcount() + 1
    df["question_id"] = df["experiment_id"]

    if with_emojis:
        # NOTE(review): the keys are 1/0 while the column is boolean; confirm that
        # the lookup matches True/False as intended.
        df["emoji"] = df["is_correct"].map({1: "✅", 0: "❌"})
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="emoji")
            .sort_index()
            .sort_index(axis=1)
        )
    else:
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="is_correct")
            .sort_index()
            .sort_index(axis=1)
            .astype("Int64")
        )

    # Per-question totals: number of correct runs and number of runs.
    summary = (
        df.groupby("question_id")["is_correct"]
        .agg(["sum", "count"])
        .assign(
            label=lambda s: s.apply(
                lambda r: f"{r['sum']} correct answers of {r['count']} total answers",
                axis=1,
            )
        )
    )
    hit_table["Summary"] = summary.loc[hit_table.index, "label"]

    # Extra bottom row: iteration cells are left empty, only the summary is filled in.
    overall = summary[["sum", "count"]].sum()
    hit_table.loc["All experiments", :] = None
    hit_table.loc["All experiments", "Summary"] = (
        f"{overall['sum']} correct answers of {overall['count']} total answers"
    )

    return hit_table


In [None]:
## Revised hit_miss function to handle multiple acceptable answers
## (`ast` and `pd` come from the import cell at the top of the notebook)

def hit_miss(df, with_emojis=True):
    """Build a hit-and-miss table: one row per question, one column per run.

    Parameters
    ----------
    df : DataFrame with ``experiment_id``, ``normalised_response`` and
        ``correct_answer`` columns and, optionally, a boolean ``success`` column.
        It is copied; the caller's frame is not modified.
    with_emojis : if True the cells hold "✅"/"❌", otherwise 1/0 as nullable
        integers. Runs that produced no answer stay empty in both variants.

    Returns
    -------
    DataFrame indexed by question id, with one column per iteration, a "Summary"
    column ("<correct> correct answers of <answered> total answers") and a final
    "All experiments" row that carries only the overall summary.

    Notes
    -----
    ``correct_answer`` may hold a single value or a list literal such as
    "['ger', 'fre']"; any member of the list is accepted. The first non-null
    ``correct_answer`` of each question is used. Iterations are numbered in the
    current row order of ``df``.
    """
    df = df.copy()

    def _to_answer_set(x):
        """Normalise one correct-answer cell into a set of stripped strings."""
        # Containers are handled before pd.isna(): on a list pd.isna returns an
        # array, whose truth value raises for multi-element input.
        if isinstance(x, (list, tuple, set)):
            return {str(v).strip() for v in x if not pd.isna(v)}

        if pd.isna(x):
            return set()

        if isinstance(x, str):
            s = x.strip()
            if s.startswith("[") and s.endswith("]"):
                try:
                    parsed = ast.literal_eval(s)
                    if isinstance(parsed, (list, tuple, set)):
                        return {str(v).strip() for v in parsed if not pd.isna(v)}
                except (ValueError, SyntaxError):
                    # Not a valid literal: fall through and treat it as plain text.
                    pass
            return {s}

        # A whole-number float (e.g. 42.0 from a column that also holds NaN) must
        # compare equal to the response text "42", not "42.0".
        if isinstance(x, float) and x.is_integer():
            return {str(int(x))}

        return {str(x).strip()}

    # IDs
    df["question_id"] = df["experiment_id"]

    # Define what counts as an "answered" run:
    # - if a boolean 'success' exists, use it
    # - else infer from normalised_response being non-missing
    if "success" in df.columns:
        df["answered"] = df["success"].astype(bool)
    else:
        df["answered"] = ~pd.isna(df["normalised_response"])

    # Precompute acceptable answers per question
    acceptable = (
        df.groupby("question_id")["correct_answer"]
          .first()
          .apply(_to_answer_set)
          .to_dict()
    )

    nan = float("nan")

    def _score(answered, response, question_id):
        """Return 1 for a correct run, 0 for a wrong one, NaN for a non-answer."""
        if not answered:
            return nan  # non-answer stays blank, it is not counted as wrong
        return int(str(response).strip() in acceptable.get(question_id, set()))

    # Explicit numeric scores, so the 0/1 table does not depend on casting an
    # object column of booleans to integers.
    df["score"] = [
        _score(answered, response, question_id)
        for answered, response, question_id in zip(
            df["answered"], df["normalised_response"], df["question_id"]
        )
    ]

    # Iteration numbering stays based on experiment_id (same as before)
    df["iteration"] = df.groupby("experiment_id").cumcount() + 1

    # Build table
    if with_emojis:
        df["emoji"] = [
            nan if pd.isna(score) else ("✅" if score == 1 else "❌")
            for score in df["score"]
        ]
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="emoji")
              .sort_index()
              .sort_index(axis=1)
        )
    else:
        hit_table = (
            df.pivot(index="question_id", columns="iteration", values="score")
              .sort_index()
              .sort_index(axis=1)
              .astype("Int64")  # keeps <NA> as blank in CSV
        )

    # Summary: denominator is ANSWERED runs only (score not NaN)
    summary = df.groupby("question_id")["score"].agg(n_correct="sum", n_answered="count")
    summary["label"] = [
        f"{int(n_correct)} correct answers of {int(n_answered)} total answers"
        for n_correct, n_answered in zip(summary["n_correct"], summary["n_answered"])
    ]

    hit_table["Summary"] = summary.loc[hit_table.index, "label"]

    # Overall: same denominator logic
    overall_correct = int(summary["n_correct"].sum())
    overall_answered = int(summary["n_answered"].sum())

    # Extra bottom row: iteration cells are left empty, only the summary is filled in.
    hit_table.loc["All experiments", :] = None
    hit_table.loc["All experiments", "Summary"] = (
        f"{overall_correct} correct answers of {overall_answered} total answers"
    )

    return hit_table


The version with "✅" and "❌" emojis:

In [None]:
hit_miss(df_strictly_numeric)

The version with 0 and 1

In [None]:
#hit_table = hit_miss(df_strictly_numeric, with_emojis=False)
#hit_table.to_csv("hit_miss_table.csv")

What's up with 1-4? 

In [None]:
df[df['experiment_id']=='1-4']

## 6. Extend evaluation to 3-1, 3-2

In [None]:
# Question groups 1-*, 2-* and 3-*: keep the rows whose ID starts with one of these prefixes.
df_precise_answers = df[df['experiment_id'].str[:2].isin(['1-', '2-', '3-'])]

In [None]:
df_precise_answers

In [None]:
hit_miss(df_precise_answers)

In [None]:
df01 = hit_miss(df_precise_answers, with_emojis=False)
df01

In [None]:
#df01.to_csv("results/hit_miss_table.csv")

In [None]:
hit_miss(df_precise_answers)

In [None]:
df_precise_answers.query('success == True and normalised_response != correct_answer')[['filename','normalised_response', 
                                                                                      'correct_answer']]

## 7. Extend evaluation to 5- questions

In [None]:
def get_last_token_as_response(somestring):
    """Return the last whitespace-separated token of ``somestring``.

    Gives None when the input is not a string or contains no tokens
    (empty or whitespace only).
    """
    if isinstance(somestring, str):
        words = somestring.split()  # split() already ignores surrounding whitespace
        if words:
            return words[-1]
    return None

In [None]:
# Questions 5-*: the normalised answer is the last token of the raw response, lower-cased.
mask = df['experiment_id'].str.startswith('5-')

last_tokens = df.loc[mask, 'response'].apply(get_last_token_as_response)
df.loc[mask, 'normalised_response'] = last_tokens.str.lower()

In [None]:
df[df['experiment_id'].str.startswith('5-')]

In [None]:
df[df['experiment_id'].str.startswith('5-')]

In [None]:
df.columns 

Output format for saving to CSV (the 'response' column is placed last because the responses are very long)

In [None]:
df[['filename', 'experiment_id', 'success', 'valid',
       'tool_chain', 'normalised_response', 
       'correct_answer', 'numeric_response', 'all_numbers', 'response']]

In [None]:
df[['filename', 'experiment_id', 'success', 'valid',
       'tool_chain', 'normalised_response', 
       'correct_answer', 'numeric_response', 'all_numbers', 'response']].to_csv(f"../results_analysed/tables/compiled_responses_{model}.csv", index=False)

Select only questions with non-open answers

In [None]:
# Non-open question groups 1-*, 2-*, 3-* and 5-*: keep the rows whose ID starts
# with one of these prefixes.
df_precise_answers = df[df['experiment_id'].str[:2].isin(['1-', '2-', '3-', '5-'])]

In [None]:
df_precise_answers = df_precise_answers.copy()

In [None]:
## how many questions do we cover here? should be 12
df_precise_answers['experiment_id'].unique().shape[0]

### get stats for the funnel

In [None]:
## to handle answers that have multiple acceptable options
def _to_answer_set(x):
    if pd.isna(x):
        return set()

    if isinstance(x, (list, tuple, set)):
        return {str(v).strip() for v in x if not pd.isna(v)}

    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple, set)):
                    return {str(v).strip() for v in parsed if not pd.isna(v)}
            except (ValueError, SyntaxError):
                pass
        return {s}

    return {str(x).strip()}

In [None]:
# One set of acceptable answers per experiment, built from the value that
# groupby(...).first() picks from each experiment's correct_answer column.
first_answers = df_precise_answers.groupby("experiment_id")["correct_answer"].first()
answer_sets = first_answers.apply(_to_answer_set)

In [None]:
total_non_open = df_precise_answers.shape[0]
total_non_open

In [None]:
total_non_open

In [None]:
non_open_success = df_precise_answers['success'].sum()
non_open_success

In [None]:
non_open_tool_chains = df_precise_answers[df_precise_answers['tool_chain'].str.len()>0].shape[0]
non_open_tool_chains

In [None]:
non_open_suc_valid = df_precise_answers[(df_precise_answers['valid']!=False) 
                                          & (df_precise_answers['success']==True)].shape[0]
non_open_suc_valid

In [None]:
df_precise_answers.info()

In [None]:
# Raw match: the whole response text, trimmed and lower-cased, must itself be one of
# the acceptable answers of its experiment.
df_precise_answers["is_correct_raw"] = [
    str(raw_response).strip().lower() in answer_sets.get(exp_id, set())
    for exp_id, raw_response in zip(
        df_precise_answers["experiment_id"], df_precise_answers["response"]
    )
]

In [None]:
## basic comparison
#df_precise_answers['is_correct_raw'] = df_precise_answers['response'].astype(str) == df_precise_answers['correct_answer'].astype(str)

In [None]:
non_open_correct_raw = df_precise_answers['is_correct_raw'].sum()
non_open_correct_raw

In [None]:
# correct ones
df_precise_answers[df_precise_answers['is_correct_raw']]

In [None]:
# wrong ones: a response was received, but its raw text is not an acceptable answer.
# The success test is parenthesised: `&` binds tighter than `==`, so the previous form
# evaluated `(~is_correct_raw & success) == True` and only worked by coincidence.
df_precise_answers[~df_precise_answers['is_correct_raw'] & (df_precise_answers['success'] == True)]

In [None]:
# wrong ones
#df_precise_answers[~df_precise_answers['is_correct_raw'] & df_precise_answers['success']==True][['filename','response','normalised_response', 'correct_answer']].to_csv(f"results/wrong_responses_{model}.csv", index=False)

In [None]:

# Normalised match: the normalised response, trimmed and lower-cased, must be one of
# the acceptable answers of its experiment.
df_precise_answers["is_correct_norm"] = [
    str(norm_response).strip().lower() in answer_sets.get(exp_id, set())
    for exp_id, norm_response in zip(
        df_precise_answers["experiment_id"], df_precise_answers["normalised_response"]
    )
]

In [None]:
#check_norm = df_precise_answers['normalised_response'].astype(str) == df_precise_answers['correct_answer'].astype(str)
#df_precise_answers['is_correct_norm'] = check_norm
non_open_correct_norm = df_precise_answers['is_correct_norm'].sum()
non_open_correct_norm

In [None]:
# Rows whose normalised answer is not among the acceptable answers (so, REALLY wrong).
# NOTE(review): there is no filter on `success` here, so runs without any response
# appear to be listed as well; confirm that this is intended.
df_precise_answers[~df_precise_answers['is_correct_norm']]

In [None]:
from IPython.display import HTML, display

# Inject the stylesheet that loads the Inter web font. An HTML object is only rendered
# when it is the cell's last expression or is passed to display(); as a bare expression
# in the middle of the cell it was created and discarded without any effect.
display(HTML("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');
</style>
"""))

# Applies to whichever figure `fig` currently refers to (the funnel built earlier).
fig.update_layout(
    font=dict(family="Inter, sans-serif", size=12, color="#1f2444")
)



In [None]:
# Funnel for the non-open questions, from all attempts down to correct answers.
funnel_stages = [
    ("Total attempts (non-open questions)", total_non_open),
    ("Total success (got response)", non_open_success),
    ("Total Tool Chain Uses", non_open_tool_chains),
    ("Valid Responses", non_open_suc_valid),
    ("Correct answers (direct match)", non_open_correct_raw),
    ("Correct answers (direct + normalised match)", non_open_correct_norm),
]
data = {
    "number": [count for _, count in funnel_stages],
    "stage": [label for label, _ in funnel_stages],
}

fig = px.funnel(
    data,
    x="number",
    y="stage",
    title=model.title(),
    color_discrete_sequence=["#1f2448"],
)

# Small, centred title; Inter as the figure font.
fig.update_layout(title_font_size=14, title_x=0.5)
fig.update_layout(
    font={"family": "Inter, sans-serif", "size": 14, "color": "#1f2444"}
)

# Save a PNG copy (scale factor 300/96) before showing the figure.
fig.write_image(f"../results_analysed/images/{model}_results_funnel.png", scale=300/96)
fig.show()

## 8. Add toolchain evaluation

Get toolchain validation data into a separate df

In [None]:
# Collect one row per validated tool-chain JSON file of the selected model.
path = f"../results_validated/{model}/*.json"

# glob returns files in filesystem order. Sort them so that the row order is reproducible.
# The key splits the basename into text and digit runs so that run numbers sort
# numerically ("1-1_2" before "1-1_10").
files = sorted(
    glob.glob(path),
    key=lambda p: [
        int(tok) if tok.isdecimal() else tok
        for tok in re.split(r"(\d+)", os.path.basename(p))
    ],
)
if not files:
    # Fail here with a clear message instead of a KeyError on the empty frame later on.
    raise FileNotFoundError(
        f"No validation files match {path!r}; check `model` and the working directory."
    )

rows = []

for file in files:
    # JSON is read as UTF-8 explicitly so the result does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    filename = os.path.basename(file)

    # Experiment ID is always the first part before the first "_"
    experiment_id = filename.split("_")[0]  # e.g. "1-1"

    # Run ID is everything before the "_validated" suffix
    run_id = filename.split("_validated")[0]  # e.g. "1-1_17"

    tool_chain = data.get("tool_chain", None)
    success = data.get("success", False)
    valid = data.get("valid", False)
    absurd_tool_ratio = data.get("absurd_tool_ratio", None)
    tool_path_length_difference = data.get("tool_path_length_difference", None)
    # "tool_error_rate" may be missing or null. Fall back to an empty mapping so that
    # overall_error_rate becomes None instead of raising AttributeError.
    tool_error_rate = data.get("tool_error_rate") or {}
    overall_error_rate = tool_error_rate.get("overall_error_rate")

    rows.append({
        "filename": filename,
        "experiment_id": experiment_id,
        "run_id": run_id,
        "absurd_tool_ratio": absurd_tool_ratio,
        "overall_error_rate": overall_error_rate,
        "tool_path_length_difference": tool_path_length_difference,
        "success": success,
        "valid": valid,
        "tool_chain": tool_chain,
    })

df_tool_chains = pd.DataFrame(rows)
df_tool_chains

### Tool efficiency averages for the model

In [None]:
df_tool_chains['absurd_tool_ratio'].mean()

In [None]:
df_tool_chains['absurd_tool_ratio'].value_counts()

In [None]:
df_tool_chains['overall_error_rate'].mean()

In [None]:
df_tool_chains['overall_error_rate'].value_counts()

In [None]:
df_tool_chains['tool_path_length_difference'].mean()

In [None]:
df_tool_chains['tool_path_length_difference'].value_counts()

In [None]:
df_tool_chains['tool_path_length_difference']

In [None]:
df_tool_chains.groupby('experiment_id')['overall_error_rate'].mean()

In [None]:
df

### Combine with correctness info and analyse correlation

In [None]:
df_precise_answers['run_id'] = df_precise_answers['filename'].apply(lambda x: x.split("_extracted")[0])
df_precise_answers['run_id']

In [None]:
# Correctness flags per run, keyed by run_id, ready to be joined onto the tool-chain table.
# The former rename mapped "is_correct_raw" to itself and had no effect, so it was dropped.
to_merge = df_precise_answers[['run_id', 'is_correct_norm', 'is_correct_raw']]
to_merge

In [None]:
# Left-join the correctness flags onto the tool-chain metrics by run_id.
merged = pd.merge(
    df_tool_chains,
    to_merge,
    on="run_id",
    how="left",
    validate="one_to_one",  # run_id must be unique on both sides
)

merged.head()

In [None]:
merged.shape

In [None]:
merged[['tool_path_length_difference', 'absurd_tool_ratio', 'overall_error_rate']].mean()

In [None]:
merged.groupby('experiment_id')[['tool_path_length_difference', 'absurd_tool_ratio', 'overall_error_rate']].mean()

In [None]:
merged.groupby('experiment_id')[['tool_path_length_difference', 'absurd_tool_ratio', 'overall_error_rate']].mean().mean()

In [None]:
merged['experiment_id'].value_counts()

In [None]:
tool_use_per_exp_ID = merged.groupby('experiment_id')[['tool_path_length_difference', 'absurd_tool_ratio', 'overall_error_rate']].mean()

In [None]:
tool_use_per_exp_ID.to_csv(f"../results_analysed/tables/{model}_tool_use_per_experiment_id.csv")

In [None]:
corr = merged["overall_error_rate"].corr(merged["is_correct_norm"])
print(corr)

In [None]:
corr = merged["tool_path_length_difference"].corr(merged["is_correct_norm"])
print(corr)

In [None]:
corr = merged["absurd_tool_ratio"].corr(merged["is_correct_norm"])
print(corr)

In [None]:
df_precise_answers.groupby('experiment_id')['is_correct_norm'].mean()

In [None]:
corr = merged["overall_error_rate"].corr(merged["tool_path_length_difference"])
print(corr)

In [None]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] != True)].shape

In [None]:
merged[(merged['overall_error_rate'] == 0) & (merged['is_correct_norm'] == True)].shape

In [None]:
merged[(merged['overall_error_rate'] == 0)].shape

In [None]:
df.query('experiment_id == "4-1"')

### 2026-01-08 Variance analysis

In [None]:
df_precise_answers.query('experiment_id == "5-2" and success == True')[['filename','response','normalised_response', 'numeric_response', 'correct_answer']]

In [None]:
df_precise_answers.query('experiment_id == "5-1" and success == True')[['filename','response','normalised_response', 'numeric_response', 'correct_answer']]

In [None]:
print(df_precise_answers.query('experiment_id == "4-1" and success == True')['normalised_response'])

In [None]:
df_precise_answers.query('experiment_id == "4-1"')

In [None]:
p = df_precise_answers.query('experiment_id == "5-1" and success == True')['normalised_response'].value_counts(normalize=True)
gini = 1 - np.sum(p**2)
print(gini)

In [None]:
p = df_precise_answers.query('experiment_id == "5-2" and success == True')['normalised_response'].value_counts(normalize=True)
gini = 1 - np.sum(p**2)
print(gini)

In [None]:
print(df_precise_answers.query('success == True')[['experiment_id', 'normalised_response']].head(10))

In [None]:
def _gini_impurity(responses):
    """Gini impurity (1 - sum of squared shares) of the non-null values in ``responses``.

    Returns NaN when there is no non-null value. Without this guard such a group
    was reported as 1.0, i.e. as maximally diverse, although it holds no answers.
    """
    shares = responses.value_counts(normalize=True).to_numpy(dtype=float)
    if shares.size == 0:
        return np.nan
    return 1 - np.sum(shares ** 2)


gini_impurity = (
    df.groupby("experiment_id")["normalised_response"]
      .apply(_gini_impurity)
      .rename("gini_impurity")
      .reset_index()
)

gini_impurity

In [None]:
gini_impurity.to_csv(f"../results_analysed/tables/{model}_gini_impurity.csv", index=False)

In [None]:
df_precise_answers

In [None]:
df_precise_answers.query('experiment_id == "2-1"')

In [None]:
# Per-experiment diversity summary: successful runs, correct runs, number of distinct
# normalised answers, and the Gini impurity of those answers.
summary = (
    df_precise_answers.groupby("experiment_id")
    .agg(
        n_success=("success", "sum"),
        n_correct=("is_correct_norm", "sum"),
        n_unique=("normalised_response", "nunique"),
        gini_impurity=(
            "normalised_response",
            # NaN rather than 1.0 when an experiment has no non-null answer at all:
            # an empty set of answers has no meaningful impurity.
            lambda s: (
                1 - np.sum(s.value_counts(normalize=True).to_numpy(dtype=float) ** 2)
                if s.notna().any()
                else np.nan
            ),
        ),
    )
    .reset_index()
)
summary

In [None]:
summary.to_csv(f"../results_analysed/tables/{model}_response_diversity_summary.csv", index=False)

### Create updated hit and miss tables

In [None]:
df_precise_answers

In [None]:
hit_miss(df_precise_answers)

In [None]:
hit_miss(df_precise_answers, with_emojis=False).to_csv(f"../results_analysed/tables/hit_miss_table_{model}.csv")

In [None]:
df.query('experiment_id == "4-1" and success == True').shape[0]

In [None]:
df.query('experiment_id == "2-1" and success == True').shape[0]

In [None]:
df.query('experiment_id == "3-2" and success == True').shape[0]