In [10]:
import re
import json
import pandas as pd

# Patterns that extract the individual fields from a model response.
# They are raw strings so that the regex escapes (\d, \., \[) reach the `re`
# module unchanged without relying on Python tolerating unknown string escapes,
# which triggers a warning on recent interpreter versions.
regex_word_guess = r'[\d\.|-] \[.* = (.*)'
regex_firstpass = r'Prima lettura: (.*)'
regex_solution_word = r"\d+ = (.*)"
regex_solution = r"Soluzione: (.*)"


def _join_matches(pattern, text):
    """Return every capture of `pattern` found in `text`, joined with ';'."""
    return ";".join(re.findall(pattern, text))


def _first_match(pattern, text):
    """Return the first capture of `pattern` in `text`, or "" if there is none."""
    matches = re.findall(pattern, text)
    return matches[0] if matches else ""


def parse_generation(doc_id, resp):
    """Extract the structured fields from one generated response.

    Args:
        doc_id: Identifier of the example; copied unchanged into the result.
        resp: Text generated by the model. A value that is not a string
            (e.g. None) is treated as an empty response.

    Returns:
        A dict with the keys:
            "idx": `doc_id`.
            "word_guesses": all matches of `regex_word_guess`, ';'-joined.
            "first_pass": first match of `regex_firstpass`.
            "solution_words": all matches of `regex_solution_word`, ';'-joined.
            "solution": first match of `regex_solution`.
        Any field whose pattern does not match is the empty string.
    """
    # Best-effort parsing: a malformed response yields empty fields rather
    # than an exception, without resorting to a catch-all `except`.
    if not isinstance(resp, str):
        resp = ""
    return {
        "idx": doc_id,
        "word_guesses": _join_matches(regex_word_guess, resp),
        "first_pass": _first_match(regex_firstpass, resp),
        "solution_words": _join_matches(regex_solution_word, resp),
        "solution": _first_match(regex_solution, resp),
    }

def parse_calamita_outputs(
    setting: str = "eureka_original",
    model: str = "llama3.1_8b",
    do_gold: bool = False,
    outputs_dir: str = "../eureka-rebus-calamita-2024/outputs",
) -> pd.DataFrame:
    """Load one JSONL outputs file and turn it into a table of parsed examples.

    The file read is `<outputs_dir>/<setting>/<model>.jsonl`, one JSON object
    per line.

    Args:
        setting: Name of the sub-directory of `outputs_dir` to read from.
        model: Base name (without `.jsonl`) of the file to read.
        do_gold: If True, copy the reference fields stored under each entry's
            "doc" key. If False, parse the first element of the entry's
            "filtered_resps" with `parse_generation`.
        outputs_dir: Directory that contains the per-setting sub-directories.

    Returns:
        A DataFrame with one row per entry, ordered by "idx", holding the
        columns "idx", "word_guesses", "first_pass", "solution_words" and
        "solution". An input without entries gives an empty DataFrame.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry lacks one of the fields that are read.
    """
    examples = []
    # Explicit UTF-8: the content is Italian text, so the platform default
    # encoding must not decide how accented characters are decoded.
    with open(f"{outputs_dir}/{setting}/{model}.jsonl", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            entry = json.loads(line)
            if do_gold:
                doc = entry["doc"]
                example = {
                    "idx": entry["doc_id"],
                    "word_guesses": doc["word_guesses"],
                    "first_pass": doc["first_pass"],
                    "solution_words": doc["solution_words"],
                    "solution": doc["solution"],
                }
            else:
                example = parse_generation(entry["doc_id"], entry["filtered_resps"][0])
            examples.append(example)
    # The file is no longer needed here; order rows by example id.
    examples.sort(key=lambda ex: ex["idx"])
    return pd.DataFrame(examples)

In [11]:
# Export the gold annotations. The CSV is written into the outputs directory
# because that is the location the evaluation cell passes as --gold_outputs;
# writing it to the working directory would leave the evaluation reading a
# missing or stale file after a fresh run.
gold_df = parse_calamita_outputs(do_gold=True)
gold_df.to_csv("../eureka-rebus-calamita-2024/outputs/calamita_gold.csv", index=False)

In [3]:
# Model identifiers iterated over by the export and evaluation loops.
MODELS = [
    "llama3.1_8b",
    "llama3.1_70b",
    "llamantino3_8b",
    "minerva_7b",
]

# Setting names, in the order in which they are processed for each model.
SETTINGS = [
    "eureka_hints",
    "eureka_original",
]

In [None]:
# Parse the generations of every model under every setting and save each
# parsed table as `<model>_results.csv` inside that setting's directory.
for model in MODELS:
    for setting in SETTINGS:
        df = parse_calamita_outputs(model=model, setting=setting)
        results_csv = f"../eureka-rebus-calamita-2024/outputs/{setting}/{model}_results.csv"
        df.to_csv(results_csv, index=False)

In [10]:
# Run the evaluation script once per (model, setting) pair. This cell only
# prints the model/setting headers; the metric lines in the output come from
# the script, which is given the parsed predictions CSV, the gold CSV and
# three word-frequency JSON files.
for model in MODELS:
    print("=" * 10 + f"\nEvaluating {model}")
    for setting in SETTINGS:
        print(f"{setting.upper()}")
        # IPython shell escape: {setting} and {model} are filled in from the
        # notebook namespace before the command is executed.
        !python ../scripts/evaluate.py \
        --predicted_outputs ../eureka-rebus-calamita-2024/outputs/{setting}/{model}_results.csv \
        --gold_outputs ../eureka-rebus-calamita-2024/outputs/calamita_gold.csv \
        --word_frequencies ../outputs/word_frequencies_paisa.json \
        --word_frequencies_fp_train ../eureka-rebus/word_frequencies_fp_train.json \
        --word_frequencies_solution_train ../eureka-rebus/word_frequencies_solution_train.json
        print("\n")

Evaluating llama3.1_8b
EUREKA_HINTS
Word Guess Accuracy: 0.07
Word Guess Length: 0.16
Error: nan legno abete cicno pi abietina cicuta sasso cicero cicale
First Pass Exact Match:: 0.0
Solution Word Accuracy: 0.01
Solution Word Lengths: 0.09
Solution Exact Match: 0.0


EUREKA_ORIGINAL
Word Guess Accuracy: 0.09
Word Guess Length: 0.18
First Pass Exact Match:: 0.0
Solution Word Accuracy: 0.02
Solution Word Lengths: 0.17
Solution Exact Match: 0.0


Evaluating llama3.1_70b
EUREKA_HINTS
Word Guess Accuracy: 0.3
Word Guess Length: 0.49
First Pass Exact Match:: 0.04
Solution Word Accuracy: 0.06
Solution Word Lengths: 0.21
Solution Exact Match: 0.01


EUREKA_ORIGINAL
Word Guess Accuracy: 0.34
Word Guess Length: 0.42
First Pass Exact Match:: 0.07
Solution Word Accuracy: 0.08
Solution Word Lengths: 0.26
Solution Exact Match: 0.0


Evaluating llamantino3_8b
EUREKA_HINTS
Word Guess Accuracy: 0.0
Word Guess Length: 0.01
First Pass Exact Match:: 0.0
Solution Word Accuracy: 0.0
Solution Word Lengths: 0