## Calculate metrics for task 3

In [None]:
import json
import os
import pandas as pd
import warnings

from nltk.translate.bleu_score import sentence_bleu
from rouge_score.rouge_scorer import RougeScorer

# Suppress every warning category for the rest of the session.
# NOTE(review): presumably meant to hide the n-gram overlap warnings emitted by
# nltk's sentence_bleu -- confirm, since this also hides unrelated warnings.
warnings.filterwarnings("ignore")

In [None]:
# Root folder containing one sub-directory per evaluated task-3 prompt.
# The path is relative: it starts at the parent of the current working directory.
output_path = os.path.join(
    os.path.pardir,
    *"output/naive/beauty-small-1398/all/task-3".split("/"),
)

In [None]:
# Collect the sub-directories of ``output_path`` and order them by the integer
# found in the second dash-separated field of their name (e.g. "x-7" -> 7).
# Every directory name must therefore contain a "-" followed by a number.
path_entries = sorted(
    (
        name
        for name in os.listdir(output_path)
        if os.path.isdir(os.path.join(output_path, name))
    ),
    key=lambda name: int(name.split("-")[1]),
)

In [None]:
def calculate_bleu(prediction: str, ground_truth: str) -> float:
    """Return the sentence-level BLEU score of *prediction* against *ground_truth*.

    Both texts are lower-cased, stripped and split on whitespace before
    scoring; the ground truth is passed as the single reference.
    """
    hypothesis, reference = [
        text.lower().strip().split() for text in (prediction, ground_truth)
    ]
    return sentence_bleu([reference], hypothesis)

In [None]:
def calculate_rouge(scorer: RougeScorer, prediction: str, ground_truth: str) -> tuple[float, float, float]:
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-measures for one text pair.

    Args:
        scorer: Scorer that must have been built with the "rouge1", "rouge2"
            and "rougeL" metrics, since those keys are read from its result.
        prediction: Generated text (the candidate).
        ground_truth: Reference text.

    Returns:
        A ``(rouge1, rouge2, rougeL)`` tuple of F-measures.
    """
    prediction = prediction.lower().strip()
    ground_truth = ground_truth.lower().strip()

    # RougeScorer.score expects (target, prediction): the reference comes first.
    # The F-measure is symmetric, so the returned values are the same as with
    # the swapped order, but precision and recall would be exchanged otherwise.
    result = scorer.score(ground_truth, prediction)

    return result["rouge1"].fmeasure, result["rouge2"].fmeasure, result["rougeL"].fmeasure

In [None]:
def _split_rating(text: str) -> tuple[float, str]:
    """Split ``"<rating>, <explanation>"`` into a float rating and the explanation.

    Raises:
        ValueError: If *text* has no ", " separator or its prefix is not a number.
    """
    rating, explanation = text.split(", ", maxsplit=1)
    return float(rating), explanation


def _mean(values) -> float:
    """Return the arithmetic mean of *values*, or NaN when *values* is empty."""
    return sum(values) / len(values) if values else float("nan")


# Per-task metric averages; every list gets exactly one value per processed task
# so that they line up with ``tasks``.
tasks = []
all_result = {"bleu": [], "rouge-1": [], "rouge-2": [], "rouge-l": [], "rmse": [], "mae": []}

# The scorer configuration is the same for every task, so build it once.
scorer = RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

for path_entry in path_entries:
    if not os.path.isdir(os.path.join(output_path, path_entry)):
        print(f"Skipping {path_entry}")
        continue

    print(f"Processing {path_entry}")
    tasks.append(path_entry)

    # Use a context manager so the file handle is closed right after parsing.
    results_path = os.path.join(output_path, path_entry, "results.json")
    with open(results_path, "r", encoding="utf-8") as results_file:
        output = json.load(results_file)

    prompts = []
    pred_explanations = []
    pred_ratings = []
    gt_explanations = []
    gt_ratings = []

    for entry in output:
        prompts.append(entry["source_text"])
        prediction = entry["pred"]
        ground_truth = entry["gt"]

        try:
            # A rating is only extracted when BOTH texts start with one.
            pred_rating, pred_explanation = _split_rating(prediction)
            gt_rating, gt_explanation = _split_rating(ground_truth)

        except ValueError:
            # Otherwise keep both texts untouched and record no rating; this
            # also covers a rated prediction paired with an unrated ground
            # truth, which previously raised an uncaught ValueError.
            pred_rating, pred_explanation = None, prediction
            gt_rating, gt_explanation = None, ground_truth

        pred_ratings.append(pred_rating)
        pred_explanations.append(pred_explanation)
        gt_ratings.append(gt_rating)
        gt_explanations.append(gt_explanation)

    bleu = tuple(
        calculate_bleu(prediction, ground_truth)
        for prediction, ground_truth in zip(pred_explanations, gt_explanations)
    )
    rouge = tuple(
        calculate_rouge(scorer, prediction, ground_truth)
        for prediction, ground_truth in zip(pred_explanations, gt_explanations)
    )
    # Split the per-sample triples by position instead of ``zip(*rouge)``,
    # which cannot be unpacked when a results file contains no samples.
    rouge_1 = tuple(scores[0] for scores in rouge)
    rouge_2 = tuple(scores[1] for scores in rouge)
    rouge_l = tuple(scores[2] for scores in rouge)

    bleu_mean = _mean(bleu)
    rouge_1_mean = _mean(rouge_1)
    rouge_2_mean = _mean(rouge_2)
    rouge_l_mean = _mean(rouge_l)

    print(f"  BLEU: {bleu_mean}")
    print(f"  ROUGE-1: {rouge_1_mean}")
    print(f"  ROUGE-2: {rouge_2_mean}")
    print(f"  ROUGE-L: {rouge_l_mean}")

    all_result["bleu"].append(bleu_mean)
    all_result["rouge-1"].append(rouge_1_mean)
    all_result["rouge-2"].append(rouge_2_mean)
    all_result["rouge-l"].append(rouge_l_mean)

    # Rating errors are only computed for task folders whose name contains
    # "3-7" or "3-8"; every other task gets None in both columns.
    if "3-7" in path_entry or "3-8" in path_entry:
        invalid_ratings = 0
        squared_errors = []
        absolute_errors = []

        for prediction, ground_truth in zip(pred_ratings, gt_ratings):
            if prediction is None or ground_truth is None:
                invalid_ratings += 1
                continue

            error = prediction - ground_truth
            squared_errors.append(error ** 2)
            absolute_errors.append(abs(error))

        # RMSE is the square root of the mean squared error; the root was
        # missing before, so the reported value was actually the MSE.
        rmse = _mean(squared_errors) ** 0.5
        mae = _mean(absolute_errors)
        invalid_share = invalid_ratings / len(pred_ratings) * 100 if pred_ratings else 0.0

        print(f"  RMSE: {rmse}")
        print(f"  MAE: {mae}")
        print(f"  Invalid ratings: {invalid_ratings} ({invalid_share:.2f}%)")

        all_result["rmse"].append(rmse)
        all_result["mae"].append(mae)

    else:
        all_result["rmse"].append(None)
        all_result["mae"].append(None)

In [None]:
# Turn the per-metric lists into a table with one row per processed task,
# then display it as the cell output.
all_result = pd.DataFrame(data=all_result, index=tasks)
all_result

In [None]:
# Express the text metrics as percentages. The scaling happens in place, so
# running this cell again multiplies the values by 100 once more.
for metric_name in ("bleu", "rouge-1", "rouge-2", "rouge-l"):
    all_result[metric_name] *= 100
all_result

In [None]:
# Append an "average" row holding the column-wise mean of every metric
column_means = all_result.mean()
all_result.loc["average"] = column_means

In [None]:
# Write the table next to the task folders; the row labels go into a "task" column.
metrics_path = os.path.join(output_path, "metrics.csv")
all_result.to_csv(metrics_path, index=True, index_label="task")