In [24]:
import os
import json
import re
import pandas as pd
from collections import defaultdict

def extract_size(filename):
    """Pull the training-set size out of a results filename.

    The size is the run of digits that sits between an underscore and a
    trailing ``.json`` or ``.txt`` extension (e.g. ``..._16000.json``).

    Args:
        filename: File name (or path) to inspect.

    Returns:
        The digits as a string (e.g. ``"16000"``), or ``""`` when the name
        does not end in ``_<digits>.json`` / ``_<digits>.txt``.
    """
    found = re.search(r'_(\d+)\.(json|txt)$', filename)
    if not found:
        return ""
    return found.group(1)

# Aggregated metrics: all_results[lang_pair] is a list of row dicts, one per run type.
# Each row holds "run type" plus whichever metrics were found on disk for that run.
all_results = {}

# Root results directory
results_dir = "results/"  # Change if needed

# Metrics copied verbatim from each JSON score file.
JSON_METRICS = ("predict_bleu", "eval_bleu", "predict_chrf", "eval_chrf")

# Preferred column order in the Excel sheets, so the layout does not depend on
# which file of a run happened to be read first.
COLUMN_ORDER = ["run type", *JSON_METRICS, "africomet"]


def _run_sort_key(run_type):
    """Sort key ordering run types by family name, then numerically by training size."""
    family, _, size = run_type.rpartition(" ")
    if size.isdecimal():
        return (family, int(size))
    # Unsized runs (e.g. "baseline") sort ahead of sized runs of the same family.
    return (run_type, -1)


# sorted() makes the traversal (and therefore the output) deterministic;
# os.listdir returns entries in arbitrary order.
for lang_pair in sorted(os.listdir(results_dir)):
    lang_path = os.path.join(results_dir, lang_pair)
    if not os.path.isdir(lang_path):
        continue

    # Temporary group: run_type → values
    grouped = defaultdict(dict)

    for filename in sorted(os.listdir(lang_path)):
        filepath = os.path.join(lang_path, filename)
        if not filename.endswith((".json", ".txt")):
            continue

        # Determine run type and size. "africomet_base" must be tested before
        # "africomet" because the latter is a substring of the former.
        size = extract_size(filename)
        if "africomet_base" in filename:
            run_type = f"africomet + baseline {size}" if size else "africomet + baseline"
        elif "africomet" in filename:
            run_type = f"africomet {size}" if size else "africomet"
        elif "random" in filename:
            run_type = f"random {size}" if size else "random"
        elif "baseline" in filename:
            run_type = "baseline"
        else:
            continue

        # === Handle JSON file ===
        if filename.endswith(".json"):
            with open(filepath, "r", encoding="utf-8") as f:
                scores = json.load(f)
            for metric_name in JSON_METRICS:
                grouped[run_type][metric_name] = scores.get(metric_name)

        # === Handle AfriCOMET score in .txt ===
        else:
            # errors="replace": only the "score: <float>" line matters, so stray
            # undecodable bytes elsewhere in the file should not abort the run.
            with open(filepath, "r", encoding="utf-8", errors="replace") as f:
                non_blank = [line.strip() for line in f if line.strip()]
            # Use the last non-blank line so a trailing empty line does not hide the score.
            if non_blank and "score:" in non_blank[-1]:
                score_str = non_blank[-1].split("score:")[-1].strip()
                try:
                    # Scale by 100, which puts it in the same range as the BLEU/chrF values.
                    grouped[run_type]["africomet"] = float(score_str) * 100
                except ValueError:
                    grouped[run_type]["africomet"] = None

    # Finalize list of rows for this language pair
    all_results[lang_pair] = [
        {"run type": run_type, **metrics} for run_type, metrics in grouped.items()
    ]

# === Save to Excel ===
# Language pairs without any recognised file are skipped: an empty DataFrame has
# no "run type" column to sort on, and a workbook needs at least one sheet.
non_empty = {pair: rows for pair, rows in all_results.items() if rows}
if non_empty:
    with pd.ExcelWriter("model_results.xlsx") as writer:
        for lang_pair, records in non_empty.items():
            df = pd.DataFrame(records)
            # Natural ordering: 1000, 2000, 4000, 16000 rather than lexicographic.
            row_order = sorted(
                range(len(records)),
                key=lambda i: _run_sort_key(records[i]["run type"]),
            )
            known = [c for c in COLUMN_ORDER if c in df.columns]
            extra = [c for c in df.columns if c not in COLUMN_ORDER]
            df = df.iloc[row_order][known + extra]
            # Excel limits sheet names to 31 characters.
            df.to_excel(writer, sheet_name=lang_pair[:31], index=False, float_format="%.4f")
else:
    print(f"No results found under {results_dir!r}; model_results.xlsx was not written")


In [25]:
# Display the aggregated results: a dict keyed by language-pair directory name,
# each value being the list of per-run metric rows built in the previous cell.
all_results

{'en-zu': [{'run type': 'africomet 4000',
   'predict_bleu': 3.0383,
   'eval_bleu': 3.4902,
   'predict_chrf': 26.7978,
   'eval_chrf': 27.4526,
   'africomet': 34.44},
  {'run type': 'random 1000',
   'africomet': 23.27,
   'predict_bleu': 1.7561,
   'eval_bleu': 1.8897,
   'predict_chrf': 19.5618,
   'eval_chrf': 20.7152},
  {'run type': 'africomet 16000',
   'predict_bleu': 6.614,
   'eval_bleu': 6.9138,
   'predict_chrf': 39.8583,
   'eval_chrf': 40.2587,
   'africomet': 57.410000000000004},
  {'run type': 'random 2000',
   'predict_bleu': 2.1395,
   'eval_bleu': 2.8959,
   'predict_chrf': 21.9348,
   'eval_chrf': 23.4902,
   'africomet': 27.139999999999997},
  {'run type': 'random 16000',
   'predict_bleu': 5.4113,
   'eval_bleu': 6.3626,
   'predict_chrf': 36.3241,
   'eval_chrf': 37.5336,
   'africomet': 50.519999999999996},
  {'run type': 'africomet 2000',
   'africomet': 31.03,
   'predict_bleu': 2.9101,
   'eval_bleu': 3.3998,
   'predict_chrf': 24.5707,
   'eval_chrf': 25.2

In [26]:
import os
import matplotlib.pyplot as plt

# Plot one metric against training size for every language pair, one PDF per pair.
# Each run family becomes a line; the baseline (if it has the metric) is drawn
# as a dashed horizontal reference.
FONTSIZE = 18
params = {
         'axes.labelsize': FONTSIZE,
         'axes.titlesize': FONTSIZE,
         'xtick.labelsize':FONTSIZE,
         'legend.fontsize':FONTSIZE,
         'ytick.labelsize':FONTSIZE}

plt.rcParams.update(params)

# Metric to plot
metric = "africomet" # change with eval_bleu, predict_bleu, eval_chrf, predict_chrf, africomet

# Output folder
os.makedirs("plots", exist_ok=True)

for lang_pair, results in all_results.items():
    # Insertion order here fixes the plotting order (and hence the colour cycle).
    series = {"random": {}, "africomet": {}, "africomet + baseline": {}, "baseline": {}}

    for entry in results:
        run_type = entry["run type"]
        value = entry.get(metric)
        # A run without this metric has nothing to plot. Skipping it also keeps
        # None away from axhline(), which cannot compare None to the axis bounds.
        if value is None:
            continue

        if run_type == "baseline":
            series["baseline"]["baseline"] = value
            continue

        # Run types look like "<family> <size>". Splitting on the last space
        # matches the family exactly, so "africomet" can never be confused with
        # "africomet + baseline".
        family, _, size = run_type.rpartition(" ")
        if family == "baseline" or family not in series:
            continue
        try:
            series[family][int(size)] = value
        except ValueError:
            continue

    # Order each line's points by training size.
    for key in ["random", "africomet + baseline", "africomet"]:
        series[key] = dict(sorted(series[key].items()))

    # Check if there's anything to plot
    something_to_plot = any(len(data) > 0 for key, data in series.items() if key != "baseline")
    has_baseline = "baseline" in series["baseline"]

    if not something_to_plot and not has_baseline:
        print(f"⚠️ Skipping {lang_pair} — no data for {metric}")
        continue

    # === Plot ===
    fig, ax = plt.subplots(figsize=(8, 5), dpi=300)

    for label, data in series.items():
        if label == "baseline":
            if has_baseline:
                ax.axhline(y=data["baseline"], linestyle='--', color='gray', label="baseline")
        elif data:
            ax.plot(list(data.keys()), list(data.values()), marker='o', label=label)

    #ax.set_title(f"{lang_pair.upper()} – {metric}", fontsize=14)
    ax.set_xlabel("Training Size")
    ax.set_ylabel(metric)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.legend()
    plt.tight_layout()

    # Save the figure
    save_path = os.path.join("plots", f"{lang_pair}_{metric}.pdf")
    fig.savefig(save_path, format="pdf", dpi=300)
    plt.close(fig)  # Free memory


In [27]:
import os
import matplotlib.pyplot as plt
import numpy as np

# For every language pair and run family, draw a grouped bar chart of
# (run score - baseline score) per training size, one bar per metric.
FONTSIZE = 16
plt.rcParams.update({
    'font.size': FONTSIZE,
    'axes.labelsize': FONTSIZE,
    'axes.titlesize': FONTSIZE,
    'xtick.labelsize': FONTSIZE,
    'ytick.labelsize': FONTSIZE,
    'legend.fontsize': FONTSIZE,
})

os.makedirs("plots", exist_ok=True)

metrics = ["predict_bleu", "predict_chrf", "africomet"]
sizes = [1000, 2000, 4000, 8000, 16000, 32000]
# group name (used in the output file name) -> run-type prefix to look up
groups = {
    "africomet + baseline": "africomet + baseline",
    "africomet": "africomet",
    "random": "random"
}

for lang_pair, results in all_results.items():
    # Index rows by run type once instead of rescanning the list per size.
    # setdefault keeps the first row for a run type.
    rows_by_type = {}
    for row in results:
        rows_by_type.setdefault(row["run type"], row)

    # Find baseline
    baseline_entry = rows_by_type.get("baseline")
    if baseline_entry is None:
        print(f"⚠️ No baseline for {lang_pair}, skipping")
        continue

    # A metric missing from the baseline stays None (not 0): subtracting 0 would
    # plot the raw score under an axis labelled as a difference from baseline.
    baseline_scores = {m: baseline_entry.get(m) for m in metrics}

    for group_name, pattern in groups.items():
        # Initialize
        delta_values = {m: [] for m in metrics}
        available_sizes = []

        for size in sizes:
            entry = rows_by_type.get(f"{pattern} {size}")
            if entry is None:
                continue  # skip missing runs

            available_sizes.append(size)
            for m in metrics:
                val = entry.get(m)
                baseline = baseline_scores[m]
                # NaN (rather than None) is a valid bar height; matplotlib
                # simply draws no bar for it.
                if val is None or baseline is None:
                    delta_values[m].append(np.nan)
                else:
                    delta_values[m].append(val - baseline)

        if not available_sizes:
            continue  # nothing to plot for this group

        # Plotting: one cluster of bars per available size, evenly spaced on x.
        x = np.arange(len(available_sizes))
        width = 0.25
        offsets = [-width, 0, width]

        fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
        colors = ["tab:blue", "tab:orange", "tab:green"]

        for i, m in enumerate(metrics):
            ax.bar(x + offsets[i], delta_values[m], width, label=m, color=colors[i])

        ax.axhline(0, linestyle="--", color="black", linewidth=1)
        ax.set_xticks(x)
        ax.set_xticklabels(available_sizes)
        ax.set_xlabel("Training Size")
        ax.set_ylabel("Score Difference from Baseline")
        #ax.set_title(f"{lang_pair.upper()} – Δ from Baseline ({group_name})")
        ax.legend()
        ax.grid(axis='y', linestyle='--', alpha=0.6)

        plt.tight_layout()
        fname = f"{lang_pair}_{group_name.replace(' ', '_')}_delta.pdf"
        fig.savefig(os.path.join("plots", fname), format="pdf", dpi=300)
        plt.close(fig)

print("✅ Grouped delta plots saved in 'plots/' folder")


✅ Grouped delta plots saved in 'plots/' folder
