In [1]:
# ---- 1) Setup ----
!pip -q install sacrebleu pandas

import re
from pathlib import Path
import pandas as pd
from sacrebleu.metrics import BLEU

print("Imports ready.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hImports ready.


In [2]:
# ---- 2) Configuration ----
# Filename pattern of the per-model translation CSVs to score.
GLOB_PATTERN = "translations_*_en-zh.csv"
# Folder that is searched for those CSVs; point it elsewhere if they live in another directory.
DATA_DIR = Path("/kaggle/input/bleu-en-zh-only")

In [3]:
# ---- 3) Helpers ----
def extract_model_name(csv_path: Path) -> str:
    """
    Derive the model identifier embedded in a translations CSV filename.

    Example:
        translations_llama3-groq-8b-8192-tool-use-preview_en-zh.csv
        -> llama3-groq-8b-8192-tool-use-preview

    Filenames that do not follow the ``translations_<model>_en-zh.csv``
    pattern fall back to the file stem (the name without its extension).
    """
    filename_match = re.match(r"^translations_(.+)_en-zh\.csv$", csv_path.name)
    if filename_match is None:
        # Unrecognised naming scheme: use the bare stem as the label.
        return csv_path.stem
    return filename_match.group(1)

def load_pairs(csv_path: Path) -> tuple[list[str], list[str], int]:
    """
    Load Reference (gold CN) and Translation (system CN) lists from CSV.

    Rows in which either "Reference" or "Translation" is missing (NaN) or
    blank after stripping surrounding whitespace are dropped.

    Args:
        csv_path: Path to the CSV file; read as UTF-8, a leading BOM is tolerated.

    Returns:
        (hyps, refs, n_rows_after_clean): parallel lists of system
        translations and references, plus the number of rows kept.

    Raises:
        ValueError: If any of the "Source", "Reference" or "Translation"
            columns is absent from the file.
    """
    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    expected_cols = {"Source", "Reference", "Translation"}
    missing = expected_cols - set(df.columns)
    if missing:
        raise ValueError(f"{csv_path.name}: Missing columns {missing}.")

    text_cols = ["Reference", "Translation"]
    # Drop missing cells *before* casting: astype(str) would turn NaN into the
    # literal string "nan", which passes the non-empty filter below and would
    # be scored as if it were real text.
    pairs = df.dropna(subset=text_cols)
    pairs = pairs.assign(
        **{col: pairs[col].astype(str).str.strip() for col in text_cols}
    )
    mask = (pairs["Reference"] != "") & (pairs["Translation"] != "")
    pairs = pairs.loc[mask].reset_index(drop=True)

    refs = pairs["Reference"].tolist()
    hyps = pairs["Translation"].tolist()
    return hyps, refs, len(pairs)

In [4]:
# ---- 4) BLEU computation with class API ----
# Create BLEU scorer once with Chinese tokenizer
bleu_scorer = BLEU(tokenize="zh")

csv_files = sorted(DATA_DIR.glob(GLOB_PATTERN))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR.resolve()} matching '{GLOB_PATTERN}'.")

print(f"Found {len(csv_files)} file(s).")

results = []
for csv_path in csv_files:
    model_name = extract_model_name(csv_path)
    # Best-effort per file: a broken CSV is reported and skipped so the
    # remaining models still get scored.
    try:
        hyps, refs, n = load_pairs(csv_path)
        if n == 0:
            print(f"[WARN] {csv_path.name}: No valid rows after cleaning; skipping.")
            continue

        # BLEU expects refs as list of list-of-strings [[ref1, ref2, ...]]
        score = bleu_scorer.corpus_score(hyps, [refs])

        print(f"{model_name:50s}  BLEU = {score.score:6.2f}  (n={n})")
        results.append({"model": model_name, "n": n, "BLEU": score.score, "file": csv_path.name})
    except Exception as e:
        print(f"[ERROR] {csv_path.name}: {e}")

# Create summary table.
# Explicit columns keep the frame well-formed when `results` is empty (every
# file skipped or failed); otherwise sort_values(by="BLEU") raises KeyError.
summary_df = (
    pd.DataFrame(results, columns=["model", "n", "BLEU", "file"])
    .sort_values(by="BLEU", ascending=False)
    .reset_index(drop=True)
)

# Corpus BLEU over different numbers of sentence pairs is not directly
# comparable, so flag rankings that mix corpus sizes.
if summary_df["n"].nunique() > 1:
    print("[WARN] Models were scored on different numbers of rows (see column 'n'); "
          "BLEU values are not directly comparable.")

summary_df

Found 13 file(s).
OLMo-1B-0724-hf                                     BLEU =   3.91  (n=100)
Phi-3.5-mini-instruct                               BLEU =   8.95  (n=50)
Qwen2.5-0.5B-Instruct                               BLEU =   3.78  (n=100)
Qwen2.5-1.5B-Instruct                               BLEU =   3.42  (n=100)
Qwen2.5-3B-Instruct                                 BLEU =   7.33  (n=100)
gemma-7b-it                                         BLEU =  34.53  (n=100)
gemma2-9b-it                                        BLEU =  13.26  (n=100)
llama-3.1-70b-versatile                             BLEU =  34.33  (n=100)
llama-3.1-8b-instant                                BLEU =  14.72  (n=100)
llama-3.2-90b-vision-preview                        BLEU =  35.46  (n=100)
llama3-groq-70b-8192-tool-use-preview               BLEU =  39.54  (n=100)
llama3-groq-8b-8192-tool-use-preview                BLEU =  14.74  (n=100)
mixtral-8x7b-32768                                  BLEU =   5.55  (n=100)


Unnamed: 0,model,n,BLEU,file
0,llama3-groq-70b-8192-tool-use-preview,100,39.538028,translations_llama3-groq-70b-8192-tool-use-pre...
1,llama-3.2-90b-vision-preview,100,35.457182,translations_llama-3.2-90b-vision-preview_en-z...
2,gemma-7b-it,100,34.531829,translations_gemma-7b-it_en-zh.csv
3,llama-3.1-70b-versatile,100,34.330789,translations_llama-3.1-70b-versatile_en-zh.csv
4,llama3-groq-8b-8192-tool-use-preview,100,14.744533,translations_llama3-groq-8b-8192-tool-use-prev...
5,llama-3.1-8b-instant,100,14.716393,translations_llama-3.1-8b-instant_en-zh.csv
6,gemma2-9b-it,100,13.260553,translations_gemma2-9b-it_en-zh.csv
7,Phi-3.5-mini-instruct,50,8.948533,translations_Phi-3.5-mini-instruct_en-zh.csv
8,Qwen2.5-3B-Instruct,100,7.330566,translations_Qwen2.5-3B-Instruct_en-zh.csv
9,mixtral-8x7b-32768,100,5.549398,translations_mixtral-8x7b-32768_en-zh.csv
