In [None]:
!pip install -q evaluate rouge_score


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import json
import difflib
import evaluate

# Load the ROUGE metric once; eval_jsonl() below reuses this module-level object.
rouge = evaluate.load("rouge")

def string_similarity(a: str, b: str) -> float:
    """Return the character-level similarity ratio of two strings.

    The score is ``difflib.SequenceMatcher(...).ratio()``: ``2 * M / T`` where
    ``M`` is the number of matched characters and ``T`` is the combined length
    of both strings. It lies in ``[0.0, 1.0]`` and is ``1.0`` for identical
    strings (including two empty strings).

    Args:
        a: First string (e.g. the model prediction).
        b: Second string (e.g. the reference answer).

    Returns:
        Similarity ratio as a float between 0.0 and 1.0.

    Note:
        ``autojunk`` is disabled on purpose. With the default heuristic, once
        ``b`` is at least 200 characters long every character that makes up
        more than ~1% of it is treated as junk. For natural-language text that
        is nearly every letter, so the ratio collapses toward 0 even for
        identical long answers. Turning it off gives a meaningful score at the
        cost of more work on long inputs (worst case quadratic).
    """
    matcher = difflib.SequenceMatcher(None, a, b, autojunk=False)
    return matcher.ratio()

def load_jsonl(path: str):
    """Read a JSON Lines file and return its records as a list.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped, and every remaining line is parsed with
    ``json.loads``.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Blank lines become empty strings after strip() and are dropped here.
        return [json.loads(text) for text in stripped_lines if text]

def eval_jsonl(path: str, ref_key: str, pred_key: str):
    """Score the predictions stored in a JSONL file against their references.

    Loads the file with ``load_jsonl``, computes ROUGE with the module-level
    ``rouge`` metric, and averages ``string_similarity`` over all
    (prediction, reference) pairs. A report is printed along the way.

    Args:
        path: JSONL file to evaluate.
        ref_key: Key in each row holding the reference text.
        pred_key: Key in each row holding the model prediction.

    Returns:
        ``(rouge_result, avg_sim)`` where ``rouge_result`` is the mapping
        returned by ``rouge.compute`` and ``avg_sim`` is the mean similarity.
        Returns ``None`` (after printing a notice) when the file has no rows.

    Raises:
        KeyError: If any row lacks ``ref_key`` or ``pred_key``.
    """
    print(f"\n=== Evaluating {path} ===")
    records = load_jsonl(path)
    if len(records) == 0:
        print("File is empty.")
        return None

    # Show the schema of the first row to help pick the right keys.
    first_row_keys = list(records[0].keys())
    print("Available keys in first row:", first_row_keys)

    # References are collected in full before predictions, as two separate passes.
    references = [record[ref_key] for record in records]
    predictions = [record[pred_key] for record in records]

    rouge_result = rouge.compute(predictions=predictions, references=references)

    pair_scores = []
    for pred_text, ref_text in zip(predictions, references):
        pair_scores.append(string_similarity(pred_text, ref_text))
    avg_sim = sum(pair_scores) / len(pair_scores)

    print("\nROUGE scores:")
    for metric_name in rouge_result:
        metric_value = rouge_result[metric_name]
        print(f"  {metric_name}: {metric_value:.4f}")
    print(f"Average similarity: {avg_sim:.4f}")

    return rouge_result, avg_sim


In [None]:
# Evaluate the finance results file (name suggests a 3B model); references are
# read from "ground_truth" and predictions from "model_answer".
eval_jsonl(
    "/content/finance_eval_results_3b.jsonl",
    ref_key="ground_truth",
    pred_key="model_answer",
)



=== Evaluating /content/finance_eval_results_3b.jsonl ===
Available keys in first row: ['question', 'ground_truth', 'model_answer']

ROUGE scores:
  rouge1: 0.2772
  rouge2: 0.0956
  rougeL: 0.1530
  rougeLsum: 0.2287
Average similarity: 0.0652


({'rouge1': np.float64(0.2771505082016702),
  'rouge2': np.float64(0.09560940342629076),
  'rougeL': np.float64(0.15295116910191997),
  'rougeLsum': np.float64(0.22872246092825926)},
 0.0651824172272246)

In [None]:
# Evaluate the finance results file (name suggests a 7B model); references are
# read from "ground_truth" and predictions from "model_answer".
eval_jsonl(
    "/content/finance_eval_results_7b.jsonl",
    ref_key="ground_truth",
    pred_key="model_answer",
)



=== Evaluating /content/finance_eval_results_7b.jsonl ===
Available keys in first row: ['question', 'ground_truth', 'model_answer']

ROUGE scores:
  rouge1: 0.2791
  rouge2: 0.0970
  rougeL: 0.1595
  rougeLsum: 0.2337
Average similarity: 0.0680


({'rouge1': np.float64(0.2790971821398922),
  'rouge2': np.float64(0.09699281163626103),
  'rougeL': np.float64(0.1595131924702185),
  'rougeLsum': np.float64(0.23367260898005793)},
 0.06801868452379878)

In [None]:
# Evaluate the Alpaca results file (name suggests a Qwen 3B model); this file
# stores references under "reference_answer" rather than "ground_truth".
eval_jsonl(
    "/content/qwen_3b_alpaca.jsonl",
    ref_key="reference_answer",
    pred_key="model_answer"
)



=== Evaluating /content/qwen_3b_alpaca.jsonl ===
Available keys in first row: ['question', 'reference_answer', 'model_answer', 'similarity']

ROUGE scores:
  rouge1: 0.2658
  rouge2: 0.0820
  rougeL: 0.1856
  rougeLsum: 0.2075
Average similarity: 0.1531


({'rouge1': np.float64(0.2658341809943737),
  'rouge2': np.float64(0.08195181321630535),
  'rougeL': np.float64(0.18564239689275291),
  'rougeLsum': np.float64(0.207457095113191)},
 0.15308032141216932)

In [None]:
# Evaluate the Alpaca results file (name suggests a Qwen 7B model); this file
# stores references under "reference_answer" rather than "ground_truth".
eval_jsonl(
    "/content/qwen_7b_alpaca.jsonl",
    ref_key="reference_answer",
    pred_key="model_answer"
)



=== Evaluating /content/qwen_7b_alpaca.jsonl ===
Available keys in first row: ['question', 'reference_answer', 'model_answer', 'similarity']

ROUGE scores:
  rouge1: 0.3092
  rouge2: 0.1332
  rougeL: 0.2236
  rougeLsum: 0.2480
Average similarity: 0.1765


({'rouge1': np.float64(0.30919887425692344),
  'rouge2': np.float64(0.1331754045312953),
  'rougeL': np.float64(0.22362397880912516),
  'rougeLsum': np.float64(0.24804314965286012)},
 0.17647863747033082)

In [None]:
# Evaluate the "nofinetune" results file (name suggests a baseline Qwen2.5 3B
# run); this file uses the keys "reference" and "answer".
eval_jsonl(
    "/content/qwen25_3b_nofinetune.jsonl",
    ref_key="reference",
    pred_key="answer"
)



=== Evaluating /content/qwen25_3b_nofinetune.jsonl ===
Available keys in first row: ['question', 'reference', 'answer', 'similarity']

ROUGE scores:
  rouge1: 0.2592
  rouge2: 0.0822
  rougeL: 0.1686
  rougeLsum: 0.1969
Average similarity: 0.0897


({'rouge1': np.float64(0.25921773437845536),
  'rouge2': np.float64(0.082183922058264),
  'rougeL': np.float64(0.1685753311002371),
  'rougeLsum': np.float64(0.1969009799648081)},
 0.08969128642142779)