In [17]:
import sys
sys.path.append("..")

In [18]:
import pandas as pd
from IPython.display import display
import evaluate

In [19]:
df = pd.read_csv("../dataset/results.csv")
df.columns

Index(['url', 'raw_markdown', 'language', 'domain', 'source', 'length',
       'word_count', 'cleaned_markdown', 'summary_gt', 'summary_gt_extractive',
       'summary_random', 'summary_lsa', 'summary_text_rank', 'summary_fast',
       'summary_random_latency', 'summary_lsa_latency',
       'summary_text_rank_latency', 'summary_fast_latency'],
      dtype='object')

In [20]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [21]:
# 3. Preprocessing function
def preprocess(text: str) -> str:
    return " ".join(text.lower().split())

In [None]:
# Cell 3: Compute per-output metrics
summarizers = ['random', 'lsa', 'text_rank', 'fast']
records = []

for name in summarizers:
    print(f"Processing summarizer: {name}")
    pred_col = f"summary_{name}"
    
    # gt_col = "summary_gt"
    gt_col = "summary_gt_extractive"
    
    # Drop NaNs for evaluation
    eval_df = df.dropna(subset=[pred_col, gt_col])
    refs  = eval_df[gt_col].map(preprocess).tolist()
    preds = eval_df[pred_col].map(preprocess).tolist()
    
    # Compute BERTScore once per summarizer
    bert_scores = bertscore.compute(
        predictions=preds,
        references=refs,
        model_type="xlm-roberta-base",
        idf=False,
        batch_size=16,
    )
    
    # Now record metrics for each example
    for i, idx in enumerate(eval_df.index):
        # ROUGE on the single example
        rouge_score = rouge.compute(
            predictions=[preds[i]],
            references=[refs[i]],
            use_stemmer=True
        )
        
        records.append({
            "url":             df.loc[idx, "url"],
            "language":        df.loc[idx, "language"],
            "gt":              df.loc[idx, gt_col],
            "summary_pred":    df.loc[idx, pred_col],
            "length":          df.loc[idx, "length"],
            "summarizer":      name,
            "rouge1":          rouge_score["rouge1"],
            "rouge2":          rouge_score["rouge2"],
            "rougeL":          rouge_score["rougeL"],
            "rougeLsum":       rouge_score["rougeLsum"],
            "bert_precision":  bert_scores["precision"][i],
            "bert_recall":     bert_scores["recall"][i],
            "bert_f1":         bert_scores["f1"][i],
        })


Processing summarizer: random
Processing summarizer: lsa
Processing summarizer: text_rank
Processing summarizer: fast
Processing summarizer: summary_gt


KeyError: ['summary_summary_gt']

In [23]:
# Build the per-example metrics DataFrame
long_df = pd.DataFrame.from_records(records)

In [24]:
# Cell 4: Save and display
long_df.to_csv("../dataset/metrics_by_example.csv", index=False)
display("Per-example evaluation metrics", long_df)
print("✅ Saved per-example metrics to '../dataset/metrics_by_example.csv'")

'Per-example evaluation metrics'

Unnamed: 0,url,language,gt,summary_pred,length,summarizer,rouge1,rouge2,rougeL,rougeLsum,bert_precision,bert_recall,bert_f1
0,https://simple.wikipedia.org/wiki/Space_explor...,english,Space exploration is a term which describes se...,The most important reasons are for scientific ...,3603,random,0.521739,0.400000,0.456522,0.456522,0.897356,0.889501,0.893411
1,https://en.wikipedia.org/wiki/Portal:Mathematics,english,Mathematics is the study of representing and r...,Mathematics is used throughout the world as an...,71678,random,0.206573,0.018957,0.122066,0.122066,0.790440,0.831968,0.810673
2,https://simple.wikipedia.org/wiki/History,english,History is the study of past events. A person ...,"The rulers of Medieval Europe, Ancient Rome an...",3659,random,0.283019,0.038462,0.188679,0.188679,0.824692,0.870355,0.846909
3,https://en.wikipedia.org/wiki/Outline_of_machi...,english,The following outline is provided as an overvi...,You can help by adding missing items with reli...,176774,random,0.228070,0.000000,0.087719,0.087719,0.831689,0.847245,0.839395
4,https://en.wikipedia.org/wiki/Category:Astroph...,english,Astrophysics is the branch of astronomy that d...,Astrophysics is the branch of astronomy that d...,2469,random,0.618705,0.554745,0.604317,0.604317,0.954133,0.916818,0.935104
...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,https://www.bbc.com/arabic/tv-and-radio-67957982,arabic,تربط منظمة الصحة العالمية الأخطاء الطبية الأكث...,Keywords : []\n- تربط منظمة الصحة العالمية الأ...,2458,fast,0.666667,0.000000,0.666667,0.666667,0.962029,0.950823,0.956393
773,https://www.bbc.com/arabic/articles/cj3rnmlkp5no,arabic,وتُعدّ أمراض القلب والأوعية الدموية السبب الأو...,Keywords : []\n- صدر الصورة، Getty Images\nالق...,12087,fast,0.285714,0.000000,0.285714,0.285714,0.861637,0.865236,0.863432
774,https://www.aljazeera.com/climate-crisis,arabic,"Six dead, thousands displaced as floods hit so...","Keywords : []\n- Climate Crisis\nSix dead, tho...",1319,fast,0.666667,0.660714,0.666667,0.666667,0.886631,0.976901,0.929580
775,https://www.bbc.com/arabic/articles/c0rgn20w5z2o,arabic,ينذر التغير المناخي بتداعيات قد تغيّر شكل كوكب...,Keywords : []\n- صدر الصورة، Getty Images\nينذ...,11747,fast,0.222222,0.000000,0.222222,0.222222,0.887173,0.924986,0.905685


✅ Saved per-example metrics to '../dataset/metrics_by_example.csv'


In [25]:
# Define which metric columns to average
long_copy = long_df.copy()

# drop arabic and chinese
# long_copy = long_copy[~long_copy["language"].isin(["ar", "zh"])]

metrics = [
    "rouge1", "rouge2", "rougeL", "rougeLsum",
    "bert_precision", "bert_recall", "bert_f1"
]

# 1) Average result per summarizer
avg_by_summarizer = (
    long_copy
    .groupby("summarizer")[metrics]
    .mean()
    .reset_index()
)

# 2) Average result per summarizer per language
avg_by_summarizer_lang = (
    long_copy
    .groupby(["summarizer", "language"])[metrics]
    .mean()
    .sort_values(by=["language"])
    .reset_index()
)

# (Optional) Display or save
print("=== Avg per summarizer ===")
display(avg_by_summarizer)

print("\n=== Avg per summarizer per language ===")
display(avg_by_summarizer_lang)


=== Avg per summarizer ===


Unnamed: 0,summarizer,rouge1,rouge2,rougeL,rougeLsum,bert_precision,bert_recall,bert_f1
0,fast,0.351732,0.237601,0.277552,0.277552,0.854768,0.88127,0.867517
1,lsa,0.360343,0.23173,0.296557,0.296557,0.858334,0.887643,0.872395
2,random,0.279983,0.159767,0.228026,0.228026,0.854923,0.864298,0.859209
3,text_rank,0.328629,0.212571,0.2738,0.2738,0.847339,0.884469,0.86508



=== Avg per summarizer per language ===


Unnamed: 0,summarizer,language,rouge1,rouge2,rougeL,rougeLsum,bert_precision,bert_recall,bert_f1
0,fast,arabic,0.361312,0.201687,0.358372,0.358372,0.896181,0.913422,0.904501
1,random,arabic,0.162884,0.060652,0.149879,0.149879,0.865739,0.861449,0.863246
2,text_rank,arabic,0.217467,0.086294,0.197122,0.197122,0.847097,0.88253,0.863873
3,lsa,arabic,0.220744,0.080777,0.20221,0.20221,0.849971,0.869089,0.858901
4,random,chinese,0.130883,0.076831,0.12493,0.12493,0.864816,0.871434,0.867957
5,lsa,chinese,0.182059,0.109524,0.174122,0.174122,0.85218,0.883592,0.867376
6,text_rank,chinese,0.165059,0.066667,0.156401,0.156401,0.851089,0.883354,0.866825
7,fast,chinese,0.128475,0.067233,0.128475,0.128475,0.829345,0.864974,0.8466
8,lsa,english,0.444315,0.306121,0.366422,0.366422,0.866107,0.896526,0.880721
9,fast,english,0.423898,0.295787,0.325237,0.325237,0.863065,0.885195,0.873688
