In [80]:
import sys
sys.path.append("..")

In [81]:
import pandas as pd
from IPython.display import display
import evaluate

In [82]:
df = pd.read_csv("../dataset/results.csv")
df.columns

Index(['url', 'raw_markdown', 'language', 'domain', 'source', 'length',
       'word_count', 'cleaned_markdown', 'summary_gt', 'summary_gt_extractive',
       'summary_random', 'summary_lsa', 'summary_text_rank',
       'summary_spacy_textrank', 'summary_sentence_transformer_mpnet',
       'summary_sentence_transformer_distiluse', 'summary_claude_haiku_3_5',
       'summary_claude_sonnet_4', 'summary_random_latency',
       'summary_lsa_latency', 'summary_text_rank_latency',
       'summary_spacy_textrank_latency',
       'summary_sentence_transformer_mpnet_latency',
       'summary_sentence_transformer_distiluse_latency',
       'summary_claude_haiku_3_5_latency', 'summary_claude_sonnet_4_latency'],
      dtype='object')

In [83]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [84]:
# 3. Preprocessing function
def preprocess(text: str) -> str:
    return " ".join(text.lower().split())

In [85]:
# Cell 3: Compute per-output metrics
summarizers = ['random', 'lsa', 'text_rank', 'spacy_textrank', "sentence_transformer_mpnet", 
               "sentence_transformer_distiluse", "claude_haiku_3_5", "claude_sonnet_4"]


records = []

for name in summarizers:
    print(f"Processing summarizer: {name}")
    pred_col = f"summary_{name}"
    
    # gt_col = "summary_gt"
    gt_col = "summary_gt_extractive"
    
    # Drop NaNs for evaluation
    eval_df = df.dropna(subset=[pred_col, gt_col])
    refs  = eval_df[gt_col].map(preprocess).tolist()
    preds = eval_df[pred_col].map(preprocess).tolist()
    
    # Compute BERTScore once per summarizer
    bert_scores = bertscore.compute(
        predictions=preds,
        references=refs,
        model_type="xlm-roberta-base",
        idf=False,
        batch_size=16,
    )
    
    # Now record metrics for each example
    for i, idx in enumerate(eval_df.index):
        # ROUGE on the single example
        rouge_score = rouge.compute(
            predictions=[preds[i]],
            references=[refs[i]],
            use_stemmer=True
        )
        
        records.append({
            "url":             df.loc[idx, "url"],
            "language":        df.loc[idx, "language"],
            "gt":              df.loc[idx, gt_col],
            "summary_pred":    df.loc[idx, pred_col],
            "length":          df.loc[idx, "length"],
            "summarizer":      name,
            "rouge1":          rouge_score["rouge1"],
            "rouge2":          rouge_score["rouge2"],
            "rougeL":          rouge_score["rougeL"],
            "rougeLsum":       rouge_score["rougeLsum"],
            "bert_precision":  bert_scores["precision"][i],
            "bert_recall":     bert_scores["recall"][i],
            "bert_f1":         bert_scores["f1"][i],
        })


Processing summarizer: random
Processing summarizer: lsa
Processing summarizer: text_rank
Processing summarizer: spacy_textrank
Processing summarizer: sentence_transformer_mpnet
Processing summarizer: sentence_transformer_distiluse
Processing summarizer: claude_haiku_3_5
Processing summarizer: claude_sonnet_4


In [86]:
# Build the per-example metrics DataFrame
long_df = pd.DataFrame.from_records(records)

In [87]:
# Cell 4: Save and display
long_df.to_csv("../dataset/metrics_by_example.csv", index=False)
display("Per-example evaluation metrics", long_df)
print("✅ Saved per-example metrics to '../dataset/metrics_by_example.csv'")

'Per-example evaluation metrics'

Unnamed: 0,url,language,gt,summary_pred,length,summarizer,rouge1,rouge2,rougeL,rougeLsum,bert_precision,bert_recall,bert_f1
0,https://simple.wikipedia.org/wiki/Space_explor...,english,Space exploration is a term which describes se...,There are many reasons for space exploration.\...,3603,random,0.261905,0.024390,0.119048,0.119048,0.863439,0.857694,0.860557
1,https://en.wikipedia.org/wiki/Portal:Mathematics,english,Mathematics is the study of representing and r...,Portal topics - (Random portal) Mathematics is...,71678,random,0.422360,0.238994,0.322981,0.322981,0.875098,0.875510,0.875304
2,https://simple.wikipedia.org/wiki/History,english,History is the study of past events. A person ...,History is the study of past events.\nA person...,3659,random,0.452381,0.268293,0.404762,0.404762,0.871200,0.938371,0.903539
3,https://en.wikipedia.org/wiki/Outline_of_machi...,english,The following outline is provided as an overvi...,[3] These algorithms operate by building a mod...,176774,random,0.341880,0.034783,0.153846,0.153846,0.834351,0.861427,0.847673
4,https://en.wikipedia.org/wiki/Category:Astroph...,english,Astrophysics is the branch of astronomy that d...,Astrophysics is the branch of astronomy that d...,2469,random,0.938547,0.937853,0.938547,0.938547,0.981122,0.998952,0.989957
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1564,https://www.bbc.com/arabic/tv-and-radio-67957982,arabic,تربط منظمة الصحة العالمية الأخطاء الطبية الأكث...,تربط منظمة الصحة العالمية الأخطاء الطبية الأكث...,2458,claude_sonnet_4,0.000000,0.000000,0.000000,0.000000,0.966060,0.846090,0.902103
1565,https://www.bbc.com/arabic/articles/cj3rnmlkp5no,arabic,وتُعدّ أمراض القلب والأوعية الدموية السبب الأو...,وتُعدّ أمراض القلب والأوعية الدموية السبب الأو...,12087,claude_sonnet_4,0.500000,0.000000,0.500000,0.500000,0.969366,0.851170,0.906431
1566,https://www.aljazeera.com/climate-crisis,arabic,"Six dead, thousands displaced as floods hit so...",'Exceptionally large floods' swept through Gui...,1319,claude_sonnet_4,0.865672,0.861538,0.865672,0.865672,0.976410,0.945058,0.960478
1567,https://www.bbc.com/arabic/articles/c0rgn20w5z2o,arabic,ينذر التغير المناخي بتداعيات قد تغيّر شكل كوكب...,ينذر التغير المناخي بتداعيات قد تغيّر شكل كوكب...,11747,claude_sonnet_4,0.000000,0.000000,0.000000,0.000000,0.983837,0.870040,0.923446


✅ Saved per-example metrics to '../dataset/metrics_by_example.csv'


In [94]:
# Define which metric columns to average
long_copy = long_df.copy()

# drop arabic and chinese
# long_copy = long_copy[~long_copy["language"].isin(["ar", "zh"])]

metrics = [
    "rouge1", "rouge2", "rougeL", "rougeLsum",
    "bert_precision", "bert_recall", "bert_f1"
]

# 1) Average result per summarizer
avg_by_summarizer = (
    long_copy
    .groupby("summarizer")[metrics]
    .mean()
    .reset_index()
)

# 2) Average result per summarizer per language
avg_by_summarizer_lang = (
    long_copy
    .groupby(["summarizer", "language"])[metrics]
    .mean()
    .sort_values(by=["language"])
    .reset_index()
)

# (Optional) Display or save
print("=== Avg per summarizer ===")
display(avg_by_summarizer.sort_values(by="rougeLsum", ascending=False))

print("\n=== Avg per summarizer per language ===")
display(avg_by_summarizer_lang)


=== Avg per summarizer ===


Unnamed: 0,summarizer,rouge1,rouge2,rougeL,rougeLsum,bert_precision,bert_recall,bert_f1
1,claude_sonnet_4,0.576508,0.512536,0.551489,0.551489,0.940695,0.911053,0.925308
0,claude_haiku_3_5,0.503792,0.420164,0.468924,0.468924,0.926781,0.902579,0.914246
5,sentence_transformer_mpnet,0.42348,0.324303,0.383698,0.383698,0.881709,0.89945,0.889948
4,sentence_transformer_distiluse,0.422092,0.317798,0.379869,0.379869,0.877966,0.899668,0.888166
6,spacy_textrank,0.375064,0.255934,0.297274,0.297274,0.865142,0.88422,0.874209
2,lsa,0.355671,0.226742,0.29159,0.29159,0.857639,0.886578,0.871518
7,text_rank,0.328629,0.212571,0.2738,0.2738,0.847339,0.884469,0.86508
3,random,0.278959,0.147063,0.221238,0.221238,0.856994,0.864125,0.860211



=== Avg per summarizer per language ===


Unnamed: 0,summarizer,language,rouge1,rouge2,rougeL,rougeLsum,bert_precision,bert_recall,bert_f1
0,claude_haiku_3_5,arabic,0.417349,0.285136,0.416288,0.416288,0.956136,0.891755,0.922375
1,sentence_transformer_distiluse,arabic,0.359172,0.1545,0.352418,0.352418,0.900554,0.926784,0.913017
2,lsa,arabic,0.220744,0.080777,0.20221,0.20221,0.850056,0.869241,0.859017
3,text_rank,arabic,0.217467,0.086294,0.197122,0.197122,0.847097,0.88253,0.863873
4,claude_sonnet_4,arabic,0.309403,0.223218,0.309403,0.309403,0.939056,0.875484,0.905526
5,sentence_transformer_mpnet,arabic,0.308136,0.188822,0.307181,0.307181,0.903485,0.926702,0.914523
6,spacy_textrank,arabic,0.426925,0.21057,0.423967,0.423967,0.914309,0.918727,0.916239
7,random,arabic,0.247815,0.100393,0.23513,0.23513,0.876768,0.851894,0.863857
8,claude_sonnet_4,chinese,0.271154,0.202733,0.271154,0.271154,0.91429,0.871221,0.891857
9,sentence_transformer_mpnet,chinese,0.209314,0.117125,0.200264,0.200264,0.866315,0.880316,0.872864


In [93]:
# for each summarizer, print average latency
for summarizer in summarizers:
    avg_latency = df[f"summary_{summarizer}_latency"].mean()
    print(f"Average latency for {summarizer}: {avg_latency:.2f} seconds")

Average latency for random: 0.01 seconds
Average latency for lsa: 0.04 seconds
Average latency for text_rank: 0.03 seconds
Average latency for spacy_textrank: 0.20 seconds
Average latency for sentence_transformer_mpnet: 4.16 seconds
Average latency for sentence_transformer_distiluse: 0.25 seconds
Average latency for claude_haiku_3_5: 2.47 seconds
Average latency for claude_sonnet_4: 3.55 seconds


In [91]:
df.columns

Index(['url', 'raw_markdown', 'language', 'domain', 'source', 'length',
       'word_count', 'cleaned_markdown', 'summary_gt', 'summary_gt_extractive',
       'summary_random', 'summary_lsa', 'summary_text_rank',
       'summary_spacy_textrank', 'summary_sentence_transformer_mpnet',
       'summary_sentence_transformer_distiluse', 'summary_claude_haiku_3_5',
       'summary_claude_sonnet_4', 'summary_random_latency',
       'summary_lsa_latency', 'summary_text_rank_latency',
       'summary_spacy_textrank_latency',
       'summary_sentence_transformer_mpnet_latency',
       'summary_sentence_transformer_distiluse_latency',
       'summary_claude_haiku_3_5_latency', 'summary_claude_sonnet_4_latency'],
      dtype='object')