# Summarizer Evaluation Notebook

This notebook evaluates different summarization models on a subset of the training dataset.
The evaluation metrics are Faithfulness, Relevance, Coherence, Conciseness, and Language Consistency,
provided by a GPT-based `SummarizationJudge`.

## 1. Imports and Setup

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
import time

# Load environment variables (e.g., OPENAI_API_KEY)
load_dotenv()

# Import summarizers
from summarizers.light import FastSummarizer, SumyTextRankSummarizer, TFIDFSummarizer


# Import the judge
from judge import SummarizationJudge, SummarizationScore

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dortenenboim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


c:\repos\search-summaries\.venv\Lib\site-packages


## 2. Configuration

In [2]:
DATASET_PATH = 'dataset/unified_dataset.csv'
TEXT_COLUMN = 'text'
LANGUAGE_COLUMN = 'language'
MAX_SENTENCES_SUMMARY = 3
N_SAMPLES = 10
TARGET_LANGUAGES = ['english', 'german', 'arabic', 'chinese', 'spanish', 'french']

## 3. Load Data

In [3]:
try:
    df_full = pd.read_csv(DATASET_PATH)
    print(f"Dataset loaded successfully. Shape: {df_full.shape}")
    print(f"Available languages: {df_full[LANGUAGE_COLUMN].unique()}")
except FileNotFoundError:
    print(f"Error: Dataset file not found at {DATASET_PATH}")
    df_full = pd.DataFrame() # Create empty df to avoid further errors

Dataset loaded successfully. Shape: (202, 7)
Available languages: ['english' 'spanish' 'french' 'german' 'chinese' 'arabic']


## 4. Sample Data

We need to select `N_SAMPLES` (20) from the dataset, ensuring at least one sample from each of the `TARGET_LANGUAGES`.

In [4]:
# Ensure all target languages are present in the dataset
available_target_languages = [lang for lang in TARGET_LANGUAGES if lang in df_full[LANGUAGE_COLUMN].unique()]
print(f"Target languages available in dataset: {available_target_languages}")

sampled_dfs = []
# Get at least one sample from each available target language
for lang in available_target_languages:
    lang_sample = df_full[df_full[LANGUAGE_COLUMN] == lang].sample(n=1, random_state=42)
    sampled_dfs.append(lang_sample)

df_sampled_langs = pd.concat(sampled_dfs)

remaining_samples_needed = N_SAMPLES - len(df_sampled_langs)

if remaining_samples_needed > 0:
    # Get remaining samples from the rest of the dataset, excluding already sampled rows
    df_remaining_pool = df_full.drop(df_sampled_langs.index)
    if len(df_remaining_pool) >= remaining_samples_needed:
        df_additional_samples = df_remaining_pool.sample(n=remaining_samples_needed, random_state=42)
        df_eval = pd.concat([df_sampled_langs, df_additional_samples])
    else:
        # If not enough unique samples left, take all available
        df_eval = pd.concat([df_sampled_langs, df_remaining_pool])
        print(f"Warning: Could only sample {len(df_eval)} rows due to data constraints.")
elif N_SAMPLES < len(df_sampled_langs): # If we sampled more than N_SAMPLES because we have many target_languages
    df_eval = df_sampled_langs.sample(n=N_SAMPLES, random_state=42)
else: # Exactly N_SAMPLES were sampled, or N_SAMPLES == len(available_target_languages)
    df_eval = df_sampled_langs

print(f"Selected {len(df_eval)} samples for evaluation.")
print("Language distribution in selected samples:")
print(df_eval[LANGUAGE_COLUMN].value_counts())

Target languages available in dataset: ['english', 'german', 'arabic', 'chinese', 'spanish', 'french']
Selected 10 samples for evaluation.
Language distribution in selected samples:
language
english    3
german     2
spanish    2
arabic     1
chinese    1
french     1
Name: count, dtype: int64


## 5. Initialize Summarizers and Judge

In [5]:
# Initialize summarizers
fast_summarizer = FastSummarizer()
print("HybridSummarizer initialized.")

sumy_summarizer = SumyTextRankSummarizer()

tf_idf_summarizer = TFIDFSummarizer()
summarizers = {
    "fast": fast_summarizer,
    "summy": sumy_summarizer,
    "tf-idf": tf_idf_summarizer,
}

# Initialize the judge
# Assumes OPENAI_API_KEY is in .env or environment variables
try:
    judge = SummarizationJudge()
    print("SummarizationJudge initialized.")
except Exception as e:
    print(f"Error initializing SummarizationJudge: {e}. Make sure OPENAI_API_KEY is set.")
    judge = None

HybridSummarizer initialized.
SummarizationJudge initialized.


## 6. Evaluation Loop

Iterate through each row in the sampled dataset, apply each summarizer, and evaluate the results.

In [7]:
results = []


if not df_eval.empty and judge:
    
    for index, row in df_eval.iterrows():
        original_text = str(row[TEXT_COLUMN])
        language = str(row[LANGUAGE_COLUMN])
        
        print(f"\nProcessing sample from row {row.name} (Language: {language})...") # Using row.name for original index

        if not original_text.strip():
            print(f"Skipping sample from row {row.name} due to empty original text.")
            continue

        for summarizer_name, summarizer_instance in summarizers.items():
            print(f"  Applying {summarizer_name}...")
            try:
                start_time = time.time()
                # Ensure the summarizer's summarize method matches expected signature
                if hasattr(summarizer_instance, 'summarize'):
                    summary = summarizer_instance.summarize(
                        text=original_text, 
                        language=language, 
                        max_sentences=MAX_SENTENCES_SUMMARY
                    )
                else:
                    print(f"    Error: {summarizer_name} does not have a 'summarize' method or signature mismatch.")
                    summary = "Error: Summarizer method issue."
                
                end_time = time.time()
                latency_seconds = end_time - start_time
                print(f"    {summarizer_name} latency: {latency_seconds:.4f} seconds")

                if not summary.strip():
                    print(f"    {summarizer_name} produced an empty summary.")
                    scores = SummarizationScore(faithfulness=1, relevance=1, coherence=1, conciseness=1, language_consistency=0) 
                else:
                    print(f"    Evaluating {summarizer_name} summary...")
                    scores = judge.evaluate_summary(
                        original_markdown=original_text,
                        summary=summary,
                        language=language
                    )
                
                results.append({
                    "sample_original_index": row.name, # Store original index
                    "url": row['url'],
                    "language": language,
                    "summarizer": summarizer_name,
                    "original_text_preview": original_text[:100] + "...", 
                    "original_text_length": len(original_text),
                    "summary": summary,
                    "summary_length": len(summary),
                    "faithfulness": scores.faithfulness,
                    "relevance": scores.relevance,
                    "coherence": scores.coherence,
                    "conciseness": scores.conciseness,
                    "language_consistency": int(scores.language_consistency),
                    "latency_seconds": latency_seconds
                })
                print(f"    Scores for {summarizer_name}: F={scores.faithfulness}, R={scores.relevance}, C={scores.coherence}, Con={scores.conciseness}, LC={scores.language_consistency}")

            except Exception as e:
                print(f"    Error during summarization or evaluation with {summarizer_name} for sample {row.name}: {e}")
                results.append({
                    "sample_original_index": row.name,
                    "language": language,
                    "summarizer": summarizer_name,
                    "original_text_preview": original_text[:100] + "...",
                    "original_text_length": len(original_text),
                    "summary": f"Error: {e}",
                    "faithfulness": 0,
                    "relevance": 0,
                    "coherence": 0,
                    "conciseness": 0,
                    "language_consistency": 0,
                    "latency_seconds": -1
                })
else:
    print("Skipping evaluation loop due to empty dataset or uninitialized judge.")


Processing sample from row 40 (Language: english)...
  Applying fast...
    fast latency: 0.2677 seconds
    Evaluating fast summary...
    Scores for fast: F=3, R=4, C=4, Con=4, LC=True
  Applying summy...
    summy latency: 0.0893 seconds
    Evaluating summy summary...
    Scores for summy: F=4, R=4, C=4, Con=4, LC=True
  Applying tf-idf...
    tf-idf latency: 0.0103 seconds
    Evaluating tf-idf summary...
    Scores for tf-idf: F=1, R=1, C=1, Con=1, LC=True

Processing sample from row 149 (Language: german)...
  Applying fast...
    fast latency: 0.1789 seconds
    Evaluating fast summary...
    Scores for fast: F=4, R=4, C=4, Con=4, LC=True
  Applying summy...
    summy latency: 0.0456 seconds
    Evaluating summy summary...
    Scores for summy: F=4, R=4, C=4, Con=4, LC=True
  Applying tf-idf...
    tf-idf latency: 0.0090 seconds
    Evaluating tf-idf summary...
    Scores for tf-idf: F=1, R=1, C=1, Con=1, LC=True

Processing sample from row 200 (Language: arabic)...
  Applying

## 7. Display Results

In [9]:
df_results = pd.DataFrame(results)
print("\nEvaluation Results:")
display(df_results) 

# %% [markdown]
# ### Average Scores per Summarizer

# %%
# Calculate average scores, ensuring numeric conversion for score columns
score_cols = ['faithfulness', 'relevance', 'coherence', 'conciseness', 'language_consistency', 'latency_seconds']
for col in score_cols:
    df_results[col] = pd.to_numeric(df_results[col], errors='coerce')

avg_scores = df_results.groupby('summarizer')[score_cols].mean()
print("\nAverage Scores per Summarizer:")
display(avg_scores)

# %% [markdown]
# ### Average Scores per Language

# %%
avg_scores_lang = df_results.groupby(['language', 'summarizer'])[score_cols].mean()
print("\nAverage Scores per Language and Summarizer:")
display(avg_scores_lang)

# summary_length per summarizer
avg_summary_length = df_results.groupby('summarizer')['summary_length'].mean()
print("\nAverage Summary Length per Summarizer:")
display(avg_summary_length)
    


Evaluation Results:


Unnamed: 0,sample_original_index,url,language,summarizer,original_text_preview,original_text_length,summary,summary_length,faithfulness,relevance,coherence,conciseness,language_consistency,latency_seconds
0,40,https://www.bbc.com/news/business-57253947,english,fast,Why electric cars will take over sooner than y...,10217,"Keywords : [electric cars, new electric car, T...",1048,3,4,4,4,1,0.267724
1,40,https://www.bbc.com/news/business-57253947,english,summy,Why electric cars will take over sooner than y...,10217,But the sea-change in performance Mr Willson h...,584,4,4,4,4,1,0.089262
2,40,https://www.bbc.com/news/business-57253947,english,tf-idf,Why electric cars will take over sooner than y...,10217,"So did steam engines and printing presses. ""It...",125,1,1,1,1,1,0.010286
3,149,https://www.sueddeutsche.de/politik/cybersiche...,german,fast,Cybersicherheit: Im Visier von Staatshackern u...,7919,"Keywords : [über das die Angreifer, Die Gruppe...",764,4,4,4,4,1,0.178905
4,149,https://www.sueddeutsche.de/politik/cybersiche...,german,summy,Cybersicherheit: Im Visier von Staatshackern u...,7919,Jetzt wollen FBI und Co. es zumindest geschaff...,794,4,4,4,4,1,0.045569
5,149,https://www.sueddeutsche.de/politik/cybersiche...,german,tf-idf,Cybersicherheit: Im Visier von Staatshackern u...,7919,So dramatisch es klingen mag: praktisch jeder....,93,1,1,1,1,1,0.009005
6,200,https://www.alarabiya.net/technology/ai,arabic,fast,أهم وآخر أخبار الذكاء الاصطناعي| العربية\n.\n....,5826,"Keywords : [الاصطناعي منذ, الاصطناعي محل قلقال...",1329,3,3,2,2,1,0.156327
7,200,https://www.alarabiya.net/technology/ai,arabic,summy,أهم وآخر أخبار الذكاء الاصطناعي| العربية\n.\n....,5826,السبت 21 شوال 1446 هـ - 19 أبريل 2025 الحدث en...,5628,3,3,3,2,1,0.030783
8,200,https://www.alarabiya.net/technology/ai,arabic,tf-idf,أهم وآخر أخبار الذكاء الاصطناعي| العربية\n.\n....,5826,السبت 21 شوال 1446 هـ - 19 أبريل 2025 الحدث en...,4215,3,3,3,3,1,0.008437
9,168,http://www.xinhuanet.com/talking/20240904/0599...,chinese,fast,新潮澎湃看中国｜技术创新引领新能源汽车产业不断升级\n\n新潮澎湃看中国｜技术创新引领新能源...,1097,"Keywords : [近四十年深耕 见证新能源汽车从无到有, 新潮澎湃看中国｜技术创新引领...",1162,4,4,4,4,1,0.022407



Average Scores per Summarizer:


Unnamed: 0_level_0,faithfulness,relevance,coherence,conciseness,language_consistency,latency_seconds
summarizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fast,3.4,3.5,3.5,3.4,1.0,0.251515
summy,3.3,3.3,3.3,3.3,1.0,0.059804
tf-idf,2.0,2.0,2.3,2.4,1.0,0.009494



Average Scores per Language and Summarizer:


Unnamed: 0_level_0,Unnamed: 1_level_0,faithfulness,relevance,coherence,conciseness,language_consistency,latency_seconds
language,summarizer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
arabic,fast,3.0,3.0,2.0,2.0,1.0,0.156327
arabic,summy,3.0,3.0,3.0,2.0,1.0,0.030783
arabic,tf-idf,3.0,3.0,3.0,3.0,1.0,0.008437
chinese,fast,4.0,4.0,4.0,4.0,1.0,0.022407
chinese,summy,4.0,4.0,4.0,4.0,1.0,0.007156
chinese,tf-idf,4.0,4.0,4.0,4.0,1.0,0.002012
english,fast,3.333333,3.666667,3.666667,3.666667,1.0,0.226958
english,summy,3.0,2.666667,2.666667,3.0,1.0,0.066505
english,tf-idf,1.666667,1.666667,2.0,2.0,1.0,0.012478
french,fast,3.0,3.0,3.0,3.0,1.0,1.000808



Average Summary Length per Summarizer:


summarizer
fast      1175.8
summy     1707.0
tf-idf     803.4
Name: summary_length, dtype: float64

In [None]:
# save results to CSV
output_path = 'evaluation_results.csv'
df_results.to_csv(output_path, index=False)

PermissionError: [Errno 13] Permission denied: 'evaluation_results.csv'

## End of Evaluation