# Summarizer Evaluation Notebook

This notebook compares several summarizers (TF-IDF, Sumy TextRank, and a Gensim-style TextRank) on a subset of the training dataset.
Each generated summary is scored on Faithfulness, Relevance, Coherence, Conciseness, and Language Consistency
by a GPT-based `SummarizationJudge`.

## 1. Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

# Load environment variables (e.g., OPENAI_API_KEY)
load_dotenv()

# Import summarizers
from summarizers.light.tf_idf_summarizer import TFIDFSummarizer
from summarizers.light.sumy_summrizer import SumyTextRankSummarizer 
from summarizers.light.text_rank_summarizer import GensimStyleTextRankSummarizer

# Import the judge
from judge import SummarizationJudge, SummarizationScore

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dortenenboim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## 2. Configuration

In [2]:
# --- Data source ---
DATASET_PATH = 'dataset/train_val.csv'
TEXT_COLUMN = 'markdown'          # column holding the document text to summarize
LANGUAGE_COLUMN = 'language'      # column holding each document's language label

# --- Evaluation settings ---
N_SAMPLES = 20                    # number of rows to evaluate
MAX_SENTENCES_SUMMARY = 3         # forwarded to each summarizer as max_sentences
TARGET_LANGUAGES = [              # every language listed here gets at least one sample if present
    'english',
    'german',
    'arabic',
    'chinese',
    'spanish',
    'french',
]

## 3. Load Data

In [3]:
# Read the dataset from disk. A missing file is reported and replaced by an
# empty DataFrame so that the following cells can detect it and skip their work.
try:
    df_full = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Dataset file not found at {DATASET_PATH}")
    df_full = pd.DataFrame()
else:
    # Only reached when the file was read: report size and language labels.
    print(f"Dataset loaded successfully. Shape: {df_full.shape}")
    print(f"Available languages: {df_full[LANGUAGE_COLUMN].unique()}")

Dataset loaded successfully. Shape: (271, 11)
Available languages: ['english' 'german' 'arabic' 'chinese' 'spanish' 'french']


## 4. Sample Data

We select `N_SAMPLES` (20) rows from the dataset, ensuring at least one row from each of the `TARGET_LANGUAGES` that is present in the dataset.

In [4]:
# Fixed seed so the same rows are selected on every run.
SAMPLING_SEED = 42

# Build df_eval: one guaranteed row per available target language, then random
# rows from the rest of the dataset until N_SAMPLES rows are selected.
if not df_full.empty:
    # Compute the set of language labels once instead of once per target language.
    languages_in_dataset = set(df_full[LANGUAGE_COLUMN].unique())
    available_target_languages = [lang for lang in TARGET_LANGUAGES if lang in languages_in_dataset]
    print(f"Target languages available in dataset: {available_target_languages}")

    # Get one sample from each available target language
    sampled_dfs = [
        df_full[df_full[LANGUAGE_COLUMN] == lang].sample(n=1, random_state=SAMPLING_SEED)
        for lang in available_target_languages
    ]

    # pd.concat raises ValueError on an empty list, so when no target language is
    # present fall back to an empty slice of df_full (same columns, zero rows).
    df_sampled_langs = pd.concat(sampled_dfs) if sampled_dfs else df_full.iloc[0:0]

    remaining_samples_needed = N_SAMPLES - len(df_sampled_langs)

    if remaining_samples_needed > 0:
        # Get remaining samples from the rest of the dataset, excluding already sampled rows
        df_remaining_pool = df_full.drop(df_sampled_langs.index)
        if len(df_remaining_pool) >= remaining_samples_needed:
            df_additional_samples = df_remaining_pool.sample(
                n=remaining_samples_needed, random_state=SAMPLING_SEED
            )
            df_eval = pd.concat([df_sampled_langs, df_additional_samples])
        else:
            # If not enough unique samples left, take all available
            df_eval = pd.concat([df_sampled_langs, df_remaining_pool])
            print(f"Warning: Could only sample {len(df_eval)} rows due to data constraints.")
    elif N_SAMPLES < len(df_sampled_langs):
        # More target languages than N_SAMPLES: down-sample the per-language rows.
        # Note that in this case not every language can be represented.
        df_eval = df_sampled_langs.sample(n=N_SAMPLES, random_state=SAMPLING_SEED)
    else:
        # The per-language rows already amount to exactly N_SAMPLES.
        df_eval = df_sampled_langs

    print(f"Selected {len(df_eval)} samples for evaluation.")
    print("Language distribution in selected samples:")
    print(df_eval[LANGUAGE_COLUMN].value_counts())
else:
    print("Skipping sampling as dataset is empty.")
    df_eval = pd.DataFrame()

Target languages available in dataset: ['english', 'german', 'arabic', 'chinese', 'spanish', 'french']
Selected 20 samples for evaluation.
Language distribution in selected samples:
language
english    8
spanish    4
chinese    3
french     3
german     1
arabic     1
Name: count, dtype: int64


## 5. Initialize Summarizers and Judge

In [5]:
# Create one instance of every summarizer under evaluation.
tfidf_summarizer = TFIDFSummarizer()
sumy_summarizer = SumyTextRankSummarizer()
gensim_summarizer = GensimStyleTextRankSummarizer()

# Display name -> summarizer instance; the evaluation loop iterates over this mapping.
summarizers = dict([
    ("TF-IDF", tfidf_summarizer),
    ("SumyTextRank", sumy_summarizer),
    ("GensimTextRank", gensim_summarizer),
])

# Create the judge (assumes OPENAI_API_KEY is in .env or environment variables).
# On failure the error is reported and judge is set to None so later cells can skip.
try:
    judge = SummarizationJudge()
except Exception as e:
    print(f"Error initializing SummarizationJudge: {e}. Make sure OPENAI_API_KEY is set.")
    judge = None
else:
    print("SummarizationJudge initialized.")

SummarizationJudge initialized.


## 6. Evaluation Loop

Iterate through each row in the sampled dataset, apply each summarizer, and evaluate the results.

In [None]:
results = []

# Score fields recorded for every (sample, summarizer) pair.
_SCORE_FIELDS = ("faithfulness", "relevance", "coherence", "conciseness", "language_consistency")

# Scores recorded when a summarizer could not be run or raised an error.
_ZERO_SCORES = dict.fromkeys(_SCORE_FIELDS, 0)


def _build_result_row(sample_index, language, summarizer_name, original_text, summary, score_values):
    """Assemble one row of the results table.

    Args:
        sample_index: Index label of the sample in the evaluation DataFrame.
        language: Language label of the sample.
        summarizer_name: Display name of the summarizer that produced `summary`.
        original_text: Full source text; only its first 100 characters are stored.
        summary: Generated summary, or an "Error: ..." message on failure.
        score_values: Mapping with one value for each name in `_SCORE_FIELDS`.

    Returns:
        dict: A record suitable for `pd.DataFrame(results)`.
    """
    result_row = {
        "sample_original_index": sample_index,  # Store original index
        "language": language,
        "summarizer": summarizer_name,
        "original_text_preview": original_text[:100] + "...",
        "summary": summary,
    }
    for field in _SCORE_FIELDS:
        result_row[field] = score_values[field]
    return result_row


if not df_eval.empty and judge:

    for sample_index, row in df_eval.iterrows():
        raw_text = row[TEXT_COLUMN]
        # A missing value would become the literal string "nan" under str() and
        # slip past the emptiness check below, so map it to "" explicitly.
        original_text = "" if pd.isna(raw_text) else str(raw_text)
        language = str(row[LANGUAGE_COLUMN])

        print(f"\nProcessing sample from row {sample_index} (Language: {language})...")

        if not original_text.strip():
            print(f"Skipping sample from row {sample_index} due to empty original text.")
            continue

        for summarizer_name, summarizer_instance in summarizers.items():
            print(f"  Applying {summarizer_name}...")

            # A summarizer without a summarize() method cannot produce a summary.
            # Record the failure with zero scores instead of sending the
            # placeholder error text to the judge as if it were a summary.
            if not hasattr(summarizer_instance, 'summarize'):
                print(f"    Error: {summarizer_name} does not have a 'summarize' method or signature mismatch.")
                results.append(_build_result_row(
                    sample_index, language, summarizer_name, original_text,
                    "Error: Summarizer method issue.", _ZERO_SCORES,
                ))
                continue

            try:
                summary = summarizer_instance.summarize(
                    markdown=original_text,
                    language=language,
                    max_sentences=MAX_SENTENCES_SUMMARY
                )

                if not summary.strip():
                    # Empty output is not sent to the judge; it gets fixed scores.
                    print(f"    {summarizer_name} produced an empty summary.")
                    scores = SummarizationScore(faithfulness=1, relevance=1, coherence=1, conciseness=1, language_consistency=0)
                else:
                    print(f"    Evaluating {summarizer_name} summary...")
                    scores = judge.evaluate_summary(
                        original_markdown=original_text,
                        summary=summary,
                        language=language
                    )

                score_values = {field: getattr(scores, field) for field in _SCORE_FIELDS}
                results.append(_build_result_row(
                    sample_index, language, summarizer_name, original_text, summary, score_values,
                ))
                print(f"    Scores for {summarizer_name}: F={scores.faithfulness}, R={scores.relevance}, C={scores.coherence}, Con={scores.conciseness}, LC={scores.language_consistency}")

            except Exception as e:
                # Keep the run going: log the failure and record it with zero scores.
                print(f"    Error during summarization or evaluation with {summarizer_name} for sample {sample_index}: {e}")
                results.append(_build_result_row(
                    sample_index, language, summarizer_name, original_text,
                    f"Error: {e}", _ZERO_SCORES,
                ))
else:
    print("Skipping evaluation loop due to empty dataset or uninitialized judge.")


Processing sample from row 96 (Language: english)...
  Applying TF-IDF...
    Evaluating TF-IDF summary...
    Scores for TF-IDF: F=1, R=1, C=1, Con=1, LC=1
  Applying SumyTextRank...
    Evaluating SumyTextRank summary...
    Scores for SumyTextRank: F=2, R=2, C=2, Con=2, LC=1
  Applying GensimTextRank...
    Evaluating GensimTextRank summary...
    Scores for GensimTextRank: F=2, R=3, C=2, Con=2, LC=1

Processing sample from row 99 (Language: german)...
  Applying TF-IDF...
    Evaluating TF-IDF summary...
    Scores for TF-IDF: F=1, R=1, C=1, Con=1, LC=1
  Applying SumyTextRank...
    Evaluating SumyTextRank summary...
    Scores for SumyTextRank: F=2, R=2, C=2, Con=2, LC=1
  Applying GensimTextRank...
    Evaluating GensimTextRank summary...
    Scores for GensimTextRank: F=1, R=1, C=2, Con=2, LC=1

Processing sample from row 147 (Language: arabic)...
  Applying TF-IDF...
    Evaluating TF-IDF summary...
    Scores for TF-IDF: F=2, R=2, C=2, Con=2, LC=1
  Applying SumyTextRank...


## 7. Display Results

In [7]:
# Show the raw per-summary results, then the mean scores per summarizer and
# per (language, summarizer) pair.
if not results:
    print("No results to display.")
else:
    df_results = pd.DataFrame(results)
    print("\nEvaluation Results:")
    display(df_results)

    # Average scores per summarizer.
    # Score columns are coerced to numeric first; unparseable values become NaN.
    score_cols = ['faithfulness', 'relevance', 'coherence', 'conciseness', 'language_consistency']
    df_results = df_results.assign(
        **{col: pd.to_numeric(df_results[col], errors='coerce') for col in score_cols}
    )

    by_summarizer = df_results.groupby('summarizer')
    avg_scores = by_summarizer[score_cols].mean()
    print("\nAverage Scores per Summarizer:")
    display(avg_scores)

    # Average scores per language and summarizer.
    by_language_and_summarizer = df_results.groupby(['language', 'summarizer'])
    avg_scores_lang = by_language_and_summarizer[score_cols].mean()
    print("\nAverage Scores per Language and Summarizer:")
    display(avg_scores_lang)


Evaluation Results:


Unnamed: 0,sample_original_index,language,summarizer,original_text_preview,summary,faithfulness,relevance,coherence,conciseness,language_consistency
0,96,english,TF-IDF,![](/static/images/icons/wikipedia.png)\n![Wik...,So I advocated that we not put anything in. It...,1,1,1,1,1
1,96,english,SumyTextRank,![](/static/images/icons/wikipedia.png)\n![Wik...,COSMOS was launched in 2006 as the largest pro...,2,2,2,2,1
2,96,english,GensimTextRank,![](/static/images/icons/wikipedia.png)\n![Wik...,| | | Names | HST Hubble | | | | | Mission typ...,2,3,2,2,1
3,99,german,TF-IDF,![](https://en.wikipedia.org/static/images/don...,Geometry (englisch für ) steht für: Musikalben...,1,1,1,1,1
4,99,german,SumyTextRank,![](https://en.wikipedia.org/static/images/don...,Geometry (englisch für ) steht für: Musikalben...,2,2,2,2,1
5,99,german,GensimTextRank,![](https://en.wikipedia.org/static/images/don...,Geometry (englisch für ) steht für: Musikalben...,1,1,2,2,1
6,147,arabic,TF-IDF,![](https://en.wikipedia.org/static/images/don...,، عبر الحد من إنبعاث في الغلاف الجوي في الأعوا...,2,2,2,2,1
7,147,arabic,SumyTextRank,![](https://en.wikipedia.org/static/images/don...,Error: arabic tokenizer requires pyarabic. Ple...,0,0,0,0,0
8,147,arabic,GensimTextRank,![](https://en.wikipedia.org/static/images/don...,وتستخدم تقنيات توليد الطاقة التي تعتمد على الر...,3,3,4,3,1
9,90,chinese,TF-IDF,![](/static/images/icons/wikipedia.png)\n![维基百...,,1,1,1,1,0



Average Scores per Summarizer:


Unnamed: 0_level_0,faithfulness,relevance,coherence,conciseness,language_consistency
summarizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GensimTextRank,1.9,2.1,2.5,2.35,0.85
SumyTextRank,1.45,1.55,1.65,1.55,0.8
TF-IDF,1.6,1.6,1.7,1.8,0.85



Average Scores per Language and Summarizer:


Unnamed: 0_level_0,Unnamed: 1_level_0,faithfulness,relevance,coherence,conciseness,language_consistency
language,summarizer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arabic,GensimTextRank,3.0,3.0,4.0,3.0,1.0
arabic,SumyTextRank,0.0,0.0,0.0,0.0,0.0
arabic,TF-IDF,2.0,2.0,2.0,2.0,1.0
chinese,GensimTextRank,2.0,2.333333,2.333333,2.0,0.666667
chinese,SumyTextRank,0.0,0.0,0.0,0.0,0.0
chinese,TF-IDF,2.333333,2.333333,2.333333,2.333333,0.333333
english,GensimTextRank,1.875,2.125,2.625,2.5,1.0
english,SumyTextRank,1.625,1.75,2.0,1.75,1.0
english,TF-IDF,1.375,1.375,1.5,1.75,1.0
french,GensimTextRank,2.0,2.333333,2.666667,2.666667,0.666667


## End of Evaluation