In [1]:
# =========================
# BBC News — cleaned HF dataset (capped at 2200 rows)
# =========================
# Preprocessing:
# - Unicode normalization & basic cleaning
# - Lowercasing
# - Duplicate removal (exact duplicates on article+highlights)
# - Tokenization (NLTK word_tokenize)
# - Stopword removal (NLTK)
# - Lemmatization (WordNet, POS-aware)
# Columns preserved: 'article', 'highlights'
# Extra columns (for inspection): 'article_tokens', 'highlights_tokens'
# =========================

import os
import re
import unicodedata
import warnings
import pandas as pd
from tqdm.auto import tqdm

# HuggingFace datasets
from datasets import Dataset, DatasetDict

# NLTK setup
import nltk
for pkg in [
    "punkt", "stopwords", "wordnet", "omw-1.4",
    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"
]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        warnings.warn(f"NLTK download warning for {pkg}: {e}")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# -----------------------------
# Paths / I/O
# -----------------------------
CSV_PATH = "/kaggle/input/bbc-news-dataset/bbc_news_summary.csv"

# -----------------------------
# Helpers
# -----------------------------
_STOPWORDS = set(stopwords.words("english"))
_LEMMA = WordNetLemmatizer()

def _to_wordnet_pos(tag: str):
    if tag.startswith('J'): return 'a'
    if tag.startswith('V'): return 'v'
    if tag.startswith('N'): return 'n'
    if tag.startswith('R'): return 'r'
    return 'n'

def _normalize_unicode(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u200b", " ").replace("\ufeff", " ")
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\s*\n\s*", "\n", s)
    return s.strip()

def _basic_clean(s: str) -> str:
    s = s.lower()
    s = (s.replace("“", '"').replace("”", '"')
           .replace("’", "'").replace("‘", "'")
           .replace("—", "-").replace("–", "-"))
    # drop control chars except newlines
    s = "".join(ch for ch in s if ch == "\n" or unicodedata.category(ch)[0] != "C")
    return s.strip()

def _tokenize_lemmatize_stop(text: str):
    tokens = word_tokenize(text)
    # Robust POS tag fallback
    try:
        tagged = pos_tag(tokens, lang="eng")
    except LookupError:
        try:
            nltk.download("averaged_perceptron_tagger_eng", quiet=True)
            tagged = pos_tag(tokens, lang="eng")
        except Exception:
            nltk.download("averaged_perceptron_tagger", quiet=True)
            tagged = pos_tag(tokens)

    cleaned_tokens = []
    for tok, tg in tagged:
        # keep only alnum-ish tokens
        if not re.search(r"[A-Za-z0-9]", tok):
            continue
        if tok in _STOPWORDS:
            continue
        wn_pos = _to_wordnet_pos(tg)
        lemma = _LEMMA.lemmatize(tok, pos=wn_pos)
        cleaned_tokens.append(lemma)

    cleaned_text = " ".join(cleaned_tokens)
    return cleaned_text, cleaned_tokens

def preprocess_text(text: str):
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    text = _normalize_unicode(text)
    text = _basic_clean(text)
    cleaned_text, tokens = _tokenize_lemmatize_stop(text)
    # if aggressive cleaning empties text, fall back to lightly cleaned text
    if not cleaned_text:
        cleaned_text = text
        tokens = word_tokenize(text) if text else []
    return cleaned_text, tokens

# -----------------------------
# Load & validate
# -----------------------------
df = pd.read_csv(CSV_PATH)

# Fix column names if needed
df = df.rename(columns={"Articles": "article", "Summaries": "highlights"})
expected_cols = {"article", "highlights"}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"CSV missing required columns {missing}. Found: {list(df.columns)}")

# -----------------------------
# Clean & filter (pre-limit)
# -----------------------------
for col in ["article", "highlights"]:
    df[col] = df[col].astype(str).map(lambda s: " ".join(str(s).split()))

df.dropna(subset=["article", "highlights"], inplace=True)
df = df[df["article"].str.len() > 30]
df = df[df["highlights"].str.len() > 10]

# Drop exact duplicates
before = len(df)
df = df.drop_duplicates(subset=["article", "highlights"]).reset_index(drop=True)
after = len(df)
print(f"[INFO] Dropped {before - after} duplicate rows.")
print(f"[INFO] Rows after basic cleaning & dedup: {len(df)}")

# -----------------------------
# Hard cap at 2200 rows (AFTER cleaning, BEFORE preprocessing)
# -----------------------------
TARGET_N = 2200
n_available = len(df)

if n_available >= TARGET_N:
    df = df.sample(n=TARGET_N, random_state=42).reset_index(drop=True)
else:
    # shuffle all if fewer than target
    df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

print(f"[INFO] Using {len(df)} rows for preprocessing (target {TARGET_N}).")

# -----------------------------
# Preprocess
# -----------------------------
articles_clean, articles_tokens = [], []
highlights_clean, highlights_tokens = [], []

print("[INFO] Preprocessing texts...")
for a, h in tqdm(zip(df["article"].astype(str), df["highlights"].astype(str)),
                 total=len(df), leave=False):
    a_clean, a_tok = preprocess_text(a)
    h_clean, h_tok = preprocess_text(h)
    articles_clean.append(a_clean)
    articles_tokens.append(a_tok)
    highlights_clean.append(h_clean)
    highlights_tokens.append(h_tok)

df["article"] = articles_clean
df["highlights"] = highlights_clean
df["article_tokens"] = articles_tokens
df["highlights_tokens"] = highlights_tokens

# Ensure non-empty after preprocessing
df = df[(df["article"].str.len() > 0) & (df["highlights"].str.len() > 0)].reset_index(drop=True)
print(f"[INFO] Final row count after preprocessing: {len(df)}")

# -----------------------------
# HuggingFace Dataset
# -----------------------------
hf_all = Dataset.from_pandas(df)
split = hf_all.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({"train": split["train"], "test": split["test"]})

print(dataset)
print(f"Features: {dataset['train'].column_names}")
print("=" * 100)

# -----------------------------
# Show samples
# -----------------------------
for i in range(min(3, len(dataset["train"]))):
    sample = dataset["train"][i]
    art = sample["article"]; summ = sample["highlights"]
    print(f"Sample {i+1}")
    print("-" * 100)
    print(f"Article (len {len(art)}):\n{art[:1500]}{'...' if len(art) > 1500 else ''}")
    print("-" * 100)
    print(f"Summary (len {len(summ)}):\n{summ}")
    print("-" * 100)
    print(f"Article tokens preview: {sample.get('article_tokens', [])[:30]}")
    print(f"Summary tokens preview: {sample.get('highlights_tokens', [])[:30]}")
    print("=" * 100)


[INFO] Dropped 98 duplicate rows.
[INFO] Rows after basic cleaning & dedup: 2126
[INFO] Using 2126 rows for preprocessing (target 2200).
[INFO] Preprocessing texts...


  0%|          | 0/2126 [00:00<?, ?it/s]

[INFO] Final row count after preprocessing: 2126
DatasetDict({
    train: Dataset({
        features: ['File_path', 'article', 'highlights', 'article_tokens', 'highlights_tokens'],
        num_rows: 1700
    })
    test: Dataset({
        features: ['File_path', 'article', 'highlights', 'article_tokens', 'highlights_tokens'],
        num_rows: 426
    })
})
Features: ['File_path', 'article', 'highlights', 'article_tokens', 'highlights_tokens']
Sample 1
----------------------------------------------------------------------------------------------------
Article (len 907):
ad firm wpp 's profit surge 15 uk advertising giant wpp post larger-than-expected annual profit predict outperform market 2005 pre-tax profit rise 15 year ago reach £546m 1.04bn ahead average analyst forecast £532m revenue £4.3bn firm 's operating margin 14.1 say could reach 14.8 2006. year wpp buy u rival grey global create giant big enough rival sector leader omnicom chief executive martin sorrell friday tell reuters 

In [2]:
# # =========================
# # CNN/DailyMail — 2,200 rows with Train/Val/Test (80/10/10)
# # =========================

# import pandas as pd
# from datasets import Dataset, DatasetDict

# # --- Load CSV ---
# df = pd.read_csv("/kaggle/input/cnn-dataser/cnn_dailymail_summary.csv")

# # If your file has 'Articles' and 'Summaries', rename to expected schema
# df = df.rename(columns={"Articles": "article", "Summaries": "highlights"})

# # --- Basic filtering ---
# df.dropna(subset=["article", "highlights"], inplace=True)
# df = df[df["article"].astype(str).str.len() > 30]
# df = df[df["highlights"].astype(str).str.len() > 10]

# # --- Select exactly 2,200 rows (reproducible) ---
# if len(df) < 2200:
#     raise ValueError(f"Dataset has {len(df)} usable rows after filtering; need at least 2200.")
# df = df.sample(n=2200, random_state=42).reset_index(drop=True)

# # --- Build Hugging Face dataset & 80/10/10 split ---
# hf_all = Dataset.from_pandas(df.reset_index(drop=True))
# tmp = hf_all.train_test_split(test_size=0.20, seed=42)            # 80% train, 20% temp
# val_test = tmp["test"].train_test_split(test_size=0.50, seed=42)  # split temp -> 10% val, 10% test

# dataset = DatasetDict({
#     "train": tmp["train"],
#     "validation": val_test["train"],
#     "test": val_test["test"]
# })

# # --- Inspect sizes and features ---
# print(dataset)
# for split in ["train", "validation", "test"]:
#     print(f"{split}: {len(dataset[split])} rows")
# print("Features:", dataset["train"].column_names)
# print("=" * 100)

# # --- Peek a few training samples ---
# for i in range(min(3, len(dataset["train"]))):
#     s = dataset["train"][i]
#     article = str(s["article"])
#     print(f"Sample {i+1}")
#     print("-" * 80)
#     print(f"Article ({len(article)} chars):\n{article[:1500]}{'...' if len(article) > 1500 else ''}")
#     print("-" * 80)
#     print(f"Summary:\n{s['highlights']}")
#     print("=" * 100)


In [3]:
# Extract a sample article from your custom dataset for testing generation
sample_text = dataset['test'][1]['article'][:5000]  # use test set to simulate inference
print("Sample article:\n")
print(sample_text)
print("\n" + "="*100 + "\n")

# Initialize an empty dictionary to store model summaries
summaries = {}


Sample article:

broadband uk gather pace one person uk join internet 's fast lane every 10 second accord bt telecom giant say number people broadband via telephone line surpass four million include connect via cable almost six million people fast always-on connection boom fuel fierce competition fall price well great availability broadband phone line take-up rate broadband accelerate terrific pace say ben verwaayen bt 's chief executive strong position hit five million target summer 2006 much early previously expect last million connection make past four month thousand people add total every day week sign broadband include get service direct bt via many company re-sell bt line name part surge people sign due bt stretch reach adsl uk 's widely used way get broadband beyond six kilometre asymmetric digital subscriber line technology let ordinary copper phone line support high data speed standard speed 512kbps though fast connection available accord bt 95 uk home business receive broadba

In [4]:
import nltk 
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## *Extract the first three sentences as a summary*

In [5]:
# Define a baseline summarizer: first 3 sentences of the article
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

# Generate baseline summary for your sample article
summaries["baseline"] = three_sentence_summary(sample_text)

# Show result
print("=== BASELINE SUMMARY ===\n")
print(summaries["baseline"])
print("\n" + "="*100 + "\n")


=== BASELINE SUMMARY ===

broadband uk gather pace one person uk join internet 's fast lane every 10 second accord bt telecom giant say number people broadband via telephone line surpass four million include connect via cable almost six million people fast always-on connection boom fuel fierce competition fall price well great availability broadband phone line take-up rate broadband accelerate terrific pace say ben verwaayen bt 's chief executive strong position hit five million target summer 2006 much early previously expect last million connection make past four month thousand people add total every day week sign broadband include get service direct bt via many company re-sell bt line name part surge people sign due bt stretch reach adsl uk 's widely used way get broadband beyond six kilometre asymmetric digital subscriber line technology let ordinary copper phone line support high data speed standard speed 512kbps though fast connection available accord bt 95 uk home business receiv

**    XLNET**

In [6]:
# ---------------- XLNet ----------------
from transformers import pipeline
xlnet_pipe = pipeline("text-generation", model="xlnet-base-cased", device=0)
summaries["xlnet"] = xlnet_pipe(sample_text,
    max_length=256,
    batch_size=128,
    top_p=0.9,
    repetition_penalty=1.2
)[0]["generated_text"]


2025-09-15 17:44:23.460839: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757958263.716458      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757958263.794023      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Device set to use cpu


model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=421) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (-1). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


## GPT-2

In [7]:
from transformers import pipeline, set_seed
set_seed(42)
pipe = pipeline("text-generation", model="gpt2-xl")
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
summaries["gpt2"] = "\n".join(sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query):]))

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


## T-5

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from nltk.tokenize import sent_tokenize
import torch, nltk

# ensure sentence splitter
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

device_id = 0 if torch.cuda.is_available() else -1
model_id = "t5-large"

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
mdl = AutoModelForSeq2SeqLM.from_pretrained(model_id)

t5_pipe = pipeline("summarization", model=mdl, tokenizer=tok, device=device_id)

# --- helper for long inputs (T5 encoder ~512 tokens) ---
def summarize_with_t5_map_reduce(
    text,
    pipe=t5_pipe,
    tok=tok,
    max_input_tokens=512,     # T5-large encoder window
    chunk_buffer=32,          # keep some room for special tokens
    stride_tokens=128,        # overlap to preserve context
    num_beams=4,
    min_length=30,
    max_length=70,
    no_repeat_ngram_size=2,
    length_penalty=1.0,
):
    # encode once (ids length ~= tokens)
    ids = tok.encode(text, add_special_tokens=False)
    chunk_len = max_input_tokens - chunk_buffer

    gen_kwargs = dict(
        num_beams=num_beams,
        min_length=min_length,
        max_length=max_length,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        truncation=True,
        clean_up_tokenization_spaces=True,
    )

    # short enough: single pass
    if len(ids) <= chunk_len:
        return pipe(text, **gen_kwargs)[0]["summary_text"].strip()

    # map-reduce: chunk -> summarize parts -> summarize the join
    chunks, i = [], 0
    while i < len(ids):
        window = ids[i:i+chunk_len]
        chunks.append(tok.decode(window, skip_special_tokens=True))
        if i + chunk_len >= len(ids):
            break
        i += max(1, chunk_len - stride_tokens)

    partials = [pipe(part, **gen_kwargs)[0]["summary_text"].strip() for part in chunks]
    merged = " ".join(partials)
    final = pipe(merged, **gen_kwargs)[0]["summary_text"].strip()
    return final

# Use it
# sample_text = dataset["test"][0]["article"]
t5_summary = summarize_with_t5_map_reduce(sample_text)
summaries["t5"] = "\n".join(sent_tokenize(t5_summary))


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Both `max_new_tokens` (=256) and `max_length`(=70) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


**BigBird**

In [9]:
# BigBird-Pegasus long-doc summarization (drop-in)
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from nltk.tokenize import sent_tokenize

# 1) Pick a BigBird-Pegasus checkpoint
#   Common long-doc options: "google/bigbird-pegasus-large-arxiv", "google/bigbird-pegasus-large-pubmed"
#   For general/news, arXiv works surprisingly well in practice for long inputs:
bbp_model_id = "google/bigbird-pegasus-large-arxiv"

# 2) Load model + tokenizer
tokenizer_bbp = AutoTokenizer.from_pretrained(bbp_model_id, use_fast=True)
model_bbp = AutoModelForSeq2SeqLM.from_pretrained(bbp_model_id)

# 3) HF pipeline (GPU = 0 like your T5 example)
bbp_pipe = pipeline("summarization", model=model_bbp, tokenizer=tokenizer_bbp, device=0)

# 4) Helper: chunk long texts safely and do map-reduce summarization
def summarize_long_with_bbp(
    text: str,
    pipe,
    tokenizer,
    max_input_tokens: int = 4096,   # BigBird-Pegasus context
    chunk_buffer: int = 64,         # reserve for special tokens
    stride_tokens: int = 256,       # overlap to preserve context
    reduce_once: bool = True,
    **gen_kwargs
) -> str:
    """
    Splits very long text into token windows, summarizes each, then (optionally) summarizes
    the concatenated chunk summaries once more for a concise final output.
    """
    # sensible defaults for news summaries; tweak as desired
    if not gen_kwargs:
        gen_kwargs = dict(
            num_beams=4,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
            min_length=80,
            max_length=220
        )

    # Encode once to tokens so we chunk by tokens (not characters)
    ids = tokenizer.encode(text, add_special_tokens=False)
    chunk_len = max_input_tokens - chunk_buffer
    if chunk_len <= 0:
        raise ValueError("chunk_len <= 0; lower chunk_buffer or raise max_input_tokens.")

    # If short enough, just summarize directly
    if len(ids) <= chunk_len:
        out = pipe(text, **gen_kwargs)[0]["summary_text"]
        return out.strip()

    # Otherwise, sliding windows with stride
    chunks = []
    i = 0
    while i < len(ids):
        window = ids[i : i + chunk_len]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))
        if i + chunk_len >= len(ids):
            break
        i += max(1, chunk_len - stride_tokens)

    # Summarize each chunk
    part_summaries = []
    for part in chunks:
        s = pipe(part, **gen_kwargs)[0]["summary_text"]
        part_summaries.append(s.strip())

    if not reduce_once:
        # Return concatenated parts if you want the long form
        return "\n".join(part_summaries)

    # Reduce step: summarize the concatenated chunk summaries
    joined = " ".join(part_summaries)
    final = pipe(joined, **gen_kwargs)[0]["summary_text"]
    return final.strip()

# 5) Use it on your sample_text and store in your summaries dict
pipe_output_bbp = summarize_long_with_bbp(sample_text, bbp_pipe, tokenizer_bbp)
summaries["bigbird_pegasus"] = "\n".join(sent_tokenize(pipe_output_bbp))

print("BigBird-Pegasus summary:\n", summaries["bigbird_pegasus"][:500], "...")


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 220, but your input_length is only 192. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Attention type 'block_sparse' is not possible if sequence_length: 192 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


BigBird-Pegasus summary:
 the number of cable broadband customers in the united kingdom has grown to 1.7 million by end of 2006 , an increase of a factor of three compared to the same period a year ago .<n> cable broadband market penetration in the uk is one of the highest in the world , followed by the arab world .
in the first quarter of 2007 , the company s revenue and profit increased more than twofold and three times , respectively , over the first three months of the year , as reported by the company in its annual  ...


## BART

In [10]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize
import torch, nltk

# make sure sentence splitter is available
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

device_id = 0 if torch.cuda.is_available() else -1

bart_pipe = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=device_id
)

# Your sample text here
# sample_text = dataset["test"][0]["article"]

# Hyperparameters (your spec + sensible defaults)
bart_output = bart_pipe(
    sample_text,
    truncation=True,         # ensure input is cut to model's 1024-token limit
    max_length=70,           # generated summary target length
    min_length=30,           # helps avoid overly short summaries
    num_beams=4,             # better quality than greedy; modest compute
    no_repeat_ngram_size=3,  # reduce repetition
    early_stopping=True
    # note: batch_size=16 only matters when passing a list of texts, e.g. bart_pipe([text1, text2], batch_size=16)
)

summaries["bart"] = "\n".join(sent_tokenize(bart_output[0]["summary_text"]))


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


**DistilBERT**

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from nltk.tokenize import sent_tokenize
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

distilbart_id = "sshleifer/distilbart-cnn-12-6"  # distilled BART for summarization
db_tok = AutoTokenizer.from_pretrained(distilbart_id, use_fast=True)
db_model = AutoModelForSeq2SeqLM.from_pretrained(distilbart_id)
db_pipe = pipeline("summarization", model=db_model, tokenizer=db_tok, device=0)

def summarize_long_with_distilbart(
    text, pipe=db_pipe, tok=db_tok,
    max_input_tokens=1024, chunk_buffer=32, stride_tokens=128,
    **gen_kwargs
):
    if not gen_kwargs:
        gen_kwargs = dict(num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3,
                          min_length=60, max_length=180)

    ids = tok.encode(text, add_special_tokens=False)
    chunk_len = max_input_tokens - chunk_buffer
    if len(ids) <= chunk_len:
        return pipe(text, **gen_kwargs)[0]["summary_text"].strip()

    # map-reduce: chunk -> summarize parts -> summarize the join
    chunks, i = [], 0
    while i < len(ids):
        window = ids[i:i+chunk_len]
        chunks.append(tok.decode(window, skip_special_tokens=True))
        if i + chunk_len >= len(ids): break
        i += max(1, chunk_len - stride_tokens)

    parts = [pipe(part, **gen_kwargs)[0]["summary_text"].strip() for part in chunks]
    final = pipe(" ".join(parts), **gen_kwargs)[0]["summary_text"].strip()
    return final

# Use it like your T5 pipeline:
db_out = summarize_long_with_distilbart(sample_text)
summaries["distilbart"] = "\n".join(sent_tokenize(db_out))


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Device set to use cpu


## PEGASUS

In [12]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize

# Pegasus summarization pipeline
pegasus_id = "google/pegasus-cnn_dailymail"
pegasus_pipe = pipeline("summarization", model=pegasus_id, device=0)

# Generate summary with hyperparameters
pegasus_out = pegasus_pipe(
    sample_text,
    max_length=200,         # Generated summary length cap
    min_length=50,          # Minimum length to avoid too short summaries
    truncation=True,        # Truncate input at 1024 tokens
    early_stopping=True,    # Stop when EOS is likely
    num_beams=4,            # Beam search (can adjust if needed)
    no_repeat_ngram_size=3, # Prevent repetition
    batch_size=2            # Matches your table
)

# Clean formatting
summaries["pegasus"] = "\n".join(sent_tokenize(
    pegasus_out[0]["summary_text"].replace(" .<n>", ". ")
))


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 200, but your input_length is only 192. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [13]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize

# Load the LED summarization model
led_id = "allenai/led-base-16384"
led_pipe = pipeline("summarization", model=led_id, device=0)

# Generate summary with hyperparameters
led_out = led_pipe(
    sample_text,
    max_length=150,         # summary max length
    min_length=50,          # reasonable lower bound
    truncation=True,        # truncate input to 16384 tokens
    early_stopping=True,    # stop when EOS token predicted
    num_beams=4,            # helps improve quality
    no_repeat_ngram_size=3, # avoid repetition
    batch_size=2
)

# Clean formatting
summaries["led"] = "\n".join(sent_tokenize(
    led_out[0]["summary_text"].replace(" .<n>", ". ")
))

# Print LED summary
print("=== LED SUMMARY ===")
print(summaries["led"])


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu
Input ids are automatically padded from 204 to 1024 to be a multiple of `config.attention_window`: 1024
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


=== LED SUMMARY ===
broadband uk gather pace one person uk join internet 's fast lane every 10 second accord bt telecom giant say number people broadband via telephone line surpass four million include connect via cable almost six million people fast always-on connection boom fuel fierce competition fall price well great availability broadband phone line take-up rate broadband accelerate terrific pace say ben verwaayen bt 's chief executive strong position hit five million target summer 2006 much early previously expect last million connection make past four month thousand people add total every day week sign broadband include get service direct bt via many company re-sell bt line name part surge people sign due bt stretch reach adsl uk 's widely used way get broadband beyond six kilometre asymmetric digital subscriber line technology let ordinary copper phone line support high data speed standard speed 512kbps though fast connection available accord btc 95 uk home business receive bro

In [14]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize

# Load the ProphetNet summarization model
prophetnet_pipe = pipeline("summarization", model="microsoft/prophetnet-large-uncased", device=0)

# Generate summary with tuned hyperparameters
pipe_output = prophetnet_pipe(
    sample_text,
    max_length=150,            # summary length cap
    min_length=40,             # encourage non-trivial summary
    num_beams=5,               # beam search
    no_repeat_ngram_size=3,    # avoid repetition
    early_stopping=True
)

# Store the summary in the summaries dictionary
summaries["prophetnet"] = "\n".join(sent_tokenize(pipe_output[0]["summary_text"].strip()))

# Print the ProphetNet summary
print("=== ProphetNet SUMMARY ===")
print(summaries["prophetnet"])


config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

prophetnet.tokenizer: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Device set to use cpu


=== ProphetNet SUMMARY ===
it ' s been a long time coming include a lot of people involved in the process of creating internet ' s fastest internet service available anywhere in the world include include us , uk , and canada included include us include canada include us includes canada include uk include us and canada include include uk includes us include us included in the list of people in the united states of america include many company re - sell many companies re - sale many company sell many company buy many companies buy many company get service get service buy company buy company sell company sell much company sell more company sell one company sell two company sell three company sell four company sell six company buy one company buy two company buy - sell company buy more company buy new company sell new company buy another company buy buy


In [15]:
from transformers import pipeline, set_seed
from nltk.tokenize import sent_tokenize

set_seed(42)

# Load GPT-2 XL for text generation
gpt2_pipe = pipeline("text-generation", model="gpt2-xl", device=0)

# Append a summarization prompt to the article text
gpt2_query = sample_text + "\nTL;DR:\n"

# Generate the output (summary) with tuned hyperparameters
gpt2_out = gpt2_pipe(
    gpt2_query,
    max_new_tokens=150,       # summary length cap
    num_beams=8,              # beam search for better quality
    early_stopping=True,      # stop at EOS
    no_repeat_ngram_size=3,   # avoid repetition
    batch_size=2,
    clean_up_tokenization_spaces=True
)

# Extract summary text after the TL;DR: prompt
summaries["gpt2"] = "\n".join(sent_tokenize(
    gpt2_out[0]["generated_text"][len(gpt2_query):].strip()
))

# Print the GPT-2 summary
print("=== GPT-2 SUMMARY ===")
print(summaries["gpt2"])


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


=== GPT-2 SUMMARY ===
The number of people in the UK who have access to a broadband connection of at least 1Mb/s is 1,7 million.
The total number of UK broadband customers is 4.5 million.
The number of broadband customers in the rest of the EU is 1.2 million.


In [16]:
print("GROUND TRUTH")
print(dataset["train"][1]["highlights"])
print("-"*100)
for model_name in summaries:
    print("##",model_name.upper(),'\n')
    print(summaries[model_name])
    print("-"*100)

GROUND TRUTH
turkcell 's mobile deal second turkish investment iran run trouble.the company say prepare accept minority stake iran award mobile deal.turkey 's investment iran 's mobile industry look set scrap big mobile firm saw investment slash mps.although company 's statement say would continue monitor development observer say think turkcell set pull 3bn deal.iran 's parliament vote large majority cut turkcell 's stake new mobile network 70 49
----------------------------------------------------------------------------------------------------
## BASELINE 

broadband uk gather pace one person uk join internet 's fast lane every 10 second accord bt telecom giant say number people broadband via telephone line surpass four million include connect via cable almost six million people fast always-on connection boom fuel fierce competition fall price well great availability broadband phone line take-up rate broadband accelerate terrific pace say ben verwaayen bt 's chief executive strong po

In [17]:
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud

## Evaluation with Rouge metric

In [18]:
!pip -q install evaluate bert-score transformers

from evaluate import load
import pandas as pd

# Use the same test index as used in `sample_text`
sample_index = 1
reference = dataset["test"][sample_index]["highlights"]

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
records = []

for model_name in summaries:
    rouge = load("rouge")
    prediction = summaries[model_name]
    rouge.add(prediction=prediction, reference=reference)
    score = rouge.compute()
    rouge_dict = {rn: score[rn] for rn in rouge_names}
    records.append(rouge_dict)

df = pd.DataFrame.from_records(records, index=summaries.keys())
print(df)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

Downloading builder script: 0.00B [00:00, ?B/s]

                   rouge1    rouge2    rougeL  rougeLsum
baseline         0.562249  0.518219  0.305221   0.305221
xlnet            0.297694  0.269474  0.159329   0.159329
gpt2             0.241935  0.032787  0.193548   0.129032
t5               0.473282  0.356589  0.366412   0.366412
bigbird_pegasus  0.106557  0.033058  0.098361   0.098361
bart             0.470588  0.358974  0.386555   0.386555
distilbart       0.488889  0.345865  0.355556   0.355556
pegasus          0.420168  0.324786  0.336134   0.336134
led              0.560000  0.508065  0.304000   0.304000
prophetnet       0.129032  0.037209  0.082949   0.082949


In [19]:
from evaluate import load
import torch, os, pandas as pd
!pip install bert_score

# metrics
rouge  = load("rouge")
bleu   = load("bleu")          # or "sacrebleu"
meteor = load("meteor")
bertscore = load("bertscore")  # requires `bert-score` pkg + model weights

BERTSCORE_MODEL = os.environ.get("BERTSCORE_MODEL", "roberta-base")

def compute_bertscore_f1(prediction: str, reference: str) -> float:
    # Guard against empties
    if not isinstance(prediction, str) or not prediction.strip():
        return 0.0
    if not isinstance(reference, str) or not reference.strip():
        return 0.0
    out = bertscore.compute(
        predictions=[prediction],
        references=[reference],
        lang="en",
        model_type=BERTSCORE_MODEL,
        device="cuda" if torch.cuda.is_available() else "cpu"
    )
    return float(out["f1"][0])

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
records = []

sample_index = 1
reference = dataset["test"][sample_index]["highlights"]

for model_name, prediction in summaries.items():
    r = rouge.compute(predictions=[prediction], references=[reference], use_stemmer=True)
    b = bleu.compute(predictions=[prediction], references=[[reference]])
    m = meteor.compute(predictions=[prediction], references=[reference])
    try:
        bs_f1 = compute_bertscore_f1(prediction, reference)
    except Exception as e:
        print(f"[BERTScore error for {model_name}]:", e)
        bs_f1 = None

    row = {rn: r[rn] for rn in rouge_names}
    row.update({"bleu": b.get("bleu", b.get("score")), "meteor": m["meteor"], "bertscore_f1": bs_f1})
    records.append(row)

    print(f"=== {model_name.upper()} ===")
    print("Generated Summary:", prediction)
    print("Reference Summary:", reference)
    print("BLEU:", row["bleu"], " METEOR:", row["meteor"], " BERTScore(F1):", row["bertscore_f1"])
    print("-"*100)

df = pd.DataFrame.from_records(records, index=summaries.keys())
print(df)




Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== BASELINE ===
Generated Summary: broadband uk gather pace one person uk join internet 's fast lane every 10 second accord bt telecom giant say number people broadband via telephone line surpass four million include connect via cable almost six million people fast always-on connection boom fuel fierce competition fall price well great availability broadband phone line take-up rate broadband accelerate terrific pace say ben verwaayen bt 's chief executive strong position hit five million target summer 2006 much early previously expect last million connection make past four month thousand people add total every day week sign broadband include get service direct bt via many company re-sell bt line name part surge people sign due bt stretch reach adsl uk 's widely used way get broadband beyond six kilometre asymmetric digital subscriber line technology let ordinary copper phone line support high data speed standard speed 512kbps though fast connection available accord bt 95 uk home busin

In [20]:
# # --- Install deps (safe to re-run) ---
# !pip -q install --upgrade evaluate bert-score moverscore transformers

# import os, sys, traceback, torch
# import pandas as pd
# import numpy as np
# from evaluate import load

# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # ---------- Ensure dataset exists (auto-load if missing) ----------
# try:
#     dataset  # noqa: F821
# except NameError:
#     print("[INFO] 'dataset' missing — loading CSV and creating splits...")
#     import pandas as _pd
#     from datasets import Dataset, DatasetDict
#     CSV_PATH = "/kaggle/input/reviews-summary-sentiment/Sentiment_summary_reviews.csv"
#     df = _pd.read_csv(CSV_PATH)
#     df = df[['Text', 'Summary']].rename(columns={'Text':'article','Summary':'highlights'})
#     df.dropna(subset=["article","highlights"], inplace=True)
#     df = df[df['article'].str.len()>30]
#     df = df[df['highlights'].str.len()>10]
#     def _clean(s): return " ".join(str(s).strip().split())
#     df["article"] = df["article"].map(_clean)
#     df["highlights"] = df["highlights"].map(_clean)
#     from datasets import Dataset, DatasetDict
#     hf_all = Dataset.from_pandas(df.reset_index(drop=True))
#     tmp = hf_all.train_test_split(test_size=0.2, seed=42)
#     tv = tmp["train"].train_test_split(test_size=0.125, seed=42)  # 10% val
#     dataset = DatasetDict({"train": tv["train"], "validation": tv["test"], "test": tmp["test"]})
#     print("[INFO] dataset ready:", dataset)

# # ---------- Ensure summaries exists ----------
# try:
#     summaries  # noqa: F821
# except NameError:
#     raise RuntimeError("Define `summaries` like {'baseline':'...', 'gpt2':'...', ...} for the SAME sample_index.")

# # ---------- Config ----------
# sample_index = 1
# reference = dataset["test"][sample_index]["highlights"]

# # ---------- Metrics ----------
# rouge = load("rouge")

# try:
#     sacrebleu = load("sacrebleu")
#     use_sacre_bleu = True
# except Exception:
#     sacrebleu = None
#     use_sacre_bleu = False
#     bleu = load("bleu")

# meteor = load("meteor")

# # BERTScore
# try:
#     bertscore = load("bertscore")
#     BERTSCORE_MODEL = os.environ.get("BERTSCORE_MODEL", "roberta-base")  # 'roberta-large' if you want
#     has_bertscore = True
# except Exception as e:
#     print("[WARN] BERTScore unavailable ->", e); bertscore=None; has_bertscore=False

# # ---------- MoverScore (patched) ----------
# try:
#     import moverscore_v2 as ms
#     from transformers import AutoTokenizer

#     # Choose a stable encoder (works well with moverscore_v2)
#     MOVER_MODEL = os.environ.get("MOVER_MODEL", "bert-base-uncased")

#     # Patch the module's global tokenizer so it has `max_len`
#     ms.tokenizer = AutoTokenizer.from_pretrained(MOVER_MODEL, use_fast=False)
#     if not hasattr(ms.tokenizer, "max_len"):
#         # For old moverscore code that expects `max_len`
#         ms.tokenizer.max_len = getattr(ms.tokenizer, "model_max_length", 512)

#     from moverscore_v2 import get_idf_dict, word_mover_score
#     has_moverscore = True
# except Exception as e:
#     print("[WARN] MoverScore unavailable ->", e)
#     ms = None; get_idf_dict=None; word_mover_score=None; has_moverscore=False

# def compute_bleu(pred, ref):
#     if use_sacre_bleu:
#         return float(sacrebleu.compute(predictions=[pred], references=[[ref]]).get("score",0.0))
#     else:
#         return float(bleu.compute(predictions=[pred], references=[ref]).get("bleu",0.0))

# def compute_bertscore_f1(pred, ref):
#     if not has_bertscore: return None
#     try:
#         out = bertscore.compute(predictions=[pred], references=[ref],
#                                 lang="en", model_type=BERTSCORE_MODEL, device=DEVICE)
#         return float(out["f1"][0])
#     except Exception as e:
#         print("[BERTScore error]", e); return None

# # Precompute reference IDF once; set nthreads=1 to avoid fork/pool issues
# _idf_ref = None
# if has_moverscore:
#     try:
#         _idf_ref = get_idf_dict([reference], nthreads=1)
#         print(f"[MoverScore] Using {MOVER_MODEL} on {DEVICE}")
#     except Exception as e:
#         print("[MoverScore init error]", e); traceback.print_exc(); _idf_ref=None

# def compute_moverscore(pred, ref):
#     if not has_moverscore or _idf_ref is None: return None
#     if not isinstance(pred, str) or not pred.strip(): return 0.0
#     try:
#         idf_hyp = get_idf_dict([pred], nthreads=1)
#         scores = word_mover_score(
#             [ref], [pred],
#             _idf_ref, idf_hyp,
#             stop_words=[], n_gram=1, remove_subwords=True,
#             batch_size=8, model_name=MOVER_MODEL, device=DEVICE
#         )
#         return float(scores[0])
#     except Exception as e:
#         print("[MoverScore error]", e)
#         # CPU fallback if GPU tokenizer/model causes issues
#         if DEVICE == "cuda":
#             try:
#                 scores = word_mover_score(
#                     [ref], [pred],
#                     _idf_ref, get_idf_dict([pred], nthreads=1),
#                     stop_words=[], n_gram=1, remove_subwords=True,
#                     batch_size=8, model_name=MOVER_MODEL, device="cpu"
#                 )
#                 print("[MoverScore] CPU fallback OK")
#                 return float(scores[0])
#             except Exception as e2:
#                 print("[MoverScore CPU fallback error]", e2)
#         return None

# # ---------- Evaluate all models ----------
# rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# records = []

# print("-"*100)
# for model_name, prediction in summaries.items():
#     r = rouge.compute(predictions=[prediction], references=[reference], use_stemmer=True)
#     bleu_score = compute_bleu(prediction, reference)
#     m = meteor.compute(predictions=[prediction], references=[reference])
#     bert_f1 = compute_bertscore_f1(prediction, reference)
#     mover = compute_moverscore(prediction, reference)

#     row = {rn: r[rn] for rn in rouge_names}
#     row.update({"bleu": bleu_score, "meteor": m["meteor"], "bertscore_f1": bert_f1, "moverscore": mover})
#     records.append(row)

#     print(f"=== {model_name.upper()} ===")
#     print("ROUGE-1:", row["rouge1"], "| ROUGE-2:", row["rouge2"],
#           "| ROUGE-L:", row["rougeL"], "| ROUGE-Lsum:", row["rougeLsum"])
#     print("BLEU:", row["bleu"], "| METEOR:", row["meteor"],
#           "| BERTScore(F1):", row["bertscore_f1"], "| MoverScore:", row["moverscore"])
#     print("-"*100)

# df = pd.DataFrame.from_records(records, index=list(summaries.keys()))
# print(df)


In [21]:
# # --- Install deps (safe to re-run) ---
# !pip -q install --upgrade evaluate bert-score moverscore transformers

# import os, sys, traceback, torch
# import pandas as pd
# import numpy as np
# from evaluate import load

# # Silence parallelism warning
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# # ---------- Ensure dataset exists (auto-load if missing) ----------
# try:
#     dataset  # noqa: F821
# except NameError:
#     print("[INFO] 'dataset' missing — loading CSV and creating splits...")
#     import pandas as _pd
#     from datasets import Dataset, DatasetDict
#     CSV_PATH = "/kaggle/input/cnn-dataser/cnn_dailymail_summary.csv"
#     df = _pd.read_csv(CSV_PATH)
#     df = df[['Text', 'Summary']].rename(columns={'Text':'article','Summary':'highlights'})
#     df.dropna(subset=["article","highlights"], inplace=True)
#     df = df[df['article'].str.len()>30]
#     df = df[df['highlights'].str.len()>10]
#     def _clean(s): return " ".join(str(s).strip().split())
#     df["article"] = df["article"].map(_clean)
#     df["highlights"] = df["highlights"].map(_clean)
#     from datasets import Dataset, DatasetDict
#     hf_all = Dataset.from_pandas(df.reset_index(drop=True))
#     tmp = hf_all.train_test_split(test_size=0.2, seed=42)
#     tv = tmp["train"].train_test_split(test_size=0.125, seed=42)  # 10% val
#     dataset = DatasetDict({"train": tv["train"], "validation": tv["test"], "test": tmp["test"]})
#     print("[INFO] dataset ready:", dataset)

# # ---------- Ensure summaries exists ----------
# try:
#     summaries  # noqa: F821
# except NameError:
#     raise RuntimeError("Define `summaries` like {'baseline':'...', 'gpt2':'...', ...} for the SAME sample_index.")

# # ---------- Config ----------
# sample_index = 1
# reference = dataset["test"][sample_index]["highlights"]

# # ---------- Metrics ----------
# rouge = load("rouge")
# try:
#     sacrebleu = load("sacrebleu"); use_sacre_bleu = True
# except Exception:
#     sacrebleu = None; use_sacre_bleu = False; bleu = load("bleu")
# meteor = load("meteor")

# # BERTScore (GPU ok)
# try:
#     bertscore = load("bertscore")
#     BERTSCORE_MODEL = os.environ.get("BERTSCORE_MODEL", "roberta-base")  # 'roberta-large' if needed
#     has_bertscore = True
# except Exception as e:
#     print("[WARN] BERTScore unavailable ->", e); bertscore=None; has_bertscore=False

# def compute_bleu(pred, ref):
#     if use_sacre_bleu:
#         return float(sacrebleu.compute(predictions=[pred], references=[[ref]]).get("score",0.0))
#     else:
#         return float(bleu.compute(predictions=[pred], references=[ref]).get("bleu",0.0))

# def compute_bertscore_f1(pred, ref):
#     if not has_bertscore: return None
#     try:
#         out = bertscore.compute(predictions=[pred], references=[ref],
#                                 lang="en", model_type=BERTSCORE_MODEL,
#                                 device="cuda" if torch.cuda.is_available() else "cpu")
#         return float(out["f1"][0])
#     except Exception as e:
#         print("[BERTScore error]", e); return None

# # ---------- MoverScore (v1.0.3-safe, CPU-only) ----------
# # IMPORTANT: do not pass 'model_name' or 'device' to word_mover_score in this version.
# try:
#     import moverscore_v2 as ms
#     from transformers import AutoTokenizer
#     from moverscore_v2 import get_idf_dict, word_mover_score

#     # Use a stable encoder and patch tokenizer.max_len expected by old code
#     MOVER_MODEL = os.environ.get("MOVER_MODEL", "bert-base-uncased")
#     ms.tokenizer = AutoTokenizer.from_pretrained(MOVER_MODEL, use_fast=False)
#     if not hasattr(ms.tokenizer, "max_len"):
#         ms.tokenizer.max_len = getattr(ms.tokenizer, "model_max_length", 512)

#     has_moverscore = True
# except Exception as e:
#     print("[WARN] MoverScore unavailable ->", e)
#     has_moverscore = False
#     get_idf_dict = None
#     word_mover_score = None

# # Build IDF for the single reference (nthreads=1 to avoid multiprocessing/fork issues)
# _idf_ref = None
# if has_moverscore:
#     try:
#         _idf_ref = get_idf_dict([reference], nthreads=1)  # v1.0.3 signature
#         print("[MoverScore] Ready (CPU-only) with model:", MOVER_MODEL)
#     except Exception as e:
#         print("[MoverScore init error]", e); traceback.print_exc(); _idf_ref=None

# def compute_moverscore(pred, ref):
#     """MoverScore for one pair using v1.0.3; CPU-only; no model_name/device kwargs."""
#     if not has_moverscore or _idf_ref is None:
#         return None
#     pred = (pred or "").strip()
#     ref  = (ref  or "").strip()
#     if not pred or not ref:
#         return 0.0
#     try:
#         idf_hyp = get_idf_dict([pred], nthreads=1)
#         # DO NOT pass device or model_name here (not supported in 1.0.3)
#         scores = word_mover_score(
#             [ref], [pred],
#             _idf_ref, idf_hyp,
#             stop_words=[], n_gram=1, remove_subwords=True,
#             batch_size=8
#         )
#         return float(scores[0])
#     except Exception as e:
#         print("[MoverScore error]", e)
#         return None

# # ---------- Evaluate all models ----------
# rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# records = []

# print("-"*100)
# for model_name, prediction in summaries.items():
#     r = rouge.compute(predictions=[prediction], references=[reference], use_stemmer=True)
#     bleu_score = compute_bleu(prediction, reference)
#     m = meteor.compute(predictions=[prediction], references=[reference])
#     bert_f1 = compute_bertscore_f1(prediction, reference)
#     mover = compute_moverscore(prediction, reference)

#     row = {rn: r[rn] for rn in rouge_names}
#     row.update({"bleu": bleu_score, "meteor": m["meteor"], "bertscore_f1": bert_f1, "moverscore": mover})
#     records.append(row)

#     print(f"=== {model_name.upper()} ===")
#     print("ROUGE-1:", row["rouge1"], "| ROUGE-2:", row["rouge2"],
#           "| ROUGE-L:", row["rougeL"], "| ROUGE-Lsum:", row["rougeLsum"])
#     print("BLEU:", row["bleu"], "| METEOR:", row["meteor"],
#           "| BERTScore(F1):", row["bertscore_f1"], "| MoverScore:", row["moverscore"])
#     print("-"*100)

# df = pd.DataFrame.from_records(records, index=list(summaries.keys()))
# print(df)


In [22]:
# --- Use the GitHub version of MoverScore + robust wrapper ---
# (Safe to re-run. If you're offline, it will keep the existing install.)

try:
    import subprocess, sys
    # Uninstall old PyPI build that causes crashes
    subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "moverscore"], check=False)
    # Install latest master from GitHub (has sentence_score / corpus_score)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/AIPHES/emnlp19-moverscore.git@master"], check=False)
except Exception as _e:
    print("[WARN] Could not (re)install moverscore from GitHub:", _e)

# --- Robust MoverScore wrapper (prefers sentence_score; no GPU complications) ---
import os, traceback
os.environ.setdefault("MOVERSCORE_MODEL", "bert-base-uncased")  # you can switch to e.g. 'albert-base-v2'

def compute_moverscore(prediction: str, reference: str):
    """
    Prefer moverscore_v2.sentence_score (stable). If unavailable, return None.
    Returns a float (higher is better) or None on failure.
    """
    try:
        from moverscore_v2 import sentence_score
        # sentence_score supports multi-refs; we pass a single reference
        sc = sentence_score(prediction, [reference])
        # Some versions return a dataclass with `.score`, others a float; handle both.
        return float(getattr(sc, "score", sc))
    except Exception as e:
        print("[MoverScore sentence_score error]", e)
        traceback.print_exc(limit=1)
        return None

# --- (Optional) quick sanity check ---
print("MoverScore sanity:", compute_moverscore("A fast brown fox leaps over a lazy dog.",
                                               "The quick brown fox jumps over the lazy dog."))




     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 2.1 MB/s eta 0:00:00


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[MoverScore sentence_score error] Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx
MoverScore sanity: None


Traceback (most recent call last):
  File "/tmp/ipykernel_36/1894159147.py", line 23, in compute_moverscore
    from moverscore_v2 import sentence_score
RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx


In [23]:
# recompute the full table (ROUGE/BLEU/METEOR/BERTScore/MoverScore)
from evaluate import load
import pandas as pd
import torch, os

sample_index = 1
reference = dataset["test"][sample_index]["highlights"]

rouge = load("rouge")
try:
    sacrebleu = load("sacrebleu"); use_sacre_bleu = True
except Exception:
    sacrebleu = None; use_sacre_bleu = False; bleu = load("bleu")
meteor = load("meteor")

try:
    bertscore = load("bertscore"); BERTSCORE_MODEL = os.environ.get("BERTSCORE_MODEL", "roberta-base")
    has_bertscore = True
except Exception:
    has_bertscore = False

def compute_bleu(pred, ref):
    return (sacrebleu.compute(predictions=[pred], references=[[ref]])["score"]
            if use_sacre_bleu else
            load("bleu").compute(predictions=[pred], references=[ref])["bleu"])

def compute_bertscore_f1(pred, ref):
    if not has_bertscore: return None
    out = bertscore.compute(predictions=[pred], references=[ref],
                            lang="en",
                            model_type=BERTSCORE_MODEL,
                            device="cuda" if torch.cuda.is_available() else "cpu")
    return float(out["f1"][0])

rows = []
for name, pred in summaries.items():
    r = rouge.compute(predictions=[pred], references=[reference], use_stemmer=True)
    b = compute_bleu(pred, reference)
    m = meteor.compute(predictions=[pred], references=[reference])["meteor"]
    bs = compute_bertscore_f1(pred, reference)
   # mv = compute_moverscore(pred, reference)   # <-- uses the robust fallback you installed

    rows.append({
        "model": name,
        "rouge1": r["rouge1"], "rouge2": r["rouge2"], "rougeL": r["rougeL"], "rougeLsum": r["rougeLsum"],
        "bleu": b, "meteor": m, "bertscore_f1": bs})

df = pd.DataFrame(rows).set_index("model")
print(df)


Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                   rouge1    rouge2    rougeL  rougeLsum      bleu    meteor  \
model                                                                          
baseline         0.578313  0.534413  0.305221   0.305221  0.343942  0.659584   
xlnet            0.306080  0.277895  0.159329   0.159329  0.143381  0.468191   
gpt2             0.258065  0.049180  0.193548   0.161290  0.000000  0.099846   
t5               0.488550  0.387597  0.366412   0.366412  0.265538  0.388382   
bigbird_pegasus  0.122951  0.041322  0.098361   0.106557  0.000000  0.139861   
bart             0.487395  0.393162  0.386555   0.386555  0.237836  0.334667   
distilbart       0.503704  0.375940  0.355556   0.355556  0.249756  0.394896   
pegasus          0.420168  0.324786  0.336134   0.336134  0.164343  0.278918   
led              0.576000  0.524194  0.304000   0.304000  0.335420  0.658731   
prophetnet       0.138249  0.037209  0.082949   0.082949  0.000000  0.118381   

                 bertscore_f1  
model  

In [24]:
# ===============================
# Fixed evaluation (no BERTScore): ROUGE, BLEU/SacreBLEU, METEOR, MoverScore
# ===============================
# Works with: Hugging Face `evaluate`, `transformers`, and `pot` (optimal transport)
# Assumes: `dataset` (HF DatasetDict with a "test" split) and `summaries` dict already exist.

# ----- Installs (safe to re-run) -----
!pip -q install --upgrade evaluate transformers pot

# ----- Imports & setup -----
import os, sys, traceback, math
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from evaluate import load

os.environ["TOKENIZERS_PARALLELISM"] = "false"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ----- Check dataset & summaries availability -----
try:
    dataset  # noqa: F821
except NameError:
    # Fallback: load your CSV if dataset isn't in memory (adjust path as needed)
    print("[INFO] 'dataset' not found; loading from CSV fallback...")
    import pandas as _pd
    from datasets import Dataset, DatasetDict
    CSV_PATH = "/kaggle/input/bbc-news-dataset/bbc_news_summary.csv"
    df = _pd.read_csv(CSV_PATH)
    df = df[['Text', 'Summary']].rename(columns={'Text':'article','Summary':'highlights'})
    df.dropna(subset=["article","highlights"], inplace=True)
    df = df[df['article'].str.len()>30]
    df = df[df['highlights'].str.len()>10]
    def _clean(s): return " ".join(str(s).strip().split())
    df["article"] = df["article"].map(_clean)
    df["highlights"] = df["highlights"].map(_clean)
    hf_all = Dataset.from_pandas(df.reset_index(drop=True))
    tmp = hf_all.train_test_split(test_size=0.2, seed=42)
    tv  = tmp["train"].train_test_split(test_size=0.125, seed=42)  # 10% val
    dataset = DatasetDict({"train": tv["train"], "validation": tv["test"], "test": tmp["test"]})
    print("[INFO] dataset ready:", dataset)

try:
    summaries  # noqa: F821
except NameError:
    raise RuntimeError("Please define `summaries` as a dict: {'model_name': 'predicted summary text', ...}")

# ----- Config -----
sample_index = 1  # <-- set to the same index you used to generate `summaries`
reference = dataset["test"][sample_index]["highlights"]

# ===============================
# Robust MoverScore (no moverscore_v2)
# ===============================
# Uses sentence-level token embeddings from a transformer encoder and computes
# an Earth Mover's Distance under cosine-cost with IDF-like weights.
# Default encoder is 'roberta-large' for stronger scores; auto-falls back to 'roberta-base' if OOM.

from transformers import AutoTokenizer, AutoModel
import ot  # POT (Python Optimal Transport)

def _load_encoder(pref: str = "roberta-large"):
    try:
        tok = AutoTokenizer.from_pretrained(pref)
        mdl = AutoModel.from_pretrained(pref).eval().to(DEVICE)
        return tok, mdl, pref
    except Exception as e:
        print(f"[WARN] Failed to load {pref} ({e}); falling back to roberta-base.")
        tok = AutoTokenizer.from_pretrained("roberta-base")
        mdl = AutoModel.from_pretrained("roberta-base").eval().to(DEVICE)
        return tok, mdl, "roberta-base"

_tok, _enc_model, _enc_name = _load_encoder(os.environ.get("FALLBACK_MOVER_MODEL", "roberta-large"))
print(f"[MoverScore encoder] {_enc_name} on {DEVICE}")

_ref_cache = {}  # cache (embeddings, weights) per reference text

def _tokens_and_embs(text: str):
    if not isinstance(text, str) or not text.strip():
        return None, None
    enc_full = _tok(text,
                    return_tensors="pt",
                    return_special_tokens_mask=True,
                    truncation=True,
                    max_length=512)
    input_ids     = enc_full["input_ids"].to(DEVICE)
    attention_mask= enc_full["attention_mask"].to(DEVICE)
    with torch.no_grad():
        out = _enc_model(input_ids=input_ids, attention_mask=attention_mask)
        hs  = out.last_hidden_state.squeeze(0)  # [seq, hidden] on DEVICE

    # Build masks on CPU, then move the boolean indexer to hs.device
    special = enc_full["special_tokens_mask"].squeeze(0).bool()   # CPU
    mask    = enc_full["attention_mask"].squeeze(0).bool()        # CPU
    keep    = mask & (~special)                                   # CPU
    keep_dev= keep.to(hs.device)

    hs = hs[keep_dev]  # filter special tokens
    if hs.numel() == 0:
        return None, None

    # Normalize so cosine = dot product
    hs = F.normalize(hs, p=2, dim=1)

    toks = _tok.convert_ids_to_tokens(enc_full["input_ids"].squeeze(0)[keep].tolist())
    return toks, hs

def _idf_like_weights(tokens):
    # simple per-sentence IDF surrogate: 1/(1+tf)
    from collections import Counter
    c = Counter(tokens)
    w = torch.tensor([1.0/(1.0+c[t]) for t in tokens], dtype=torch.float32, device=DEVICE)
    w = w / (w.sum() + 1e-12)
    return w

def _emd_cosine_score(E_ref, E_hyp, w_ref, w_hyp):
    # cost = 1 - cosine(emb_i, emb_j)  (embeddings are normalized)
    C = 1.0 - torch.mm(E_ref, E_hyp.t())                           # [Nr, Nh] on DEVICE
    C = C.detach().cpu().numpy().astype(np.float64)
    a = w_ref.detach().cpu().numpy().astype(np.float64)
    b = w_hyp.detach().cpu().numpy().astype(np.float64)
    # EMD (Wasserstein-1 squared) via POT
    emd = ot.emd2(a, b, C)                                         # scalar
    return float(1.0 / (1.0 + emd))                                 # map distance -> [0,1]

def compute_moverscore(prediction: str, reference: str) -> float | None:
    """Stable MoverScore using transformer embeddings + OT; returns float or None."""
    pred = (prediction or "").strip()
    ref  = (reference or "").strip()
    if not pred or not ref:
        return None

    # cache reference once
    if ref not in _ref_cache:
        t_r, E_r = _tokens_and_embs(ref)
        if E_r is None:
            return None
        w_r = _idf_like_weights(t_r)
        _ref_cache[ref] = (E_r.detach(), w_r.detach())  # keep on DEVICE

    E_r, w_r = _ref_cache[ref]
    # Ensure cache tensors are on current DEVICE
    E_r = E_r.to(DEVICE)
    w_r = w_r.to(DEVICE)

    t_h, E_h = _tokens_and_embs(pred)
    if E_h is None:
        return None
    w_h = _idf_like_weights(t_h)

    return _emd_cosine_score(E_r, E_h, w_r, w_h)

# ---- sanity check ----
print("MoverScore sanity:",
      compute_moverscore("A fast brown fox leaps over a lazy dog.",
                         "The quick brown fox jumps over the lazy dog."))

# ===============================
# Other metrics: ROUGE, BLEU/SacreBLEU, METEOR
# ===============================
rouge = load("rouge")

# BLEU / SacreBLEU
try:
    sacrebleu = load("sacrebleu")
    use_sacre_bleu = True
except Exception:
    sacrebleu = None
    use_sacre_bleu = False
    bleu = load("bleu")

def compute_bleu(pred, ref):
    if use_sacre_bleu:
        return float(sacrebleu.compute(predictions=[pred], references=[[ref]]).get("score", 0.0))
    else:
        return float(bleu.compute(predictions=[pred], references=[ref]).get("bleu", 0.0))

# METEOR
meteor = load("meteor")

# ===============================
# Evaluate all model outputs in `summaries`
# ===============================
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
records = []

print("-" * 100)
for name, pred in summaries.items():
    # ROUGE
    r = rouge.compute(predictions=[pred], references=[reference], use_stemmer=True)
    row = {rn: r[rn] for rn in rouge_names}

    # BLEU
    row["bleu"] = compute_bleu(pred, reference)

    # METEOR
    row["meteor"] = meteor.compute(predictions=[pred], references=[reference])["meteor"]

    # MoverScore (robust)
    row["moverscore"] = compute_moverscore(pred, reference)

    records.append(row)

    print(f"=== {name.upper()} ===")
    print("ROUGE-1:", row["rouge1"], "| ROUGE-2:", row["rouge2"],
          "| ROUGE-L:", row["rougeL"], "| ROUGE-Lsum:", row["rougeLsum"])
    print("BLEU:", row["bleu"], "| METEOR:", row["meteor"],
          "| MoverScore:", row["moverscore"])
    print("-" * 100)

df = pd.DataFrame.from_records(records, index=list(summaries.keys()))
print(df)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.2/562.2 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[MoverScore encoder] roberta-large on cpu
MoverScore sanity: 0.9903908003092716


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


----------------------------------------------------------------------------------------------------
=== BASELINE ===
ROUGE-1: 0.5783132530120482 | ROUGE-2: 0.5344129554655871 | ROUGE-L: 0.30522088353413657 | ROUGE-Lsum: 0.30522088353413657
BLEU: 0.34394239843199864 | METEOR: 0.6595838747418761 | MoverScore: 0.9628136720491607
----------------------------------------------------------------------------------------------------
=== XLNET ===
ROUGE-1: 0.30607966457023067 | ROUGE-2: 0.27789473684210525 | ROUGE-L: 0.15932914046121593 | ROUGE-Lsum: 0.15932914046121593
BLEU: 0.14338090441907075 | METEOR: 0.46819060365053206 | MoverScore: 0.953999970589575
----------------------------------------------------------------------------------------------------
=== GPT2 ===
ROUGE-1: 0.25806451612903225 | ROUGE-2: 0.04918032786885246 | ROUGE-L: 0.1935483870967742 | ROUGE-Lsum: 0.16129032258064516
BLEU: 0.0 | METEOR: 0.0998463901689708 | MoverScore: 0.9350118399628018
---------------------------------

In [25]:
# # Combine BBC News + CNN/DailyMail to 2,200 rows (1,100 each)
# # Robust to column-name variants.

# import os
# import pandas as pd

# # ====== CONFIG ======
# BBC_CSV_PATH = "/kaggle/input/bbc-news-dataset/bbc_news_summary.csv"
# CNN_CSV_PATH = "/kaggle/input/cnn-dataser/cnn_dailymail_summary.csv"  # set to your file, or None to use HF
# SAMPLES_PER_DATASET = 1100
# OUTPUT_CSV = "/kaggle/working/bbc_cnn_2200.csv"
# RANDOM_STATE = 42

# def _clean_str(s):
#     return " ".join(str(s).strip().split())

# def _standardize_cols(df, article_candidates, summary_candidates):
#     # find actual column names case-insensitively
#     lower_map = {c.lower(): c for c in df.columns}
#     art_name = next((lower_map[c] for c in article_candidates if c in lower_map), None)
#     sum_name = next((lower_map[c] for c in summary_candidates if c in lower_map), None)
#     if not art_name or not sum_name:
#         raise ValueError(
#             f"Could not find article/summary columns.\n"
#             f"Looked for article in {article_candidates} and summary in {summary_candidates}.\n"
#             f"Found columns: {list(df.columns)}"
#         )
#     df = df[[art_name, sum_name]].rename(columns={art_name: "article", sum_name: "highlights"})
#     df.dropna(subset=["article", "highlights"], inplace=True)
#     df["article"] = df["article"].map(_clean_str)
#     df["highlights"] = df["highlights"].map(_clean_str)
#     # remove very short items
#     df = df[(df["article"].str.len() > 30) & (df["highlights"].str.len() > 10)]
#     return df

# # --------- Load BBC (handles both {Articles,Summaries} and {Text,Summary}) ----------
# def load_bbc(path: str) -> pd.DataFrame:
#     if not os.path.exists(path):
#         raise FileNotFoundError(f"BBC CSV not found at: {path}")
#     df = pd.read_csv(path)
#     df = _standardize_cols(
#         df,
#         article_candidates=("articles", "text"),      # accept either
#         summary_candidates=("summaries", "summary"),
#     )
#     df["source"] = "bbc"
#     return df

# # --------- Load CNN/DailyMail (handles 'article' or 'articles') ----------
# def load_cnn_dm(csv_path: str | None) -> pd.DataFrame:
#     if csv_path and os.path.exists(csv_path):
#         df = pd.read_csv(csv_path)
#         df = _standardize_cols(
#             df,
#             article_candidates=("article", "articles"),
#             summary_candidates=("highlights", "summary", "summaries"),
#         )
#         df["source"] = "cnn_dailymail"
#         return df
#     else:
#         # Fallback to Hugging Face if no CSV provided
#         from datasets import load_dataset
#         ds = load_dataset("cnn_dailymail", "3.0.0", split="train")
#         pdf = ds.to_pandas()
#         pdf = _standardize_cols(
#             pdf,
#             article_candidates=("article", "articles"),
#             summary_candidates=("highlights", "summary", "summaries"),
#         )
#         pdf["source"] = "cnn_dailymail"
#         return pdf

# # --------- Main combine logic ----------
# bbc = load_bbc(BBC_CSV_PATH)
# cnn = load_cnn_dm(CNN_CSV_PATH)

# # Ensure enough rows exist
# if len(bbc) < SAMPLES_PER_DATASET:
#     raise ValueError(f"BBC has only {len(bbc)} valid rows after cleaning; need {SAMPLES_PER_DATASET}.")
# if len(cnn) < SAMPLES_PER_DATASET:
#     raise ValueError(f"CNN/DM has only {len(cnn)} valid rows after cleaning; need {SAMPLES_PER_DATASET}.")

# # Sample 1,100 from each
# bbc_sample = bbc.sample(n=SAMPLES_PER_DATASET, random_state=RANDOM_STATE)
# cnn_sample = cnn.sample(n=SAMPLES_PER_DATASET, random_state=RANDOM_STATE)

# # Concatenate, shuffle, save
# combined = pd.concat([bbc_sample, cnn_sample], ignore_index=True)
# combined = combined.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)

# os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
# combined.to_csv(OUTPUT_CSV, index=False)

# print("Done!")
# print(f"BBC rows used: {len(bbc_sample)} | CNN/DM rows used: {len(cnn_sample)}")
# print(f"Combined shape: {combined.shape}")
# print(f"Saved to: {OUTPUT_CSV}")
# print("\nPreview:")
# print(combined.head(5))


In [26]:
# import os, re, time, warnings
# from typing import Dict, List, Tuple, Optional
# import torch
# from transformers import logging
# logging.set_verbosity_error()
# warnings.filterwarnings("ignore")
# import pandas as pd
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # --- Use your ~100-word article here ---
# ARTICLE_TEXT = """
# Peshawar's Institute of Management Sciences (IMSciences) has been active with student-focused initiatives. 
# The university extended its application deadline for all programs, giving applicants more time to apply. 
# It also hosted a business fest where students presented innovative projects linked to the Sustainable Development Goals (SDGs). 
# These efforts reflect IMSciences' focus on academic excellence, entrepreneurship, and community engagement. 
# The university continues to improve its academic programs and campus facilities to provide a supportive learning environment, 
# while a video from earlier years highlights the vibrant campus and student life at IMSciences.
# """.strip()

# CSV_PATH = "imsciences_qualitative_results_short.csv"

# MODEL_IDS: Dict[str, str] = {
#     "BART": "facebook/bart-large-cnn",
#     "DistilBART": "sshleifer/distilbart-cnn-12-6",
#     "T5": "t5-base",
#     "PEGASUS": "google/pegasus-cnn_dailymail",
#     "LED": "allenai/led-base-16384",
#     "BigBird-Pegasus": "google/bigbird-pegasus-large-pubmed",  # still off-domain but ok to include
#     "GPT-2": "gpt2",                      # baseline (not a summarizer)
#     "XLNet": "xlnet-base-cased",          # baseline (not a summarizer)
# }

# # ---- STRICT LENGTH TARGETS (tweak here) ----
# MAX_SENTENCES = 4          # hard cap on sentences
# MAX_WORDS = 80             # hard cap on words (after sentence cap)
# MIN_NEW_TOKENS = 35        # model-level min generation
# MAX_NEW_TOKENS = 90        # model-level max generation (tight!)
# LENGTH_PENALTY = 2.0       # >1 penalizes long outputs
# NO_REPEAT_NGRAM = 3

# # ---------- helpers ----------
# def _clean_summary(s: str) -> str:
#     s = s.replace("<n>", " ")
#     s = re.sub(r"\s+", " ", s).strip()
#     s = re.sub(r"(\b\w+\b)(?:\s+\1\b){3,}", r"\1", s, flags=re.I)  # collapse w w w w
#     return s

# def _limit_length(text: str, max_sentences=MAX_SENTENCES, max_words=MAX_WORDS) -> str:
#     # keep only first N sentences
#     sents = re.split(r'(?<=[.!?])\s+', text.strip())
#     sents = [s for s in sents if s]
#     sents = sents[:max_sentences]
#     clipped = " ".join(sents)
#     # enforce word cap
#     words = clipped.split()
#     if len(words) > max_words:
#         clipped = " ".join(words[:max_words]).rstrip(",;:") + "..."
#     return clipped

# def _flags(text: str) -> dict:
#     rep_triplets = bool(re.search(r"(\b\w+\b)(?:\s+\1\b){2,}", text, flags=re.I))
#     long_run = max([len(run) for run in re.split(r"[.!?]", text)]) > 300 if text else False
#     has_code = ("@" in text) or ("<" in text and ">" in text)
#     return {"flag_repetition": int(rep_triplets), "flag_long_run": int(long_run), "flag_code_tokens": int(has_code)}

# def _chunk(tok, text: str, max_input_tokens: Optional[int] = None, margin: int = 64):
#     if max_input_tokens is None:
#         raw = getattr(tok, "model_max_length", 1024)
#         max_input_tokens = min(int(raw), 16384)
#     enc = tok(text, return_tensors="pt", truncation=False, add_special_tokens=True)
#     ids = enc["input_ids"][0]; attn = enc["attention_mask"][0]
#     limit = max(128, max_input_tokens - margin)
#     batches = []
#     for st in range(0, ids.shape[0], limit):
#         ed = min(st + limit, ids.shape[0])
#         batches.append({"input_ids": ids[st:ed].unsqueeze(0).to(DEVICE),
#                         "attention_mask": attn[st:ed].unsqueeze(0).to(DEVICE)})
#     return batches

# @torch.inference_mode()
# def _summarize_seq2seq(model, tok, article: str, max_in: int,
#                        min_new=MIN_NEW_TOKENS, max_new=MAX_NEW_TOKENS,
#                        beams=4, led_global=False, repetition_penalty=1.15):
#     chunks = _chunk(tok, article, max_input_tokens=max_in)
#     total_in = total_out = 0
#     parts = []
#     for b in chunks:
#         total_in += int(b["input_ids"].shape[-1])
#         kwargs = dict(
#             min_new_tokens=min_new, max_new_tokens=max_new,
#             num_beams=beams, length_penalty=LENGTH_PENALTY,
#             no_repeat_ngram_size=NO_REPEAT_NGRAM,
#             encoder_no_repeat_ngram_size=NO_REPEAT_NGRAM,
#             early_stopping=True, repetition_penalty=repetition_penalty
#         )
#         if led_global:
#             g = torch.zeros_like(b["attention_mask"]); g[:, 0] = 1
#             ids = model.generate(input_ids=b["input_ids"], attention_mask=b["attention_mask"],
#                                  global_attention_mask=g, **kwargs)
#         else:
#             ids = model.generate(input_ids=b["input_ids"], attention_mask=b["attention_mask"], **kwargs)
#         total_out += int(ids.shape[-1])
#         parts.append(_clean_summary(tok.decode(ids[0], skip_special_tokens=True)))

#     out = " ".join(parts).strip()

#     # If multiple chunks → compress again, but with even stricter cap
#     if len(chunks) > 1:
#         kwargs2 = dict(
#             min_new_tokens=max(10, min_new // 2),
#             max_new_tokens=max(40, max_new // 2),
#             num_beams=beams, length_penalty=LENGTH_PENALTY,
#             no_repeat_ngram_size=NO_REPEAT_NGRAM,
#             encoder_no_repeat_ngram_size=NO_REPEAT_NGRAM,
#             early_stopping=True, repetition_penalty=repetition_penalty
#         )
#         inp = tok(out, return_tensors="pt", truncation=True, max_length=max_in).to(DEVICE)
#         ids = model.generate(**inp, **kwargs2)
#         total_out += int(ids.shape[-1])
#         out = _clean_summary(tok.decode(ids[0], skip_special_tokens=True))

#     # Hard final clip
#     out = _limit_length(out)
#     return out, total_in, total_out

# @torch.inference_mode()
# def _summarize_causal(model, tok, article: str, max_input=1024,
#                       min_new=MIN_NEW_TOKENS, max_new=MAX_NEW_TOKENS, repetition_penalty=1.2):
#     prompt = (
#         "Summarize the news article in 3–4 concise sentences. Be factual and avoid repetition.\n\n"
#         f"Article:\n{article}\n\nSummary:"
#     )
#     inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=max_input).to(DEVICE)
#     eos = tok.eos_token_id if tok.eos_token_id is not None else 50256
#     ids = model.generate(
#         **inputs, do_sample=False,
#         min_new_tokens=min_new, max_new_tokens=max_new,
#         length_penalty=LENGTH_PENALTY, no_repeat_ngram_size=NO_REPEAT_NGRAM,
#         repetition_penalty=repetition_penalty, pad_token_id=eos, eos_token_id=eos
#     )
#     text = tok.decode(ids[0], skip_special_tokens=True)
#     out = text.split("Summary:", 1)[-1].strip()
#     if "." in out: out = out.rsplit(".", 1)[0] + "."
#     out = _clean_summary(out)
#     out = _limit_length(out)
#     return out, int(inputs["input_ids"].shape[-1]), int(ids.shape[-1])

# def run_panel(article: str) -> pd.DataFrame:
#     rows = []
#     for name, model_id in MODEL_IDS.items():
#         print(f"\n=== {name} ({model_id}) ===")
#         t0 = time.time()
#         tok = AutoTokenizer.from_pretrained(model_id)
#         max_in = getattr(tok, "model_max_length", 1024)
#         if max_in > 2_000_000_000: max_in = 4096 if "led" in model_id.lower() else 1024

#         # try seq2seq first; fall back to causal
#         is_seq2seq = True
#         try:
#             mdl = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(DEVICE)
#         except Exception:
#             is_seq2seq = False
#             mdl = AutoModelForCausalLM.from_pretrained(model_id).to(DEVICE)

#         if is_seq2seq:
#             text = f"summarize: {article}" if "t5" in model_id.lower() else article
#             led_mode = ("led" in model_id.lower())
#             summary, tin, tout = _summarize_seq2seq(mdl, tok, text, max_in)
#         else:
#             summary, tin, tout = _summarize_causal(mdl, tok, article, max_input=min(max_in, 1024))

#         dt = round(time.time() - t0, 2)
#         flags = _flags(summary)
#         rows.append({
#             "model": name, "model_id": model_id, "time_sec": dt,
#             "tokens_in": tin, "tokens_out": tout, **flags, "summary": summary
#         })
#     cols = ["model","model_id","time_sec","tokens_in","tokens_out",
#             "flag_repetition","flag_long_run","flag_code_tokens","summary"]
#     return pd.DataFrame(rows, columns=cols)

# df_short = run_panel(ARTICLE_TEXT)
# df_short.to_csv(CSV_PATH, index=False, encoding="utf-8")
# print(f"\n[OK] CSV saved → {os.path.abspath(CSV_PATH)}")
# df_short


In [34]:
# ===============================
# Install dependencies (Kaggle safe)
# ===============================
!pip install --quiet nltk rouge-score bert-score

# ===============================
# Imports
# ===============================
import pandas as pd
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# ===============================
# Load CSV safely with fallback encoding
# ===============================
INPUT_CSV = "/kaggle/input/statistical/statistical.csv"   # <-- change path
try:
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")
except UnicodeDecodeError:
    try:
        df = pd.read_csv(INPUT_CSV, encoding="latin1")
    except:
        df = pd.read_csv(INPUT_CSV, encoding="ISO-8859-1")

print(f"[INFO] Loaded dataset: {INPUT_CSV}  shape={df.shape}")
print(f"[INFO] Columns: {list(df.columns)}")

# ===============================
# Utility Functions
# ===============================

# ROUGE per-document (returns dicts)
def rouge_per_doc(preds, refs):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = []
    for p, r in zip(preds, refs):
        res = scorer.score(r, p)
        scores.append({
            "rouge1": res['rouge1'].fmeasure,
            "rouge2": res['rouge2'].fmeasure,
            "rougeL": res['rougeL'].fmeasure
        })
    return scores

# BLEU per-document
def bleu_per_doc(preds, refs):
    smooth = SmoothingFunction().method1
    return [sentence_bleu([r.split()], p.split(), smoothing_function=smooth) for p, r in zip(preds, refs)]

# METEOR per-document (tokenized fix)
def meteor_per_doc(preds, refs):
    scores = []
    for p, r in zip(preds, refs):
        r_tok, p_tok = word_tokenize(r), word_tokenize(p)
        scores.append(single_meteor_score(r_tok, p_tok))
    return scores

# BERTScore per-document
def bert_f1_per_doc(preds, refs, lang="en"):
    P, R, F1 = bert_score(preds, refs, lang=lang, verbose=False)
    return F1.tolist()

# ===============================
# Compute Scores
# ===============================
models = ["LED", "DistilBART", "BART", "T5"]   # pick your models from CSV
records = []

for _, row in df.iterrows():
    ref = str(row["reference"])  # column for gold summary
    for m in models:
        hyp = str(row[m])
        records.append({"doc_id": row.get("doc_id", _), "model": m, "reference": ref, "prediction": hyp})

df_long = pd.DataFrame(records)

# Compute metrics grouped by doc/model
results = []
for model in models:
    subset = df_long[df_long["model"] == model]
    preds, refs = subset["prediction"].tolist(), subset["reference"].tolist()

    # Compute all metrics
    rouge_scores = rouge_per_doc(preds, refs)
    bleu_scores = bleu_per_doc(preds, refs)
    meteor_scores = meteor_per_doc(preds, refs)
    bert_scores = bert_f1_per_doc(preds, refs)

    # Build DataFrame
    part = pd.DataFrame(rouge_scores)
    part["BLEU"] = bleu_scores
    part["METEOR"] = meteor_scores
    part["BERTScore"] = bert_scores
    part["model"] = model
    results.append(part)

df_metrics = pd.concat(results, ignore_index=True)

# ===============================
# Save + Preview
# ===============================
df_metrics.to_csv("per_document_metrics.csv", index=False)
print("[INFO] Saved per-document metrics → per_document_metrics.csv")
df_metrics.head()


[INFO] Loaded dataset: /kaggle/input/statistical/statistical.csv  shape=(48, 6)
[INFO] Columns: ['doc_id', 'reference', 'LED', 'DistilBART', 'BART', 'T5']


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

[INFO] Saved per-document metrics → per_document_metrics.csv


Unnamed: 0,rouge1,rouge2,rougeL,BLEU,METEOR,BERTScore,model
0,0.923077,0.545455,0.769231,0.194331,0.851201,0.906519,LED
1,0.461538,0.181818,0.307692,0.034557,0.399525,0.950372,LED
2,0.363636,0.0,0.363636,0.043989,0.37037,0.88231,LED
3,0.461538,0.181818,0.461538,0.080876,0.399525,0.912072,LED
4,0.5,0.333333,0.5,0.058335,0.440613,0.906364,LED


In [35]:
# =========================================================
# Significance testing + summary from per-document metrics
# Works with columns: ['rouge1','rouge2','rougeL','BLEU','METEOR','BERTScore','model']
# and one row per (doc, model). If you have a 'doc_id' column, it's used automatically.
# =========================================================
import os, numpy as np, pandas as pd
from itertools import combinations
from scipy.stats import ttest_rel, wilcoxon

# ---- Load metrics table ----
if 'df_metrics' in globals():
    dfm = df_metrics.copy()
else:
    # Try the file saved by your previous step
    path = 'per_document_metrics.csv'
    if not os.path.exists(path):
        raise FileNotFoundError("Could not find per_document_metrics.csv and df_metrics is not defined.")
    dfm = pd.read_csv(path)

# Normalize column names (just in case)
dfm.columns = [c.strip() for c in dfm.columns]
# Try to guess doc_id if missing
if 'doc_id' not in dfm.columns:
    dfm['doc_id'] = dfm.groupby('model').cumcount() + 1

# Metrics present (harmonize names)
name_map = {
    'rouge1':'ROUGE1','rouge2':'ROUGE2','rougeL':'ROUGEL',
    'BLEU':'BLEU','METEOR':'METEOR','BERTScore':'BERTScoreF1',
    'ROUGE-1':'ROUGE1','ROUGE-2':'ROUGE2','ROUGE-L':'ROUGEL'
}
dfm = dfm.rename(columns={k:v for k,v in name_map.items() if k in dfm.columns})
METRICS = [m for m in ['ROUGE1','ROUGE2','ROUGEL','BLEU','METEOR','BERTScoreF1'] if m in dfm.columns]

MODELS = sorted(dfm['model'].unique().tolist())
print(f"[INFO] Models: {MODELS}")
print(f"[INFO] Metrics: {METRICS}")
n_docs = dfm['doc_id'].nunique()
print(f"[INFO] Number of documents: {n_docs}")

# ---- Helper: paired Cohen's d ----
def cohens_d_paired(a, b):
    # a,b are paired arrays
    diff = a - b
    sd = diff.std(ddof=1)
    if sd == 0:
        return 0.0
    return diff.mean() / sd

# ---- Pairwise tests for all model pairs and metrics ----
rows = []
for metric in METRICS:
    for m1, m2 in combinations(MODELS, 2):
        a = (dfm[dfm['model']==m1]
             .sort_values('doc_id')[metric].astype(float).to_numpy())
        b = (dfm[dfm['model']==m2]
             .sort_values('doc_id')[metric].astype(float).to_numpy())
        # Align lengths if anything went off (shouldn't, but safe):
        n = min(len(a), len(b))
        a, b = a[:n], b[:n]
        # T-test
        t_stat, p_t = ttest_rel(a, b)
        # Wilcoxon (may fail if all diffs are zero)
        try:
            w_stat, p_w = wilcoxon(a, b, zero_method='wilcox')
        except ValueError:
            w_stat, p_w = np.nan, np.nan
        d = cohens_d_paired(a, b)
        rows.append({
            'metric': metric, 'model_A': m1, 'model_B': m2,
            'n_docs': n, 'mean_A': float(a.mean()), 'mean_B': float(b.mean()),
            'cohens_d(A-B)': float(d),
            't_stat': float(t_stat), 'p_t': float(p_t),
            'w_stat': (np.nan if isinstance(w_stat,float) and np.isnan(w_stat) else float(w_stat)),
            'p_w': (np.nan if isinstance(p_w,float) and np.isnan(p_w) else float(p_w)),
        })
sig_df = pd.DataFrame(rows)

# ---- Multiple-comparison correction (Holm) per metric ----
def holm_bonferroni(pvals):
    # returns adjusted p-values in original order
    m = len(pvals)
    order = np.argsort(pvals)
    adj = np.empty(m)
    prev = 0.0
    for i, idx in enumerate(order):
        rank = m - i
        adj_val = pvals[idx] * rank
        adj_val = max(adj_val, prev)  # ensure monotonicity
        adj[idx] = min(adj_val, 1.0)
        prev = adj[idx]
    return adj

sig_df['pmin'] = np.nanmin(sig_df[['p_t','p_w']].values, axis=1)
adj_rows = []
for metric, g in sig_df.groupby('metric'):
    adj = holm_bonferroni(g['pmin'].values)
    tmp = g.copy()
    tmp['pmin_holm'] = adj
    adj_rows.append(tmp)
sig_df = pd.concat(adj_rows, ignore_index=True)

# Significance flags
sig_df['sig_t(p<0.05)'] = sig_df['p_t'] < 0.05
sig_df['sig_w(p<0.05)'] = sig_df['p_w'] < 0.05
sig_df['sig_holm(p<0.05)'] = sig_df['pmin_holm'] < 0.05

sig_df = sig_df.sort_values(['metric','pmin_holm']).reset_index(drop=True)
sig_df.to_csv('significance_results.csv', index=False)
print("[OK] Saved -> significance_results.csv")
display(sig_df.head(12))

# ---- Summary table: mean ± sd per model/metric ----
summ_rows = []
for metric in METRICS:
    for m in MODELS:
        vals = (dfm[dfm['model']==m]
                .sort_values('doc_id')[metric].astype(float).to_numpy())
        summ_rows.append({
            'metric': metric,
            'model': m,
            'mean': float(vals.mean()),
            'std': float(vals.std(ddof=1)),
            'median': float(np.median(vals))
        })
summary_df = pd.DataFrame(summ_rows).sort_values(['metric','mean'], ascending=[True, False])
summary_df.to_csv('model_metric_summary.csv', index=False)
print("[OK] Saved -> model_metric_summary.csv")
display(summary_df.head(12))

# ---- Quick, human-readable recap by metric: who is best and who it beats after Holm ----
print("\n=== Compact recap (Holm-corrected, using min p of t/Wilcoxon) ===")
for metric in METRICS:
    best = summary_df[summary_df['metric']==metric].iloc[0]
    best_model = best['model']
    best_mean = best['mean']
    print(f"\n[{metric}] best mean = {best_mean:.3f} → {best_model}")
    g = sig_df[(sig_df['metric']==metric)]
    wins = []
    for _, r in g.iterrows():
        # If pair includes best, check if best is A or B and whether adj p < .05
        if best_model in (r['model_A'], r['model_B']) and r['sig_holm(p<0.05)']:
            other = r['model_B'] if r['model_A']==best_model else r['model_A']
            # Determine direction (is best actually higher on average?)
            if best_model == r['model_A'] and r['mean_A'] >= r['mean_B']:
                wins.append(other)
            elif best_model == r['model_B'] and r['mean_B'] >= r['mean_A']:
                wins.append(other)
    if wins:
        print(f"  Significant vs: {', '.join(sorted(set(wins)))} (Holm p<.05)")
    else:
        print("  No significant wins after Holm correction.")

# ---- Ready-to-paste one-liners (uncorrected and corrected) ----
print("\n=== Suggested sentences ===")
for _, r in sig_df.iterrows():
    direction = "better" if r['mean_A'] > r['mean_B'] else "worse"
    print(f"- {r['model_A']} vs {r['model_B']} on {r['metric']}: "
          f"{direction}; paired t p={r['p_t']:.4f}, Wilcoxon p={r['p_w']:.4f}, "
          f"Holm-adjusted p={r['pmin_holm']:.4f}, d={r['cohens_d(A-B)']:.2f} (n={int(r['n_docs'])}).")

# ---- (Optional) LaTeX table of p-values vs a chosen 'best' model ----
BEST_MODEL = summary_df.groupby('metric').apply(lambda g: g.iloc[0]['model']).to_dict()
# If you prefer to force one best model, set e.g.: BEST_MODEL = {m: 'LED' for m in METRICS}

latex_blocks = []
for metric in METRICS:
    chosen = BEST_MODEL[metric] if isinstance(BEST_MODEL, dict) else BEST_MODEL
    block = sig_df[(sig_df['metric']==metric) & ((sig_df['model_A']==chosen)|(sig_df['model_B']==chosen))].copy()
    block['vs'] = np.where(block['model_A']==chosen, block['model_B'], block['model_A'])
    block = block[['vs','mean_A','mean_B','p_t','p_w','pmin_holm','cohens_d(A-B)']]
    block = block.rename(columns={'p_t':'t_p','p_w':'w_p','pmin_holm':'holm_p','cohens_d(A-B)':'cohen_d'})
    block = block.sort_values('holm_p')
    # Build LaTeX
    lines = [f"\\begin{{table}}[ht]\\centering",
             f"\\caption{{Significance vs {chosen} on {metric}}}",
             f"\\label{{tab:sig_{metric.lower()}}}",
             f"\\begin{{tabular}}{{lrrrrr}}\\toprule",
             f"Model & t p & Wilcoxon p & Holm p & d (paired) \\\\ \\midrule"]
    for _, r in block.iterrows():
        lines.append(f"{r['vs']} & {r['t_p']:.4f} & {r['w_p']:.4f} & {r['holm_p']:.4f} & {r['cohen_d']:.2f} \\\\")
    lines += ["\\bottomrule","\\end{tabular}","\\end{table}"]
    latex_blocks.append("\n".join(lines))

print("\n=== LaTeX blocks (copy one per metric if you want) ===")
for lb in latex_blocks:
    print(lb, "\n")


[INFO] Models: ['BART', 'DistilBART', 'LED', 'T5']
[INFO] Metrics: ['ROUGE1', 'ROUGE2', 'ROUGEL', 'BLEU', 'METEOR', 'BERTScoreF1']
[INFO] Number of documents: 48
[OK] Saved -> significance_results.csv


  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * r, axis=-1)
  r_minus = np.sum((d < 0) * r, axis=-1)
  r_plus = np.sum((d > 0) * 

Unnamed: 0,metric,model_A,model_B,n_docs,mean_A,mean_B,cohens_d(A-B),t_stat,p_t,w_stat,p_w,pmin,pmin_holm,sig_t(p<0.05),sig_w(p<0.05),sig_holm(p<0.05)
0,BERTScoreF1,DistilBART,LED,48,0.890452,0.904591,-0.658865,-4.56475,3.6e-05,215.0,6.7e-05,3.6e-05,0.000216,True,True,True
1,BERTScoreF1,LED,T5,48,0.904591,0.891256,0.595294,4.124318,0.00015,235.0,0.000177,0.00015,0.000752,True,True,True
2,BERTScoreF1,BART,DistilBART,48,0.900725,0.890452,0.39952,2.767958,0.008045,297.0,0.002364,0.002364,0.009456,True,True,True
3,BERTScoreF1,BART,T5,48,0.900725,0.891256,0.442836,3.068059,0.00357,307.0,0.003399,0.003399,0.010197,True,True,True
4,BERTScoreF1,BART,LED,48,0.900725,0.904591,-0.173634,-1.202974,0.235011,432.0,0.111261,0.111261,0.222521,False,False,False
5,BERTScoreF1,DistilBART,T5,48,0.890452,0.891256,-0.036017,-0.249533,0.804036,568.0,0.842918,0.804036,0.804036,False,False,False
6,BLEU,DistilBART,LED,48,0.015414,0.036414,-0.43689,-3.026862,0.004002,70.0,0.007335,0.004002,0.02401,True,True,True
7,BLEU,BART,DistilBART,48,0.031646,0.015414,0.377064,2.612374,0.012035,80.0,0.045247,0.012035,0.060177,True,True,False
8,BLEU,LED,T5,48,0.036414,0.017008,0.364222,2.523404,0.015063,102.0,0.021268,0.015063,0.060252,True,True,False
9,BLEU,BART,T5,48,0.031646,0.017008,0.332211,2.301624,0.025836,96.0,0.025377,0.025377,0.076131,True,True,False


[OK] Saved -> model_metric_summary.csv


Unnamed: 0,metric,model,mean,std,median
22,BERTScoreF1,LED,0.904591,0.021502,0.904531
20,BERTScoreF1,BART,0.900725,0.027169,0.899023
23,BERTScoreF1,T5,0.891256,0.02459,0.889325
21,BERTScoreF1,DistilBART,0.890452,0.025618,0.886695
14,BLEU,LED,0.036414,0.048922,0.032254
12,BLEU,BART,0.031646,0.042345,0.0
15,BLEU,T5,0.017008,0.026475,0.0
13,BLEU,DistilBART,0.015414,0.030383,0.0
18,METEOR,LED,0.382654,0.15501,0.421524
16,METEOR,BART,0.296794,0.13794,0.25918



=== Compact recap (Holm-corrected, using min p of t/Wilcoxon) ===

[ROUGE1] best mean = 0.462 → LED
  Significant vs: DistilBART, T5 (Holm p<.05)

[ROUGE2] best mean = 0.215 → LED
  Significant vs: BART, DistilBART, T5 (Holm p<.05)

[ROUGEL] best mean = 0.439 → LED
  Significant vs: BART, DistilBART, T5 (Holm p<.05)

[BLEU] best mean = 0.036 → LED
  Significant vs: DistilBART (Holm p<.05)

[METEOR] best mean = 0.383 → LED
  Significant vs: BART, DistilBART, T5 (Holm p<.05)

[BERTScoreF1] best mean = 0.905 → LED
  Significant vs: DistilBART, T5 (Holm p<.05)

=== Suggested sentences ===
- DistilBART vs LED on BERTScoreF1: worse; paired t p=0.0000, Wilcoxon p=0.0001, Holm-adjusted p=0.0002, d=-0.66 (n=48).
- LED vs T5 on BERTScoreF1: better; paired t p=0.0002, Wilcoxon p=0.0002, Holm-adjusted p=0.0008, d=0.60 (n=48).
- BART vs DistilBART on BERTScoreF1: better; paired t p=0.0080, Wilcoxon p=0.0024, Holm-adjusted p=0.0095, d=0.40 (n=48).
- BART vs T5 on BERTScoreF1: better; paired t p=0.0

  BEST_MODEL = summary_df.groupby('metric').apply(lambda g: g.iloc[0]['model']).to_dict()


In [36]:
BEST_MODEL = {m: 'LED' for m in METRICS}
