In [None]:
# =============================================================
# Pegasus-Large Summarization — MultiNews (Session-only ZIP)
# =============================================================

!pip install transformers datasets rouge-score accelerate --quiet

import torch, gc, os, pandas as pd, sys, zipfile
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# =============================================================
# Config (ONLY paths + dataset adapted)
# =============================================================
MODEL_NAME = "google/pegasus-large"
BATCH_SIZE = 10
CHUNK_SIZE = 200
LIMIT = 15000
SPLIT = "val"

ZIP_PATH = "/content/multi_news.zip"              # üëà already in session
EXTRACT_DIR = "/content/data_multinews"
SAVE_PATH = f"/content/summaries_pegasus_multinews_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("‚úÖ Using device:", device)

# =============================================================
# Load model + tokenizer safely
# =============================================================
# Start from a clean slate before the large model allocation.
torch.cuda.empty_cache()
gc.collect()
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)

# Load in float16 when CUDA is available, float32 otherwise. Any RuntimeError
# raised while loading/moving the model is treated as a GPU out-of-memory
# condition and answered with a default-precision reload on the CPU.
# NOTE(review): Pegasus is reported to be unreliable in float16 — confirm the
# generated summaries are sane, or consider float32/bfloat16.
try:
    if torch.cuda.is_available():
        model = PegasusForConditionalGeneration.from_pretrained(
            MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True
        ).to(device)
    else:
        model = PegasusForConditionalGeneration.from_pretrained(
            MODEL_NAME, torch_dtype=torch.float32, low_cpu_mem_usage=True
        ).to(device)
except RuntimeError:
    print("‚ö†Ô∏è GPU OOM during model load ‚Üí switching to CPU")
    model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME).to("cpu")
    device = torch.device("cpu")

# Put the module in evaluation mode; this script only runs inference.
model.eval()

# =============================================================
# Load MultiNews from ZIP (session only)
# =============================================================
# Unpack the archive only when the target folder does not exist yet; an
# existing folder is reused as-is.
if os.path.exists(EXTRACT_DIR):
    print("‚ôªÔ∏è Using already extracted MultiNews folder.")
else:
    print("üì¶ Extracting MultiNews ZIP...")
    with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("‚úÖ Extraction complete!")

# Find src + tgt files
# Walk the extracted tree looking for this split's files: the source must
# contain "src" and end in ".cleaned"; any other file of the split whose name
# contains "tgt" is taken as the target. Later matches overwrite earlier ones.
src_file = tgt_file = None
for root, _, files in os.walk(EXTRACT_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        candidate = os.path.join(root, f)
        if "src" in f and f.endswith(".cleaned"):
            src_file = candidate
        elif "tgt" in f:
            tgt_file = candidate

# Both files are required; stop early with a clear error otherwise.
if not (src_file and tgt_file):
    raise ValueError("‚ùå Could not find MultiNews source/target files")

print(f"‚úÖ Source: {src_file}")
print(f"‚úÖ Target: {tgt_file}")

# Read both files line by line; each line is one example, stripped of
# surrounding whitespace.
with open(src_file, "r", encoding="utf-8") as f:
    articles = [line.strip() for line in f]

with open(tgt_file, "r", encoding="utf-8") as f:
    references = [line.strip() for line in f]

# Keep only as many examples as both files provide, then cap at LIMIT.
min_len = min(len(articles), len(references))
articles, references = articles[:min_len][:LIMIT], references[:min_len][:LIMIT]

print(f"üìö Loaded {len(articles)} MultiNews samples ({SPLIT})")

# =============================================================
# Summarization helper (UNCHANGED)
# =============================================================
def summarize_batch(texts):
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: List of input strings. Inputs are tokenized with truncation
            at 1024 tokens and padded to a common length.

    Returns:
        The decoded summaries (special tokens removed) produced by beam
        search for the batch; an empty list when ``texts`` is empty.
    """
    # Nothing to encode or generate for an empty batch; return early rather
    # than hand the tokenizer an empty list.
    if not texts:
        return []

    inputs = tokenizer(
        texts, max_length=1024, truncation=True, padding=True, return_tensors="pt"
    ).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_length=130,
            min_length=30,
            num_beams=4,
            early_stopping=True
        )

    outputs = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    # Drop the tensor references before clearing the CUDA cache so their
    # memory can be released between batches.
    del inputs, summary_ids
    torch.cuda.empty_cache(); gc.collect()
    return outputs

# =============================================================
# Save progress locally (UNCHANGED)
# =============================================================
def save_progress(articles, refs, summaries, save_path):
    """Append one chunk of results to the CSV at ``save_path``.

    Args:
        articles: Source documents, written to the ``document`` column.
        refs: Reference summaries, written to the ``reference`` column.
        summaries: Generated summaries, written to the ``summary`` column.
        save_path: CSV file to append to. The header row is written only
            when the file does not exist yet.
    """
    # Decide about the header before the write creates the file.
    write_header = not os.path.exists(save_path)
    columns = {
        "document": articles,
        "reference": refs,
        "summary": summaries,
    }
    chunk_frame = pd.DataFrame(columns)
    chunk_frame.to_csv(save_path, mode="a", header=write_header, index=False)

# =============================================================
# Resume support (UNCHANGED)
# =============================================================
# The row count of an existing output CSV is taken as the index of the next
# sample to process; without a previous file the run starts at 0.
if not os.path.exists(SAVE_PATH):
    start_idx = 0
else:
    df_prev = pd.read_csv(SAVE_PATH)
    start_idx = len(df_prev)
    print(f"üîÅ Resuming from index {start_idx}")

# =============================================================
# Main summarization loop (UNCHANGED)
# =============================================================
# Work through the remaining samples in chunks of CHUNK_SIZE. Each finished
# chunk is appended to SAVE_PATH, so an interrupted run loses at most the
# chunk in progress.
total_chunks = (len(articles) - start_idx + CHUNK_SIZE - 1) // CHUNK_SIZE
chunk_no = 1

for i in range(start_idx, len(articles), CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, len(articles))
    print(f"\nüöÄ Chunk {chunk_no}/{total_chunks} ‚Üí Samples {i}‚Äì{end}")

    batch_articles = articles[i:end]
    batch_refs = references[i:end]
    all_summaries = []

    total_batches = (len(batch_articles) + BATCH_SIZE - 1) // BATCH_SIZE

    for j in range(0, len(batch_articles), BATCH_SIZE):
        sub_batch = batch_articles[j:j + BATCH_SIZE]
        batch_no = (j // BATCH_SIZE) + 1

        hit_oom = False
        try:
            batch_summaries = summarize_batch(sub_batch)
        except torch.cuda.OutOfMemoryError:
            # Only record the failure here. While the handler is running, the
            # exception still references the failed call's frames (and the
            # tensors in them), so the retry must happen after this block.
            hit_oom = True

        if hit_oom:
            torch.cuda.empty_cache(); gc.collect()
            # Re-running the identical batch would most likely fail the same
            # way, so fall back to one article per call.
            batch_summaries = []
            for text in sub_batch:
                batch_summaries.extend(summarize_batch([text]))

        all_summaries.extend(batch_summaries)
        # "\r" rewrites the same console line for a compact progress display.
        sys.stdout.write(f"\r[Batch {batch_no}/{total_batches}]")
        sys.stdout.flush()

    print()
    save_progress(batch_articles, batch_refs, all_summaries, SAVE_PATH)
    print(f"üíæ Saved chunk {chunk_no}/{total_chunks} ({end}/{len(articles)} done)")

    chunk_no += 1
    torch.cuda.empty_cache(); gc.collect()

print("\nüéâ All MultiNews summaries complete!")
print("üìÑ Saved to:", SAVE_PATH)


✅ Using device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📦 Extracting MultiNews ZIP...
✅ Extraction complete!
✅ Source: /content/data_multinews/multi_news/val.src.cleaned
✅ Target: /content/data_multinews/multi_news/val.tgt
📚 Loaded 5622 MultiNews samples (val)

🚀 Chunk 1/29 → Samples 0–200
[Batch 5/20]

KeyboardInterrupt: 

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (SINGLE CELL)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "val"
LIMIT = 200
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# Tokenizer and model come from the same checkpoint. Weights are loaded in
# float16 when CUDA is available and float32 otherwise, then moved to `device`.
# NOTE(review): Pegasus is reported to be unreliable in float16 — confirm the
# generated summaries are sane, or consider float32/bfloat16.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16
    ).to(device)
else:
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32
    ).to(device)
# Put the module in evaluation mode; this cell only runs inference.
model.eval()

# ---------------- LOAD DATA ----------------
# More than one file can match "<SPLIT>*src*" (e.g. a raw source next to a
# ".cleaned" one) and os.walk order is not guaranteed, so collect every
# candidate and choose deterministically instead of keeping the last match.
src_candidates, tgt_candidates = [], []
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_candidates.append(os.path.join(root, f))
        if "tgt" in f:
            tgt_candidates.append(os.path.join(root, f))

# Prefer the ".cleaned" source when present; otherwise the first path in
# sorted order.
cleaned_src = sorted(p for p in src_candidates if p.endswith(".cleaned"))
src_file = (cleaned_src or sorted(src_candidates) or [None])[0]
tgt_file = (sorted(tgt_candidates) or [None])[0]
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find {SPLIT} source/target files under {DATA_DIR}"
    )

# One example per line. Iterate the file instead of using str.splitlines():
# the latter also breaks on characters such as \x0b, \x0c, \x85 and \u2028
# inside an article, which would shift articles out of step with references.
with open(src_file, encoding="utf-8") as fh:
    articles = [line.rstrip("\n") for line in fh][:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = [line.rstrip("\n") for line in fh][:LIMIT]

# Keep both lists the same length so references[i] exists for every article.
n_pairs = min(len(articles), len(references))
articles, references = articles[:n_pairs], references[:n_pairs]

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Break one sample string into its individual documents.

    Documents are separated by the literal ``|||||`` marker. Each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are dropped.
    """
    pieces = []
    for raw_piece in sample.split("|||||"):
        cleaned = raw_piece.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

def summarize(texts, max_len, min_len):
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: List of input strings. Inputs are tokenized with truncation
            at 1024 tokens and padded to a common length.
        max_len: Passed to ``generate`` as ``max_length``.
        min_len: Passed to ``generate`` as ``min_length``.

    Returns:
        The decoded summaries (special tokens removed) produced by beam
        search for the batch; an empty list when ``texts`` is empty.
    """
    # Nothing to encode or generate for an empty batch; return early rather
    # than hand the tokenizer an empty list.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)
    # Drop the tensor references before clearing the CUDA cache so their
    # memory can be released between calls.
    del inputs, ids
    torch.cuda.empty_cache(); gc.collect()
    return out

# ---------------- PIPELINE ----------------
results = []

# The CSV is written in `finally` so that an interruption (KeyboardInterrupt,
# runtime error) still keeps the summaries produced so far instead of
# discarding the whole run.
try:
    for i, sample in enumerate(articles):
        docs = split_docs(sample)

        if docs:
            # Stage 1: document-level summaries
            doc_summaries = summarize(docs, max_len=256, min_len=60)

            # Stage 2: fusion summary
            fused_text = " ".join(doc_summaries)
            final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
        else:
            # A sample with no usable documents would hand the tokenizer an
            # empty batch; record an empty summary for it instead.
            final_summary = ""

        results.append({
            "document": sample,
            "reference": references[i],
            "summary": final_summary
        })

        print(f"‚úì {i+1}/{len(articles)} done")
        torch.cuda.empty_cache(); gc.collect()
finally:
    # ---------------- SAVE ----------------
    # Explicit columns keep the header present even when nothing was produced.
    pd.DataFrame(results, columns=["document", "reference", "summary"]).to_csv(
        SAVE_PATH, index=False
    )
    print("Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úì 1/200 done
‚úì 2/200 done
‚úì 3/200 done
‚úì 4/200 done
‚úì 5/200 done
‚úì 6/200 done
‚úì 7/200 done
‚úì 8/200 done
‚úì 9/200 done
‚úì 10/200 done
‚úì 11/200 done
‚úì 12/200 done
‚úì 13/200 done
‚úì 14/200 done


KeyboardInterrupt: 

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "val"
LIMIT = 200
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# Tokenizer and model come from the same checkpoint. Weights are loaded in
# float16 when CUDA is available and float32 otherwise, then moved to `device`.
# NOTE(review): Pegasus is reported to be unreliable in float16 — confirm the
# generated summaries are sane, or consider float32/bfloat16.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16
    ).to(device)
else:
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32
    ).to(device)
# Put the module in evaluation mode; this cell only runs inference.
model.eval()

# ---------------- LOAD DATA ----------------
# More than one file can match "<SPLIT>*src*" (e.g. a raw source next to a
# ".cleaned" one) and os.walk order is not guaranteed, so collect every
# candidate and choose deterministically instead of keeping the last match.
src_candidates, tgt_candidates = [], []
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_candidates.append(os.path.join(root, f))
        if "tgt" in f:
            tgt_candidates.append(os.path.join(root, f))

# Prefer the ".cleaned" source when present; otherwise the first path in
# sorted order.
cleaned_src = sorted(p for p in src_candidates if p.endswith(".cleaned"))
src_file = (cleaned_src or sorted(src_candidates) or [None])[0]
tgt_file = (sorted(tgt_candidates) or [None])[0]
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find {SPLIT} source/target files under {DATA_DIR}"
    )

# One example per line. Iterate the file instead of using str.splitlines():
# the latter also breaks on characters such as \x0b, \x0c, \x85 and \u2028
# inside an article, which would shift articles out of step with references.
with open(src_file, encoding="utf-8") as fh:
    articles = [line.rstrip("\n") for line in fh][:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = [line.rstrip("\n") for line in fh][:LIMIT]

# Keep both lists the same length so references[i] exists for every article.
n_pairs = min(len(articles), len(references))
articles, references = articles[:n_pairs], references[:n_pairs]

# ---------------- RESUME SUPPORT ----------------
# The row count of an existing output CSV is taken as the index of the next
# sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Break one sample string into its individual documents.

    Documents are separated by the literal ``|||||`` marker. Each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are dropped.
    """
    pieces = []
    for raw_piece in sample.split("|||||"):
        cleaned = raw_piece.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

def summarize(texts, max_len, min_len):
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: List of input strings. Inputs are tokenized with truncation
            at 1024 tokens and padded to a common length.
        max_len: Passed to ``generate`` as ``max_length``.
        min_len: Passed to ``generate`` as ``min_length``.

    Returns:
        The decoded summaries (special tokens removed) produced by beam
        search for the batch; an empty list when ``texts`` is empty.
    """
    # Nothing to encode or generate for an empty batch; return early rather
    # than hand the tokenizer an empty list.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)
    # Drop the tensor references before clearing the CUDA cache so their
    # memory can be released between calls.
    del inputs, ids
    torch.cuda.empty_cache(); gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Process every remaining sample and append its row to SAVE_PATH right away,
# so a crash or disconnect loses at most the sample in flight.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # A sample with no usable documents would hand the tokenizer an empty
        # batch. A row is still written (with an empty summary) because resume
        # relies on the CSV row count matching the sample index.
        final_summary = ""

    row = pd.DataFrame([{
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }])

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    row.to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )

    print(f"‚úì {i+1}/{len(articles)} done")

    torch.cuda.empty_cache(); gc.collect()

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔁 Resuming from index 200
✅ Summarization complete. Saved to: /content/pegasus_multidoc_val.csv


In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "val"
LIMIT = 400
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# Tokenizer and model come from the same checkpoint. Weights are loaded in
# float16 when CUDA is available and float32 otherwise, then moved to `device`.
# NOTE(review): Pegasus is reported to be unreliable in float16 — confirm the
# generated summaries are sane, or consider float32/bfloat16.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16
    ).to(device)
else:
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32
    ).to(device)
# Put the module in evaluation mode; this cell only runs inference.
model.eval()

# ---------------- LOAD DATA ----------------
# More than one file can match "<SPLIT>*src*" (e.g. a raw source next to a
# ".cleaned" one) and os.walk order is not guaranteed, so collect every
# candidate and choose deterministically instead of keeping the last match.
src_candidates, tgt_candidates = [], []
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_candidates.append(os.path.join(root, f))
        if "tgt" in f:
            tgt_candidates.append(os.path.join(root, f))

# Prefer the ".cleaned" source when present; otherwise the first path in
# sorted order.
cleaned_src = sorted(p for p in src_candidates if p.endswith(".cleaned"))
src_file = (cleaned_src or sorted(src_candidates) or [None])[0]
tgt_file = (sorted(tgt_candidates) or [None])[0]
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find {SPLIT} source/target files under {DATA_DIR}"
    )

# One example per line. Iterate the file instead of using str.splitlines():
# the latter also breaks on characters such as \x0b, \x0c, \x85 and \u2028
# inside an article, which would shift articles out of step with references.
with open(src_file, encoding="utf-8") as fh:
    articles = [line.rstrip("\n") for line in fh][:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = [line.rstrip("\n") for line in fh][:LIMIT]

# Keep both lists the same length so references[i] exists for every article.
n_pairs = min(len(articles), len(references))
articles, references = articles[:n_pairs], references[:n_pairs]

# ---------------- RESUME SUPPORT ----------------
# The row count of an existing output CSV is taken as the index of the next
# sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Break one sample string into its individual documents.

    Documents are separated by the literal ``|||||`` marker. Each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are dropped.
    """
    pieces = []
    for raw_piece in sample.split("|||||"):
        cleaned = raw_piece.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

def summarize(texts, max_len, min_len):
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: List of input strings. Inputs are tokenized with truncation
            at 1024 tokens and padded to a common length.
        max_len: Passed to ``generate`` as ``max_length``.
        min_len: Passed to ``generate`` as ``min_length``.

    Returns:
        The decoded summaries (special tokens removed) produced by beam
        search for the batch; an empty list when ``texts`` is empty.
    """
    # Nothing to encode or generate for an empty batch; return early rather
    # than hand the tokenizer an empty list.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)
    # Drop the tensor references before clearing the CUDA cache so their
    # memory can be released between calls.
    del inputs, ids
    torch.cuda.empty_cache(); gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Process every remaining sample and append its row to SAVE_PATH right away,
# so a crash or disconnect loses at most the sample in flight.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # A sample with no usable documents would hand the tokenizer an empty
        # batch. A row is still written (with an empty summary) because resume
        # relies on the CSV row count matching the sample index.
        final_summary = ""

    row = pd.DataFrame([{
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }])

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    row.to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )

    print(f"‚úì {i+1}/{len(articles)} done")

    torch.cuda.empty_cache(); gc.collect()

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 200
‚úì 201/400 done
‚úì 202/400 done
‚úì 203/400 done
‚úì 204/400 done
‚úì 205/400 done
‚úì 206/400 done
‚úì 207/400 done
‚úì 208/400 done
‚úì 209/400 done
‚úì 210/400 done
‚úì 211/400 done
‚úì 212/400 done
‚úì 213/400 done
‚úì 214/400 done
‚úì 215/400 done
‚úì 216/400 done
‚úì 217/400 done
‚úì 218/400 done
‚úì 219/400 done
‚úì 220/400 done
‚úì 221/400 done
‚úì 222/400 done
‚úì 223/400 done
‚úì 224/400 done
‚úì 225/400 done
‚úì 226/400 done
‚úì 227/400 done
‚úì 228/400 done
‚úì 229/400 done
‚úì 230/400 done
‚úì 231/400 done
‚úì 232/400 done
‚úì 233/400 done
‚úì 234/400 done
‚úì 235/400 done
‚úì 236/400 done
‚úì 237/400 done
‚úì 238/400 done
‚úì 239/400 done
‚úì 240/400 done
‚úì 241/400 done
‚úì 242/400 done
‚úì 243/400 done
‚úì 244/400 done
‚úì 245/400 done
‚úì 246/400 done
‚úì 247/400 done
‚úì 248/400 done
‚úì 249/400 done
‚úì 250/400 done
‚úì 251/400 done
‚úì 252/400 done
‚úì 253/400 done
‚úì 254/400 done
‚úì 255/400 done
‚úì 256/400 done
‚úì 257/400 done
‚ú

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "val"
LIMIT = 800
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# Tokenizer and model come from the same checkpoint. Weights are loaded in
# float16 when CUDA is available and float32 otherwise, then moved to `device`.
# NOTE(review): Pegasus is reported to be unreliable in float16 — confirm the
# generated summaries are sane, or consider float32/bfloat16.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16
    ).to(device)
else:
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32
    ).to(device)
# Put the module in evaluation mode; this cell only runs inference.
model.eval()

# ---------------- LOAD DATA ----------------
# More than one file can match "<SPLIT>*src*" (e.g. a raw source next to a
# ".cleaned" one) and os.walk order is not guaranteed, so collect every
# candidate and choose deterministically instead of keeping the last match.
src_candidates, tgt_candidates = [], []
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_candidates.append(os.path.join(root, f))
        if "tgt" in f:
            tgt_candidates.append(os.path.join(root, f))

# Prefer the ".cleaned" source when present; otherwise the first path in
# sorted order.
cleaned_src = sorted(p for p in src_candidates if p.endswith(".cleaned"))
src_file = (cleaned_src or sorted(src_candidates) or [None])[0]
tgt_file = (sorted(tgt_candidates) or [None])[0]
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find {SPLIT} source/target files under {DATA_DIR}"
    )

# One example per line. Iterate the file instead of using str.splitlines():
# the latter also breaks on characters such as \x0b, \x0c, \x85 and \u2028
# inside an article, which would shift articles out of step with references.
with open(src_file, encoding="utf-8") as fh:
    articles = [line.rstrip("\n") for line in fh][:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = [line.rstrip("\n") for line in fh][:LIMIT]

# Keep both lists the same length so references[i] exists for every article.
n_pairs = min(len(articles), len(references))
articles, references = articles[:n_pairs], references[:n_pairs]

# ---------------- RESUME SUPPORT ----------------
# The row count of an existing output CSV is taken as the index of the next
# sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Break one sample string into its individual documents.

    Documents are separated by the literal ``|||||`` marker. Each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are dropped.
    """
    pieces = []
    for raw_piece in sample.split("|||||"):
        cleaned = raw_piece.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

def summarize(texts, max_len, min_len):
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: List of input strings. Inputs are tokenized with truncation
            at 1024 tokens and padded to a common length.
        max_len: Passed to ``generate`` as ``max_length``.
        min_len: Passed to ``generate`` as ``min_length``.

    Returns:
        The decoded summaries (special tokens removed) produced by beam
        search for the batch; an empty list when ``texts`` is empty.
    """
    # Nothing to encode or generate for an empty batch; return early rather
    # than hand the tokenizer an empty list.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)
    # Drop the tensor references before clearing the CUDA cache so their
    # memory can be released between calls.
    del inputs, ids
    torch.cuda.empty_cache(); gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Process every remaining sample and append its row to SAVE_PATH right away,
# so a crash or disconnect loses at most the sample in flight.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # A sample with no usable documents would hand the tokenizer an empty
        # batch. A row is still written (with an empty summary) because resume
        # relies on the CSV row count matching the sample index.
        final_summary = ""

    row = pd.DataFrame([{
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }])

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    row.to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )

    print(f"‚úì {i+1}/{len(articles)} done")

    torch.cuda.empty_cache(); gc.collect()

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 400
‚úì 401/800 done
‚úì 402/800 done
‚úì 403/800 done
‚úì 404/800 done
‚úì 405/800 done
‚úì 406/800 done
‚úì 407/800 done
‚úì 408/800 done
‚úì 409/800 done
‚úì 410/800 done
‚úì 411/800 done
‚úì 412/800 done
‚úì 413/800 done
‚úì 414/800 done
‚úì 415/800 done
‚úì 416/800 done
‚úì 417/800 done
‚úì 418/800 done
‚úì 419/800 done
‚úì 420/800 done
‚úì 421/800 done
‚úì 422/800 done
‚úì 423/800 done
‚úì 424/800 done
‚úì 425/800 done
‚úì 426/800 done
‚úì 427/800 done
‚úì 428/800 done
‚úì 429/800 done
‚úì 430/800 done
‚úì 431/800 done
‚úì 432/800 done
‚úì 433/800 done
‚úì 434/800 done
‚úì 435/800 done
‚úì 436/800 done
‚úì 437/800 done
‚úì 438/800 done
‚úì 439/800 done
‚úì 440/800 done
‚úì 441/800 done
‚úì 442/800 done
‚úì 443/800 done
‚úì 444/800 done
‚úì 445/800 done
‚úì 446/800 done
‚úì 447/800 done
‚úì 448/800 done
‚úì 449/800 done
‚úì 450/800 done
‚úì 451/800 done
‚úì 452/800 done
‚úì 453/800 done
‚úì 454/800 done
‚úì 455/800 done
‚úì 456/800 done
‚úì 457/800 done
‚ú

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "val"
LIMIT = 900
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# Tokenizer and model come from the same checkpoint. Weights are loaded in
# float16 when CUDA is available and float32 otherwise, then moved to `device`.
# NOTE(review): Pegasus is reported to be unreliable in float16 — confirm the
# generated summaries are sane, or consider float32/bfloat16.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16
    ).to(device)
else:
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32
    ).to(device)
# Put the module in evaluation mode; this cell only runs inference.
model.eval()

# ---------------- LOAD DATA ----------------
# More than one file can match "<SPLIT>*src*" (e.g. a raw source next to a
# ".cleaned" one) and os.walk order is not guaranteed, so collect every
# candidate and choose deterministically instead of keeping the last match.
src_candidates, tgt_candidates = [], []
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_candidates.append(os.path.join(root, f))
        if "tgt" in f:
            tgt_candidates.append(os.path.join(root, f))

# Prefer the ".cleaned" source when present; otherwise the first path in
# sorted order.
cleaned_src = sorted(p for p in src_candidates if p.endswith(".cleaned"))
src_file = (cleaned_src or sorted(src_candidates) or [None])[0]
tgt_file = (sorted(tgt_candidates) or [None])[0]
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find {SPLIT} source/target files under {DATA_DIR}"
    )

# One example per line. Iterate the file instead of using str.splitlines():
# the latter also breaks on characters such as \x0b, \x0c, \x85 and \u2028
# inside an article, which would shift articles out of step with references.
with open(src_file, encoding="utf-8") as fh:
    articles = [line.rstrip("\n") for line in fh][:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = [line.rstrip("\n") for line in fh][:LIMIT]

# Keep both lists the same length so references[i] exists for every article.
n_pairs = min(len(articles), len(references))
articles, references = articles[:n_pairs], references[:n_pairs]

# ---------------- RESUME SUPPORT ----------------
# The row count of an existing output CSV is taken as the index of the next
# sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Break one sample string into its individual documents.

    Documents are separated by the literal ``|||||`` marker. Each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are dropped.
    """
    pieces = []
    for raw_piece in sample.split("|||||"):
        cleaned = raw_piece.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

def summarize(texts, max_len, min_len):
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: List of input strings. Inputs are tokenized with truncation
            at 1024 tokens and padded to a common length.
        max_len: Passed to ``generate`` as ``max_length``.
        min_len: Passed to ``generate`` as ``min_length``.

    Returns:
        The decoded summaries (special tokens removed) produced by beam
        search for the batch; an empty list when ``texts`` is empty.
    """
    # Nothing to encode or generate for an empty batch; return early rather
    # than hand the tokenizer an empty list.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)
    # Drop the tensor references before clearing the CUDA cache so their
    # memory can be released between calls.
    del inputs, ids
    torch.cuda.empty_cache(); gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Process every remaining sample and append its row to SAVE_PATH right away,
# so a crash or disconnect loses at most the sample in flight.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # A sample with no usable documents would hand the tokenizer an empty
        # batch. A row is still written (with an empty summary) because resume
        # relies on the CSV row count matching the sample index.
        final_summary = ""

    row = pd.DataFrame([{
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }])

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    row.to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )

    print(f"‚úì {i+1}/{len(articles)} done")

    torch.cuda.empty_cache(); gc.collect()

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 800
‚úì 801/900 done
‚úì 802/900 done
‚úì 803/900 done
‚úì 804/900 done
‚úì 805/900 done
‚úì 806/900 done
‚úì 807/900 done
‚úì 808/900 done
‚úì 809/900 done
‚úì 810/900 done
‚úì 811/900 done
‚úì 812/900 done
‚úì 813/900 done
‚úì 814/900 done
‚úì 815/900 done
‚úì 816/900 done
‚úì 817/900 done
‚úì 818/900 done
‚úì 819/900 done
‚úì 820/900 done
‚úì 821/900 done
‚úì 822/900 done
‚úì 823/900 done
‚úì 824/900 done
‚úì 825/900 done
‚úì 826/900 done
‚úì 827/900 done
‚úì 828/900 done
‚úì 829/900 done
‚úì 830/900 done
‚úì 831/900 done
‚úì 832/900 done
‚úì 833/900 done
‚úì 834/900 done
‚úì 835/900 done
‚úì 836/900 done
‚úì 837/900 done
‚úì 838/900 done
‚úì 839/900 done
‚úì 840/900 done
‚úì 841/900 done
‚úì 842/900 done
‚úì 843/900 done
‚úì 844/900 done
‚úì 845/900 done
‚úì 846/900 done
‚úì 847/900 done
‚úì 848/900 done
‚úì 849/900 done
‚úì 850/900 done
‚úì 851/900 done
‚úì 852/900 done
‚úì 853/900 done
‚úì 854/900 done
‚úì 855/900 done
‚úì 856/900 done
‚úì 857/900 done
‚ú

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "val"
LIMIT = 1000
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# Tokenizer and model come from the same checkpoint. Weights are loaded in
# float16 when CUDA is available and float32 otherwise, then moved to `device`.
# NOTE(review): Pegasus is reported to be unreliable in float16 — confirm the
# generated summaries are sane, or consider float32/bfloat16.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16
    ).to(device)
else:
    model = PegasusForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32
    ).to(device)
# Put the module in evaluation mode; this cell only runs inference.
model.eval()

# ---------------- LOAD DATA ----------------
# More than one file can match "<SPLIT>*src*" (e.g. a raw source next to a
# ".cleaned" one) and os.walk order is not guaranteed, so collect every
# candidate and choose deterministically instead of keeping the last match.
src_candidates, tgt_candidates = [], []
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_candidates.append(os.path.join(root, f))
        if "tgt" in f:
            tgt_candidates.append(os.path.join(root, f))

# Prefer the ".cleaned" source when present; otherwise the first path in
# sorted order.
cleaned_src = sorted(p for p in src_candidates if p.endswith(".cleaned"))
src_file = (cleaned_src or sorted(src_candidates) or [None])[0]
tgt_file = (sorted(tgt_candidates) or [None])[0]
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find {SPLIT} source/target files under {DATA_DIR}"
    )

# One example per line. Iterate the file instead of using str.splitlines():
# the latter also breaks on characters such as \x0b, \x0c, \x85 and \u2028
# inside an article, which would shift articles out of step with references.
with open(src_file, encoding="utf-8") as fh:
    articles = [line.rstrip("\n") for line in fh][:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = [line.rstrip("\n") for line in fh][:LIMIT]

# Keep both lists the same length so references[i] exists for every article.
n_pairs = min(len(articles), len(references))
articles, references = articles[:n_pairs], references[:n_pairs]

# ---------------- RESUME SUPPORT ----------------
# The row count of an existing output CSV is taken as the index of the next
# sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Break one sample string into its individual documents.

    Documents are separated by the literal ``|||||`` marker. Each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are dropped.
    """
    pieces = []
    for raw_piece in sample.split("|||||"):
        cleaned = raw_piece.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

def summarize(texts, max_len, min_len):
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: List of input strings. Inputs are tokenized with truncation
            at 1024 tokens and padded to a common length.
        max_len: Passed to ``generate`` as ``max_length``.
        min_len: Passed to ``generate`` as ``min_length``.

    Returns:
        The decoded summaries (special tokens removed) produced by beam
        search for the batch; an empty list when ``texts`` is empty.
    """
    # Nothing to encode or generate for an empty batch; return early rather
    # than hand the tokenizer an empty list.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)
    # Drop the tensor references before clearing the CUDA cache so their
    # memory can be released between calls.
    del inputs, ids
    torch.cuda.empty_cache(); gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Process every remaining sample and append its row to SAVE_PATH right away,
# so a crash or disconnect loses at most the sample in flight.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # A sample with no usable documents would hand the tokenizer an empty
        # batch. A row is still written (with an empty summary) because resume
        # relies on the CSV row count matching the sample index.
        final_summary = ""

    row = pd.DataFrame([{
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }])

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    row.to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )

    print(f"‚úì {i+1}/{len(articles)} done")

    torch.cuda.empty_cache(); gc.collect()

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 900
‚úì 901/1000 done
‚úì 902/1000 done
‚úì 903/1000 done
‚úì 904/1000 done
‚úì 905/1000 done
‚úì 906/1000 done
‚úì 907/1000 done
‚úì 908/1000 done
‚úì 909/1000 done
‚úì 910/1000 done
‚úì 911/1000 done
‚úì 912/1000 done
‚úì 913/1000 done
‚úì 914/1000 done
‚úì 915/1000 done
‚úì 916/1000 done
‚úì 917/1000 done
‚úì 918/1000 done
‚úì 919/1000 done
‚úì 920/1000 done
‚úì 921/1000 done
‚úì 922/1000 done
‚úì 923/1000 done
‚úì 924/1000 done
‚úì 925/1000 done
‚úì 926/1000 done
‚úì 927/1000 done
‚úì 928/1000 done
‚úì 929/1000 done
‚úì 930/1000 done
‚úì 931/1000 done
‚úì 932/1000 done
‚úì 933/1000 done
‚úì 934/1000 done
‚úì 935/1000 done
‚úì 936/1000 done
‚úì 937/1000 done
‚úì 938/1000 done
‚úì 939/1000 done
‚úì 940/1000 done
‚úì 941/1000 done
‚úì 942/1000 done
‚úì 943/1000 done
‚úì 944/1000 done
‚úì 945/1000 done
‚úì 946/1000 done
‚úì 947/1000 done
‚úì 948/1000 done
‚úì 949/1000 done
‚úì 950/1000 done
‚úì 951/1000 done
‚úì 952/1000 done
‚úì 953/1000 done
‚úì 954/1000 done

test

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "test"
LIMIT = 200  # max number of samples (lines) taken from the split
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# NOTE(review): the Hugging Face Pegasus docs list FP16 as unsupported --
# confirm the float16 path yields sane summaries on GPU, or change the dtype.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
model = PegasusForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to(device)
model.eval()

# ---------------- LOAD DATA ----------------
# Locate the split's source ("src") and target ("tgt") files under DATA_DIR.
src_file, tgt_file = None, None
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_file = os.path.join(root, f)
        if "tgt" in f:
            tgt_file = os.path.join(root, f)

# Fail with a clear message instead of letting open(None) raise a TypeError.
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find '{SPLIT}' src/tgt files under {DATA_DIR}"
    )

# Context managers close the handles; explicit encoding avoids depending on
# the platform default.
with open(src_file, encoding="utf-8") as fh:
    articles = fh.read().splitlines()[:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = fh.read().splitlines()[:LIMIT]

# Samples and references are paired by index, so their counts must agree.
if len(articles) != len(references):
    raise ValueError(
        f"{src_file} and {tgt_file} yielded different sample counts "
        f"({len(articles)} vs {len(references)})"
    )

# ---------------- RESUME SUPPORT ----------------
# The output CSV receives one row per processed sample, so its row count is
# the index of the next sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Split one sample on the "|||||" separator into its non-empty documents.

    Each piece is whitespace-stripped; pieces that are empty after stripping
    are dropped.
    """
    stripped_parts = (part.strip() for part in sample.split("|||||"))
    return [doc for doc in stripped_parts if doc]

def summarize(texts: list[str], max_len: int, min_len: int) -> list[str]:
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: Input strings; each is truncated to 1024 tokens and the batch
            is padded to a common length.
        max_len: Passed to ``model.generate`` as ``max_length``.
        min_len: Passed to ``model.generate`` as ``min_length``.

    Returns:
        The decoded generations (special tokens removed). An empty ``texts``
        yields an empty list without touching the tokenizer or model.
    """
    # An empty batch cannot be tokenized/generated; short-circuit instead.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)

    # Drop tensor references before flushing the CUDA cache so the memory can
    # actually be released between calls.
    del inputs, ids
    torch.cuda.empty_cache()
    gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Two-stage summarization per sample: summarize every document on its own,
# then summarize the concatenation of those summaries. Each finished sample
# is appended to SAVE_PATH immediately so an interrupted run can resume.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # Blank sample: nothing to summarize. A row is still written so the
        # CSV row count keeps matching the sample index used for resuming.
        final_summary = ""

    row = {
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }

    print(f"‚úì {i+1}/{len(articles)} done")

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    pd.DataFrame([row]).to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )
    # No extra cache flush here: summarize() already frees GPU memory after
    # every call.

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úì 1/200 done
‚úì 2/200 done
‚úì 3/200 done
‚úì 4/200 done
‚úì 5/200 done
‚úì 6/200 done
‚úì 7/200 done
‚úì 8/200 done
‚úì 9/200 done
‚úì 10/200 done
‚úì 11/200 done
‚úì 12/200 done
‚úì 13/200 done
‚úì 14/200 done
‚úì 15/200 done
‚úì 16/200 done
‚úì 17/200 done
‚úì 18/200 done
‚úì 19/200 done
‚úì 20/200 done
‚úì 21/200 done
‚úì 22/200 done
‚úì 23/200 done
‚úì 24/200 done
‚úì 25/200 done
‚úì 26/200 done
‚úì 27/200 done
‚úì 28/200 done
‚úì 29/200 done
‚úì 30/200 done
‚úì 31/200 done
‚úì 32/200 done
‚úì 33/200 done
‚úì 34/200 done
‚úì 35/200 done
‚úì 36/200 done
‚úì 37/200 done
‚úì 38/200 done
‚úì 39/200 done
‚úì 40/200 done
‚úì 41/200 done
‚úì 42/200 done
‚úì 43/200 done
‚úì 44/200 done
‚úì 45/200 done
‚úì 46/200 done
‚úì 47/200 done
‚úì 48/200 done
‚úì 49/200 done
‚úì 50/200 done
‚úì 51/200 done
‚úì 52/200 done
‚úì 53/200 done
‚úì 54/200 done
‚úì 55/200 done
‚úì 56/200 done
‚úì 57/200 done
‚úì 58/200 done
‚úì 59/200 done
‚úì 60/200 done
‚úì 61/200 done
‚úì 62/200 done
‚úì 63/200 done
‚

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "test"
LIMIT = 400  # max number of samples (lines) taken from the split
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# NOTE(review): the Hugging Face Pegasus docs list FP16 as unsupported --
# confirm the float16 path yields sane summaries on GPU, or change the dtype.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
model = PegasusForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to(device)
model.eval()

# ---------------- LOAD DATA ----------------
# Locate the split's source ("src") and target ("tgt") files under DATA_DIR.
src_file, tgt_file = None, None
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_file = os.path.join(root, f)
        if "tgt" in f:
            tgt_file = os.path.join(root, f)

# Fail with a clear message instead of letting open(None) raise a TypeError.
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find '{SPLIT}' src/tgt files under {DATA_DIR}"
    )

# Context managers close the handles; explicit encoding avoids depending on
# the platform default.
with open(src_file, encoding="utf-8") as fh:
    articles = fh.read().splitlines()[:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = fh.read().splitlines()[:LIMIT]

# Samples and references are paired by index, so their counts must agree.
if len(articles) != len(references):
    raise ValueError(
        f"{src_file} and {tgt_file} yielded different sample counts "
        f"({len(articles)} vs {len(references)})"
    )

# ---------------- RESUME SUPPORT ----------------
# The output CSV receives one row per processed sample, so its row count is
# the index of the next sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Split one sample on the "|||||" separator into its non-empty documents.

    Each piece is whitespace-stripped; pieces that are empty after stripping
    are dropped.
    """
    stripped_parts = (part.strip() for part in sample.split("|||||"))
    return [doc for doc in stripped_parts if doc]

def summarize(texts: list[str], max_len: int, min_len: int) -> list[str]:
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: Input strings; each is truncated to 1024 tokens and the batch
            is padded to a common length.
        max_len: Passed to ``model.generate`` as ``max_length``.
        min_len: Passed to ``model.generate`` as ``min_length``.

    Returns:
        The decoded generations (special tokens removed). An empty ``texts``
        yields an empty list without touching the tokenizer or model.
    """
    # An empty batch cannot be tokenized/generated; short-circuit instead.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)

    # Drop tensor references before flushing the CUDA cache so the memory can
    # actually be released between calls.
    del inputs, ids
    torch.cuda.empty_cache()
    gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Two-stage summarization per sample: summarize every document on its own,
# then summarize the concatenation of those summaries. Each finished sample
# is appended to SAVE_PATH immediately so an interrupted run can resume.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # Blank sample: nothing to summarize. A row is still written so the
        # CSV row count keeps matching the sample index used for resuming.
        final_summary = ""

    row = {
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }

    print(f"‚úì {i+1}/{len(articles)} done")

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    pd.DataFrame([row]).to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )
    # No extra cache flush here: summarize() already frees GPU memory after
    # every call.

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 200
‚úì 201/400 done
‚úì 202/400 done
‚úì 203/400 done
‚úì 204/400 done
‚úì 205/400 done
‚úì 206/400 done
‚úì 207/400 done
‚úì 208/400 done
‚úì 209/400 done
‚úì 210/400 done
‚úì 211/400 done
‚úì 212/400 done
‚úì 213/400 done
‚úì 214/400 done
‚úì 215/400 done
‚úì 216/400 done
‚úì 217/400 done
‚úì 218/400 done
‚úì 219/400 done
‚úì 220/400 done
‚úì 221/400 done
‚úì 222/400 done
‚úì 223/400 done
‚úì 224/400 done
‚úì 225/400 done
‚úì 226/400 done
‚úì 227/400 done
‚úì 228/400 done
‚úì 229/400 done
‚úì 230/400 done
‚úì 231/400 done
‚úì 232/400 done
‚úì 233/400 done
‚úì 234/400 done
‚úì 235/400 done
‚úì 236/400 done
‚úì 237/400 done
‚úì 238/400 done
‚úì 239/400 done
‚úì 240/400 done
‚úì 241/400 done
‚úì 242/400 done
‚úì 243/400 done
‚úì 244/400 done
‚úì 245/400 done
‚úì 246/400 done
‚úì 247/400 done
‚úì 248/400 done
‚úì 249/400 done
‚úì 250/400 done
‚úì 251/400 done
‚úì 252/400 done
‚úì 253/400 done
‚úì 254/400 done
‚úì 255/400 done
‚úì 256/400 done
‚úì 257/400 done
‚ú

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "test"
LIMIT = 600  # max number of samples (lines) taken from the split
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# NOTE(review): the Hugging Face Pegasus docs list FP16 as unsupported --
# confirm the float16 path yields sane summaries on GPU, or change the dtype.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
model = PegasusForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to(device)
model.eval()

# ---------------- LOAD DATA ----------------
# Locate the split's source ("src") and target ("tgt") files under DATA_DIR.
src_file, tgt_file = None, None
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_file = os.path.join(root, f)
        if "tgt" in f:
            tgt_file = os.path.join(root, f)

# Fail with a clear message instead of letting open(None) raise a TypeError.
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find '{SPLIT}' src/tgt files under {DATA_DIR}"
    )

# Context managers close the handles; explicit encoding avoids depending on
# the platform default.
with open(src_file, encoding="utf-8") as fh:
    articles = fh.read().splitlines()[:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = fh.read().splitlines()[:LIMIT]

# Samples and references are paired by index, so their counts must agree.
if len(articles) != len(references):
    raise ValueError(
        f"{src_file} and {tgt_file} yielded different sample counts "
        f"({len(articles)} vs {len(references)})"
    )

# ---------------- RESUME SUPPORT ----------------
# The output CSV receives one row per processed sample, so its row count is
# the index of the next sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Split one sample on the "|||||" separator into its non-empty documents.

    Each piece is whitespace-stripped; pieces that are empty after stripping
    are dropped.
    """
    stripped_parts = (part.strip() for part in sample.split("|||||"))
    return [doc for doc in stripped_parts if doc]

def summarize(texts: list[str], max_len: int, min_len: int) -> list[str]:
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: Input strings; each is truncated to 1024 tokens and the batch
            is padded to a common length.
        max_len: Passed to ``model.generate`` as ``max_length``.
        min_len: Passed to ``model.generate`` as ``min_length``.

    Returns:
        The decoded generations (special tokens removed). An empty ``texts``
        yields an empty list without touching the tokenizer or model.
    """
    # An empty batch cannot be tokenized/generated; short-circuit instead.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)

    # Drop tensor references before flushing the CUDA cache so the memory can
    # actually be released between calls.
    del inputs, ids
    torch.cuda.empty_cache()
    gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Two-stage summarization per sample: summarize every document on its own,
# then summarize the concatenation of those summaries. Each finished sample
# is appended to SAVE_PATH immediately so an interrupted run can resume.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # Blank sample: nothing to summarize. A row is still written so the
        # CSV row count keeps matching the sample index used for resuming.
        final_summary = ""

    row = {
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }

    print(f"‚úì {i+1}/{len(articles)} done")

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    pd.DataFrame([row]).to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )
    # No extra cache flush here: summarize() already frees GPU memory after
    # every call.

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 400
‚úì 401/600 done
‚úì 402/600 done
‚úì 403/600 done
‚úì 404/600 done
‚úì 405/600 done
‚úì 406/600 done
‚úì 407/600 done
‚úì 408/600 done
‚úì 409/600 done
‚úì 410/600 done
‚úì 411/600 done
‚úì 412/600 done
‚úì 413/600 done
‚úì 414/600 done
‚úì 415/600 done
‚úì 416/600 done
‚úì 417/600 done
‚úì 418/600 done
‚úì 419/600 done
‚úì 420/600 done
‚úì 421/600 done
‚úì 422/600 done
‚úì 423/600 done
‚úì 424/600 done
‚úì 425/600 done
‚úì 426/600 done
‚úì 427/600 done
‚úì 428/600 done
‚úì 429/600 done
‚úì 430/600 done
‚úì 431/600 done
‚úì 432/600 done
‚úì 433/600 done
‚úì 434/600 done
‚úì 435/600 done
‚úì 436/600 done
‚úì 437/600 done
‚úì 438/600 done
‚úì 439/600 done
‚úì 440/600 done
‚úì 441/600 done
‚úì 442/600 done
‚úì 443/600 done
‚úì 444/600 done
‚úì 445/600 done
‚úì 446/600 done
‚úì 447/600 done
‚úì 448/600 done
‚úì 449/600 done
‚úì 450/600 done
‚úì 451/600 done
‚úì 452/600 done
‚úì 453/600 done
‚úì 454/600 done
‚úì 455/600 done
‚úì 456/600 done
‚úì 457/600 done
‚ú

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "test"
LIMIT = 800  # max number of samples (lines) taken from the split
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# NOTE(review): the Hugging Face Pegasus docs list FP16 as unsupported --
# confirm the float16 path yields sane summaries on GPU, or change the dtype.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
model = PegasusForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to(device)
model.eval()

# ---------------- LOAD DATA ----------------
# Locate the split's source ("src") and target ("tgt") files under DATA_DIR.
src_file, tgt_file = None, None
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_file = os.path.join(root, f)
        if "tgt" in f:
            tgt_file = os.path.join(root, f)

# Fail with a clear message instead of letting open(None) raise a TypeError.
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find '{SPLIT}' src/tgt files under {DATA_DIR}"
    )

# Context managers close the handles; explicit encoding avoids depending on
# the platform default.
with open(src_file, encoding="utf-8") as fh:
    articles = fh.read().splitlines()[:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = fh.read().splitlines()[:LIMIT]

# Samples and references are paired by index, so their counts must agree.
if len(articles) != len(references):
    raise ValueError(
        f"{src_file} and {tgt_file} yielded different sample counts "
        f"({len(articles)} vs {len(references)})"
    )

# ---------------- RESUME SUPPORT ----------------
# The output CSV receives one row per processed sample, so its row count is
# the index of the next sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Split one sample on the "|||||" separator into its non-empty documents.

    Each piece is whitespace-stripped; pieces that are empty after stripping
    are dropped.
    """
    stripped_parts = (part.strip() for part in sample.split("|||||"))
    return [doc for doc in stripped_parts if doc]

def summarize(texts: list[str], max_len: int, min_len: int) -> list[str]:
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: Input strings; each is truncated to 1024 tokens and the batch
            is padded to a common length.
        max_len: Passed to ``model.generate`` as ``max_length``.
        min_len: Passed to ``model.generate`` as ``min_length``.

    Returns:
        The decoded generations (special tokens removed). An empty ``texts``
        yields an empty list without touching the tokenizer or model.
    """
    # An empty batch cannot be tokenized/generated; short-circuit instead.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)

    # Drop tensor references before flushing the CUDA cache so the memory can
    # actually be released between calls.
    del inputs, ids
    torch.cuda.empty_cache()
    gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Two-stage summarization per sample: summarize every document on its own,
# then summarize the concatenation of those summaries. Each finished sample
# is appended to SAVE_PATH immediately so an interrupted run can resume.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # Blank sample: nothing to summarize. A row is still written so the
        # CSV row count keeps matching the sample index used for resuming.
        final_summary = ""

    row = {
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }

    print(f"‚úì {i+1}/{len(articles)} done")

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    pd.DataFrame([row]).to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )
    # No extra cache flush here: summarize() already frees GPU memory after
    # every call.

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 600
‚úì 601/800 done
‚úì 602/800 done
‚úì 603/800 done
‚úì 604/800 done
‚úì 605/800 done
‚úì 606/800 done
‚úì 607/800 done
‚úì 608/800 done
‚úì 609/800 done
‚úì 610/800 done
‚úì 611/800 done
‚úì 612/800 done
‚úì 613/800 done
‚úì 614/800 done
‚úì 615/800 done
‚úì 616/800 done
‚úì 617/800 done
‚úì 618/800 done
‚úì 619/800 done
‚úì 620/800 done
‚úì 621/800 done
‚úì 622/800 done
‚úì 623/800 done
‚úì 624/800 done
‚úì 625/800 done
‚úì 626/800 done
‚úì 627/800 done
‚úì 628/800 done
‚úì 629/800 done
‚úì 630/800 done
‚úì 631/800 done
‚úì 632/800 done
‚úì 633/800 done
‚úì 634/800 done
‚úì 635/800 done
‚úì 636/800 done
‚úì 637/800 done
‚úì 638/800 done
‚úì 639/800 done
‚úì 640/800 done
‚úì 641/800 done
‚úì 642/800 done
‚úì 643/800 done
‚úì 644/800 done
‚úì 645/800 done
‚úì 646/800 done
‚úì 647/800 done
‚úì 648/800 done
‚úì 649/800 done
‚úì 650/800 done
‚úì 651/800 done
‚úì 652/800 done
‚úì 653/800 done
‚úì 654/800 done
‚úì 655/800 done
‚úì 656/800 done
‚úì 657/800 done
‚ú

In [None]:
# =============================================================
# TRUE MULTI-DOCUMENT SUMMARIZATION — PEGASUS (RESUMABLE)
# Multi-News | Hierarchical: Doc-level → Fusion
# =============================================================

!pip install transformers rouge-score accelerate --quiet

import torch, gc, os, pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# ---------------- CONFIG ----------------
MODEL_NAME = "google/pegasus-large"
DATA_DIR = "/content/data_multinews"
SPLIT = "test"
LIMIT = 1000  # max number of samples (lines) taken from the split
SAVE_PATH = f"/content/pegasus_multidoc_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------------- MODEL ----------------
# NOTE(review): the Hugging Face Pegasus docs list FP16 as unsupported --
# confirm the float16 path yields sane summaries on GPU, or change the dtype.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_NAME)
model = PegasusForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to(device)
model.eval()

# ---------------- LOAD DATA ----------------
# Locate the split's source ("src") and target ("tgt") files under DATA_DIR.
src_file, tgt_file = None, None
for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f:
            src_file = os.path.join(root, f)
        if "tgt" in f:
            tgt_file = os.path.join(root, f)

# Fail with a clear message instead of letting open(None) raise a TypeError.
if src_file is None or tgt_file is None:
    raise FileNotFoundError(
        f"Could not find '{SPLIT}' src/tgt files under {DATA_DIR}"
    )

# Context managers close the handles; explicit encoding avoids depending on
# the platform default.
with open(src_file, encoding="utf-8") as fh:
    articles = fh.read().splitlines()[:LIMIT]
with open(tgt_file, encoding="utf-8") as fh:
    references = fh.read().splitlines()[:LIMIT]

# Samples and references are paired by index, so their counts must agree.
if len(articles) != len(references):
    raise ValueError(
        f"{src_file} and {tgt_file} yielded different sample counts "
        f"({len(articles)} vs {len(references)})"
    )

# ---------------- RESUME SUPPORT ----------------
# The output CSV receives one row per processed sample, so its row count is
# the index of the next sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    prev_df = pd.read_csv(SAVE_PATH)
    start_idx = len(prev_df)
    print(f"üîÅ Resuming from index {start_idx}")

# ---------------- HELPERS ----------------
def split_docs(sample):
    """Split one sample on the "|||||" separator into its non-empty documents.

    Each piece is whitespace-stripped; pieces that are empty after stripping
    are dropped.
    """
    stripped_parts = (part.strip() for part in sample.split("|||||"))
    return [doc for doc in stripped_parts if doc]

def summarize(texts: list[str], max_len: int, min_len: int) -> list[str]:
    """Summarize a batch of texts with the module-level Pegasus model.

    Args:
        texts: Input strings; each is truncated to 1024 tokens and the batch
            is padded to a common length.
        max_len: Passed to ``model.generate`` as ``max_length``.
        min_len: Passed to ``model.generate`` as ``min_length``.

    Returns:
        The decoded generations (special tokens removed). An empty ``texts``
        yields an empty list without touching the tokenizer or model.
    """
    # An empty batch cannot be tokenized/generated; short-circuit instead.
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        max_length=1024,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        ids = model.generate(
            **inputs,
            max_length=max_len,
            min_length=min_len,
            num_beams=4
        )

    out = tokenizer.batch_decode(ids, skip_special_tokens=True)

    # Drop tensor references before flushing the CUDA cache so the memory can
    # actually be released between calls.
    del inputs, ids
    torch.cuda.empty_cache()
    gc.collect()
    return out

# ---------------- PIPELINE ----------------
# Two-stage summarization per sample: summarize every document on its own,
# then summarize the concatenation of those summaries. Each finished sample
# is appended to SAVE_PATH immediately so an interrupted run can resume.
for i in range(start_idx, len(articles)):
    sample = articles[i]
    docs = split_docs(sample)

    if docs:
        # Stage 1: document-level summaries
        doc_summaries = summarize(docs, max_len=256, min_len=60)

        # Stage 2: fusion summary
        fused_text = " ".join(doc_summaries)
        final_summary = summarize([fused_text], max_len=150, min_len=60)[0]
    else:
        # Blank sample: nothing to summarize. A row is still written so the
        # CSV row count keeps matching the sample index used for resuming.
        final_summary = ""

    row = {
        "document": sample,
        "reference": references[i],
        "summary": final_summary
    }

    print(f"‚úì {i+1}/{len(articles)} done")

    # save every sample (safe for Colab crashes); the header is written only
    # when the file does not exist yet
    pd.DataFrame([row]).to_csv(
        SAVE_PATH,
        mode="a",
        header=not os.path.exists(SAVE_PATH),
        index=False
    )
    # No extra cache flush here: summarize() already frees GPU memory after
    # every call.

print("‚úÖ Summarization complete. Saved to:", SAVE_PATH)


Device: cuda


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Resuming from index 800
‚úì 801/1000 done
‚úì 802/1000 done
‚úì 803/1000 done
‚úì 804/1000 done
‚úì 805/1000 done
‚úì 806/1000 done
‚úì 807/1000 done
‚úì 808/1000 done
‚úì 809/1000 done
‚úì 810/1000 done
‚úì 811/1000 done
‚úì 812/1000 done
‚úì 813/1000 done
‚úì 814/1000 done
‚úì 815/1000 done
‚úì 816/1000 done
‚úì 817/1000 done
‚úì 818/1000 done
‚úì 819/1000 done
‚úì 820/1000 done
‚úì 821/1000 done
‚úì 822/1000 done
‚úì 823/1000 done
‚úì 824/1000 done
‚úì 825/1000 done
‚úì 826/1000 done
‚úì 827/1000 done
‚úì 828/1000 done
‚úì 829/1000 done
‚úì 830/1000 done
‚úì 831/1000 done
‚úì 832/1000 done
‚úì 833/1000 done
‚úì 834/1000 done
‚úì 835/1000 done
‚úì 836/1000 done
‚úì 837/1000 done
‚úì 838/1000 done
‚úì 839/1000 done
‚úì 840/1000 done
‚úì 841/1000 done
‚úì 842/1000 done
‚úì 843/1000 done
‚úì 844/1000 done
‚úì 845/1000 done
‚úì 846/1000 done
‚úì 847/1000 done
‚úì 848/1000 done
‚úì 849/1000 done
‚úì 850/1000 done
‚úì 851/1000 done
‚úì 852/1000 done
‚úì 853/1000 done
‚úì 854/1000 done