In [None]:
# =============================================================
# ✅ Load Official Multi-News Dataset (from Alex-Fabbri repo)
# =============================================================

!apt-get install git -y > /dev/null
!git clone https://github.com/Alex-Fabbri/Multi-News.git
!mkdir -p /content/multinews_data
!mv Multi-News/data/* /content/multinews_data/
!rm -rf Multi-News

import pandas as pd
from datasets import Dataset, DatasetDict

def load_split(src_path, tgt_path):
    """Load a Multi-News split from parallel ``.src`` and ``.tgt`` text files.

    Line *i* of ``src_path`` is paired with line *i* of ``tgt_path``. Every
    line is stripped of surrounding whitespace before it is stored.

    Args:
        src_path: Path of the UTF-8 file holding one source document per line.
        tgt_path: Path of the UTF-8 file holding one summary per line.

    Returns:
        A ``Dataset`` with two columns, ``"document"`` and ``"summary"``.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If the two files do not have the same number of lines.
    """
    with open(src_path, 'r', encoding='utf-8') as fsrc, open(tgt_path, 'r', encoding='utf-8') as ftgt:
        docs = [line.strip() for line in fsrc]
        sums = [line.strip() for line in ftgt]

    # The pairing is purely positional, so differing line counts mean the
    # files cannot be aligned. Fail with a message that names both files.
    if len(docs) != len(sums):
        raise ValueError(
            f"Line count mismatch: {src_path} has {len(docs)} lines but "
            f"{tgt_path} has {len(sums)} lines"
        )

    print(f"✅ Loaded {len(docs)} examples from {src_path}")
    return Dataset.from_pandas(pd.DataFrame({"document": docs, "summary": sums}))

# Build the dataset dict (train / validation / test)
# Split name -> (source file, target file), listed in the order they are loaded.
split_files = (
    ("train", "/content/multinews_data/train.src", "/content/multinews_data/train.tgt"),
    ("validation", "/content/multinews_data/val.src", "/content/multinews_data/val.tgt"),
    ("test", "/content/multinews_data/test.src", "/content/multinews_data/test.tgt"),
)

# Load every split and gather them under their split names.
dataset = DatasetDict({
    split_name: load_split(src_file, tgt_file)
    for split_name, src_file, tgt_file in split_files
})

print(dataset)


fatal: destination path 'Multi-News' already exists and is not an empty directory.
mv: cannot stat 'Multi-News/data/*': No such file or directory


FileNotFoundError: [Errno 2] No such file or directory: '/content/multinews_data/train.src'

In [None]:
# =============================================================
# LED (Longformer Encoder–Decoder) Summarization - Multi-News
# ✅ Uses working mirror (knkarthick/multi_news_parquet)
# ✅ Automatically falls back to CNN/DailyMail if offline
# ✅ Saves progress to Google Drive with resume support
# =============================================================

!pip install transformers datasets rouge-score accelerate --quiet

import torch, gc, os, pandas as pd, sys
from transformers import AutoTokenizer, LEDForConditionalGeneration
from datasets import load_dataset

# =============================================================
# Mount Google Drive
# =============================================================
from google.colab import drive
drive.mount('/content/drive')

# =============================================================
# Config
# =============================================================
MODEL_NAME = "allenai/led-large-16384"
BATCH_SIZE = 1
CHUNK_SIZE = 50
LIMIT = 300                      # process fewer for speed
SPLIT = "validation"
SAVE_PATH = "/content/drive/MyDrive/summaries_led_multinews_val.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", device)

# =============================================================
# Load model + tokenizer
# =============================================================
# Release whatever an earlier run of this cell may have left behind.
gc.collect(); torch.cuda.empty_cache()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# First attempt: load on `device` (half precision when CUDA is available).
gpu_load_error = None
try:
    model = LEDForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True
    ).to(device)
except RuntimeError as err:
    # Only record the failure here. While the `except` clause is active the
    # traceback keeps the frames of the failed load (and the tensors they
    # reference) alive, so the fallback must run after the clause has ended.
    gpu_load_error = f"{type(err).__name__}: {err}"

if gpu_load_error is not None:
    print("⚠️ GPU OOM → switching to CPU")
    # Show the real error: any RuntimeError lands here, not only OOM.
    print(f"   Reason: {gpu_load_error}")
    gc.collect(); torch.cuda.empty_cache()
    model = LEDForConditionalGeneration.from_pretrained(MODEL_NAME).to("cpu")
    device = torch.device("cpu")

# Inference only: put the model in evaluation mode.
model.eval()

# =============================================================
# Load Multi-News (working mirror or fallback)
# =============================================================
print("📥 Loading Multi-News dataset (knkarthick/multi_news_parquet)...")

try:
    ds = load_dataset("knkarthick/multi_news_parquet")
    print("✅ Loaded Multi-News Parquet version.")
except Exception as e:
    print("⚠️ Could not load Multi-News. Using CNN/DailyMail fallback.")
    # Surface the cause instead of hiding it, so the failure can be diagnosed.
    print(f"   Reason: {type(e).__name__}: {e}")
    ds = load_dataset("cnn_dailymail", "3.0.0")
    # Keep fallback results out of the Multi-News CSV. The resume logic counts
    # rows in SAVE_PATH, so mixing two datasets in one file would both corrupt
    # the output and skip samples.
    SAVE_PATH = SAVE_PATH.replace("multinews", "cnn_dailymail")
    print(f"   Fallback results will be saved to: {SAVE_PATH}")

print(ds)

# Choose split safely
if SPLIT not in ds:
    SPLIT = "validation"

# Ensure correct field names: Multi-News uses document/summary, the
# CNN/DailyMail fallback uses article/highlights.
split_columns = ds[SPLIT].column_names
if "document" in split_columns:
    text_column, reference_column = "document", "summary"
elif "article" in split_columns:
    text_column, reference_column = "article", "highlights"
else:
    raise ValueError("Dataset does not contain expected text fields.")

# Slice the rows first so only the first LIMIT examples are materialised,
# rather than loading the entire column and slicing afterwards.
limited_rows = ds[SPLIT][:LIMIT]
articles = limited_rows[text_column]
references = limited_rows[reference_column]

print(f"📚 Loaded {len(articles)} samples from split: {SPLIT}")

# =============================================================
# Summarization helper
# =============================================================
def summarize_batch(texts, max_input_length=16384, max_summary_length=512,
                    min_summary_length=80, num_beams=2):
    """Summarise a batch of documents with the module-level LED model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: List of document strings to summarise.
        max_input_length: Inputs are truncated to this many tokens.
        max_summary_length: ``max_length`` passed to ``model.generate``.
        min_summary_length: ``min_length`` passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        A list with one decoded summary string per input text; an empty list
        when ``texts`` is empty.
    """
    # Nothing to do for an empty batch; skip tokenising and generating.
    if len(texts) == 0:
        return []

    inputs = tokenizer(
        texts,
        max_length=max_input_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # The LED documentation advises, for summarization, putting global
    # attention on the first token only. Without an explicit mask no token
    # receives global attention.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            global_attention_mask=global_attention_mask,
            max_length=max_summary_length,
            min_length=min_summary_length,
            num_beams=num_beams,
            early_stopping=True
        )

    outputs = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    # Drop the tensors before asking the allocator to release cached blocks,
    # so memory is returned between documents of very different length.
    del inputs, global_attention_mask, summary_ids
    gc.collect(); torch.cuda.empty_cache()
    return outputs

# =============================================================
# Save progress to Drive
# =============================================================
def save_progress(articles, refs, summaries, save_path):
    """Append one chunk of results to the CSV file at ``save_path``.

    Args:
        articles: Source documents of the chunk.
        refs: Reference summaries, aligned with ``articles``.
        summaries: Generated summaries, aligned with ``articles``.
        save_path: CSV file to append to; created if it does not exist.

    The rows are written under the columns ``document``, ``reference`` and
    ``summary``. The header row is written only when the file is new or empty.
    """
    df = pd.DataFrame({
        "document": articles,
        "reference": refs,
        "summary": summaries
    })
    # A file that exists but holds zero bytes still needs the header row;
    # otherwise the first data row would later be read back as the header.
    header = not os.path.exists(save_path) or os.path.getsize(save_path) == 0
    df.to_csv(save_path, mode='a', header=header, index=False)

# =============================================================
# Resume support
# =============================================================
# Rows already present in SAVE_PATH count as finished samples, so the row
# count of the existing CSV is the index of the next sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    try:
        df_prev = pd.read_csv(SAVE_PATH)
        start_idx = len(df_prev)
    except pd.errors.EmptyDataError:
        # A zero-byte file holds no finished samples: start from the beginning.
        start_idx = 0
    print(f"🔁 Resuming from index {start_idx}")

# =============================================================
# Main summarization loop
# =============================================================
# Number of chunks still to do (never negative, even if the CSV already
# holds more rows than there are articles).
total_chunks = (max(len(articles) - start_idx, 0) + CHUNK_SIZE - 1) // CHUNK_SIZE
chunk_no = 1

for i in range(start_idx, len(articles), CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, len(articles))
    print(f"\n🚀 Chunk {chunk_no}/{total_chunks} → Samples {i}–{end}")

    batch_articles = articles[i:end]
    batch_refs = references[i:end]
    all_summaries = []

    total_batches = (len(batch_articles) + BATCH_SIZE - 1) // BATCH_SIZE

    for j in range(0, len(batch_articles), BATCH_SIZE):
        sub_batch = batch_articles[j:j + BATCH_SIZE]
        batch_no = (j // BATCH_SIZE) + 1

        # Only note the OOM inside the handler. While the `except` clause is
        # active the traceback keeps the failed call's frames (and their
        # tensors) alive, so the retry must run after the clause has ended
        # for the cleanup below to free anything.
        hit_oom = False
        try:
            batch_summaries = summarize_batch(sub_batch)
        except torch.cuda.OutOfMemoryError:
            hit_oom = True

        if hit_oom:
            print("⚠️ OOM, retrying after clearing cache...")
            gc.collect(); torch.cuda.empty_cache()
            # A second failure propagates and stops the run; chunks that were
            # already saved stay in SAVE_PATH for the next resume.
            batch_summaries = summarize_batch(sub_batch)

        all_summaries.extend(batch_summaries)
        sys.stdout.write(f"\r[Batch {batch_no}/{total_batches}]")
        sys.stdout.flush()

    print()
    # Persist once per chunk; an interruption loses at most the current chunk.
    save_progress(batch_articles, batch_refs, all_summaries, SAVE_PATH)
    print(f"💾 Saved chunk {chunk_no}/{total_chunks} ({end}/{len(articles)} done)")

    chunk_no += 1
    gc.collect(); torch.cuda.empty_cache()

print("\n🎉 All Multi-News summaries complete! Saved to:", SAVE_PATH)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Using device: cuda
📥 Loading Multi-News dataset (knkarthick/multi_news_parquet)...
⚠️ Could not load Multi-News. Using CNN/DailyMail fallback.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# =============================================================
# LED (Longformer Encoder–Decoder) Summarization - Custom Drive ZIP
# ✅ Loads val.src.cleaned and val.tgt from ZIP in Google Drive
# ✅ Works even if ZIP already extracted
# ✅ Saves summaries to Google Drive with resume support
# =============================================================

!pip install transformers datasets rouge-score accelerate --quiet

import torch, gc, os, pandas as pd, sys, zipfile, glob
from transformers import AutoTokenizer, LEDForConditionalGeneration

# =============================================================
# Mount Google Drive
# =============================================================
from google.colab import drive
drive.mount('/content/drive')

# =============================================================
# Config
# =============================================================
MODEL_NAME = "allenai/led-large-16384"
BATCH_SIZE = 1
CHUNK_SIZE = 50
LIMIT = 500                        # Process fewer for faster testing
SPLIT = "val"                      # 👈 Focus only on validation split
ZIP_PATH = "/content/drive/MyDrive/multi_news.zip"
EXTRACT_DIR = "/content/data_multinews"
SAVE_PATH = f"/content/drive/MyDrive/summaries_led_multinews_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", device)

# =============================================================
# Load model + tokenizer
# =============================================================
# Release whatever an earlier run of this cell may have left behind.
gc.collect(); torch.cuda.empty_cache()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# First attempt: load on `device` (half precision when CUDA is available).
gpu_load_error = None
try:
    model = LEDForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True
    ).to(device)
except RuntimeError as err:
    # Only record the failure here. While the `except` clause is active the
    # traceback keeps the frames of the failed load (and the tensors they
    # reference) alive, so the fallback must run after the clause has ended.
    gpu_load_error = f"{type(err).__name__}: {err}"

if gpu_load_error is not None:
    print("⚠️ GPU OOM → switching to CPU")
    # Show the real error: any RuntimeError lands here, not only OOM.
    print(f"   Reason: {gpu_load_error}")
    gc.collect(); torch.cuda.empty_cache()
    model = LEDForConditionalGeneration.from_pretrained(MODEL_NAME).to("cpu")
    device = torch.device("cpu")

# Inference only: put the model in evaluation mode.
model.eval()

# =============================================================
# Load dataset from ZIP (val.src.cleaned + val.tgt)
# =============================================================
# Unpack the archive once; later runs reuse the extracted folder.
if not os.path.exists(EXTRACT_DIR):
    print("📦 Extracting dataset from Drive ZIP...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("✅ Extraction complete!")
else:
    print("♻️ Using already extracted data folder.")

# Find the source file (<SPLIT>*src*.cleaned) and target file (<SPLIT>*tgt*)
# anywhere below EXTRACT_DIR. If several files match, the last one visited wins.
src_file, tgt_file = None, None
for root, dirs, files in os.walk(EXTRACT_DIR):
    for f in files:
        if f.startswith(SPLIT) and "src" in f and f.endswith(".cleaned"):
            src_file = os.path.join(root, f)
        elif f.startswith(SPLIT) and "tgt" in f:
            tgt_file = os.path.join(root, f)

if not src_file or not tgt_file:
    # Name the files for the configured split rather than always saying "val".
    raise ValueError(f"❌ Could not find {SPLIT}.src.cleaned or {SPLIT}.tgt in extracted ZIP!")

print(f"✅ Found source file: {src_file}")
print(f"✅ Found target file: {tgt_file}")

# Load lines from files: one stripped example per line, paired by position.
with open(src_file, 'r', encoding='utf-8') as f:
    articles = [line.strip() for line in f]

with open(tgt_file, 'r', encoding='utf-8') as f:
    references = [line.strip() for line in f]

# Differing line counts mean the positional pairing may be off. Clipping
# below hides that, so say so explicitly before continuing.
if len(articles) != len(references):
    print(
        f"⚠️ Line count mismatch: {len(articles)} source lines vs "
        f"{len(references)} target lines; pairs may be misaligned."
    )

# Clip to same length + limit
min_len = min(len(articles), len(references))
articles = articles[:min_len][:LIMIT]
references = references[:min_len][:LIMIT]

print(f"📚 Loaded {len(articles)} validation samples.")

# =============================================================
# Summarization helper
# =============================================================
def summarize_batch(texts, max_input_length=16384, max_summary_length=512,
                    min_summary_length=80, num_beams=2):
    """Summarise a batch of documents with the module-level LED model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: List of document strings to summarise.
        max_input_length: Inputs are truncated to this many tokens.
        max_summary_length: ``max_length`` passed to ``model.generate``.
        min_summary_length: ``min_length`` passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        A list with one decoded summary string per input text; an empty list
        when ``texts`` is empty.
    """
    # Nothing to do for an empty batch; skip tokenising and generating.
    if len(texts) == 0:
        return []

    inputs = tokenizer(
        texts,
        max_length=max_input_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # The LED documentation advises, for summarization, putting global
    # attention on the first token only. Without an explicit mask no token
    # receives global attention.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            global_attention_mask=global_attention_mask,
            max_length=max_summary_length,
            min_length=min_summary_length,
            num_beams=num_beams,
            early_stopping=True
        )

    outputs = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    # Drop the tensors before asking the allocator to release cached blocks,
    # so memory is returned between documents of very different length.
    del inputs, global_attention_mask, summary_ids
    gc.collect(); torch.cuda.empty_cache()
    return outputs

# =============================================================
# Save progress
# =============================================================
def save_progress(articles, refs, summaries, save_path):
    """Append one chunk of results to the CSV file at ``save_path``.

    Args:
        articles: Source documents of the chunk.
        refs: Reference summaries, aligned with ``articles``.
        summaries: Generated summaries, aligned with ``articles``.
        save_path: CSV file to append to; created if it does not exist.

    The rows are written under the columns ``document``, ``reference`` and
    ``summary``. The header row is written only when the file is new or empty.
    """
    df_out = pd.DataFrame({
        "document": articles,
        "reference": refs,
        "summary": summaries
    })
    # A file that exists but holds zero bytes still needs the header row;
    # otherwise the first data row would later be read back as the header.
    header = not os.path.exists(save_path) or os.path.getsize(save_path) == 0
    df_out.to_csv(save_path, mode='a', header=header, index=False)

# =============================================================
# Resume support
# =============================================================
# Rows already present in SAVE_PATH count as finished samples, so the row
# count of the existing CSV is the index of the next sample to process.
start_idx = 0
if os.path.exists(SAVE_PATH):
    try:
        df_prev = pd.read_csv(SAVE_PATH)
        start_idx = len(df_prev)
    except pd.errors.EmptyDataError:
        # A zero-byte file holds no finished samples: start from the beginning.
        start_idx = 0
    print(f"🔁 Resuming from index {start_idx}")

# =============================================================
# Main summarization loop
# =============================================================
# Number of chunks still to do (never negative, even if the CSV already
# holds more rows than there are articles).
total_chunks = (max(len(articles) - start_idx, 0) + CHUNK_SIZE - 1) // CHUNK_SIZE
chunk_no = 1

for i in range(start_idx, len(articles), CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, len(articles))
    print(f"\n🚀 Chunk {chunk_no}/{total_chunks} → Samples {i}–{end}")

    batch_articles = articles[i:end]
    batch_refs = references[i:end]
    all_summaries = []

    total_batches = (len(batch_articles) + BATCH_SIZE - 1) // BATCH_SIZE

    for j in range(0, len(batch_articles), BATCH_SIZE):
        sub_batch = batch_articles[j:j + BATCH_SIZE]
        batch_no = (j // BATCH_SIZE) + 1

        # Only note the OOM inside the handler. While the `except` clause is
        # active the traceback keeps the failed call's frames (and their
        # tensors) alive, so the retry must run after the clause has ended
        # for the cleanup below to free anything.
        hit_oom = False
        try:
            batch_summaries = summarize_batch(sub_batch)
        except torch.cuda.OutOfMemoryError:
            hit_oom = True

        if hit_oom:
            print("⚠️ OOM, retrying after clearing cache...")
            gc.collect(); torch.cuda.empty_cache()
            # A second failure propagates and stops the run; chunks that were
            # already saved stay in SAVE_PATH for the next resume.
            batch_summaries = summarize_batch(sub_batch)

        all_summaries.extend(batch_summaries)
        sys.stdout.write(f"\r[Batch {batch_no}/{total_batches}]")
        sys.stdout.flush()

    print()
    # Persist once per chunk; an interruption loses at most the current chunk.
    save_progress(batch_articles, batch_refs, all_summaries, SAVE_PATH)
    print(f"💾 Saved chunk {chunk_no}/{total_chunks} ({end}/{len(articles)} done)")

    chunk_no += 1
    gc.collect(); torch.cuda.empty_cache()

print(f"\n🎉 Validation summarization complete! Saved to: {SAVE_PATH}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Using device: cuda
♻️ Using already extracted data folder.
✅ Found source file: /content/data_multinews/multi_news/val.src.cleaned
✅ Found target file: /content/data_multinews/multi_news/val.tgt
📚 Loaded 500 validation samples.

🚀 Chunk 1/10 → Samples 0–50
[Batch 3/50]

Input ids are automatically padded from 980 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 8479 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 3860 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1791 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 3522 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 607 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 3194 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1657 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 2923 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 1752 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 2607 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 3161 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 2426 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 5275 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1208 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1277 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 2286 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1142 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1783 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 3319 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 1213 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 625 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 499 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 3854 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 3403 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 2170 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 5830 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 1534 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 2635 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 1814 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 2510 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 3455 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1412 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 598 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 3444 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2447 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 2630 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 3063 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 2085 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2306 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 1850 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 3942 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 442 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 3267 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 3851 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 1/10 (50/500 done)


Input ids are automatically padded from 9930 to 10240 to be a multiple of `config.attention_window`: 1024



🚀 Chunk 2/10 → Samples 50–100
[Batch 1/50]

Input ids are automatically padded from 337 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 3381 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 449 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 1608 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 3201 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 5362 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 254 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 1563 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1964 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 2561 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 1975 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 1567 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 3739 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 414 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 6157 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 2313 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 2409 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 3438 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 2857 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1476 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1241 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 957 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 2883 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 2888 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 159 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 1109 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 948 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 3661 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 1873 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 796 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 3327 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 334 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 3205 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 1859 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1561 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 2658 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 2131 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 3511 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 380 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 1658 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 4329 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 751 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 1071 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 5293 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2235 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 714 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 2/10 (100/500 done)

🚀 Chunk 3/10 → Samples 100–150


Input ids are automatically padded from 2090 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 1/50]

Input ids are automatically padded from 2621 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1372 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 1683 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 2580 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 4567 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1327 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 3016 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 3394 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 8085 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1545 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 3315 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 1290 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 1709 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 2406 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 3530 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1597 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 4230 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 906 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 2396 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1763 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 2188 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 3575 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 5800 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 2614 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 2698 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 2427 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 881 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 1913 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 4337 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 1318 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 4425 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 646 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 3043 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1009 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 2231 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 2041 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 3543 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 1616 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 1721 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 1282 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 1620 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 2599 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 911 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 2755 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2647 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 5627 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 4110 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 3/10 (150/500 done)

🚀 Chunk 4/10 → Samples 150–200


Input ids are automatically padded from 7305 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 1/50]

Input ids are automatically padded from 2551 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 3095 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 563 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 991 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 2301 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1851 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 4688 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 2587 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1926 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 977 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 341 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 1021 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 1040 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 1519 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 2172 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 5766 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1765 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 3990 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 2530 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1159 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 990 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 1445 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 2279 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 9442 to 10240 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 1451 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 4020 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 576 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 2716 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 3086 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 10067 to 10240 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 2577 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 959 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 2800 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 1056 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 7585 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 820 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1533 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 2455 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2225 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 5431 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 3798 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 478 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2636 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 2436 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 1272 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 748 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 4501 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 3653 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 4356 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 4/10 (200/500 done)


Input ids are automatically padded from 2988 to 3072 to be a multiple of `config.attention_window`: 1024



🚀 Chunk 5/10 → Samples 200–250
[Batch 1/50]

Input ids are automatically padded from 1430 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 3863 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 298 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 2114 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 3234 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 3326 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 8510 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 5092 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1360 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 2174 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 2653 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 292 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 3322 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 8543 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 1694 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 1017 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1133 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 5849 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 942 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 2618 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1235 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 3368 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 1733 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 2550 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 1067 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 473 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 1960 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 2456 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 3747 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 3372 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 1941 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 689 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 3410 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 1872 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 5784 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1695 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 1542 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 1892 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 3841 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 854 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 4148 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 1333 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 314 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 1653 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 3249 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2370 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 4675 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 4631 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 5/10 (250/500 done)

🚀 Chunk 6/10 → Samples 250–300


Input ids are automatically padded from 1029 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 1/50]

Input ids are automatically padded from 994 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1000 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 1330 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 4667 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1202 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 946 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 666 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 2262 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1633 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 4633 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 1197 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 551 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 7872 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 1037 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 1907 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1971 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 2531 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 1446 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 2340 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1267 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 3441 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 1415 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 2342 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 1423 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 1828 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 2044 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 2248 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 1080 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 1686 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 4655 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 662 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 467 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 6210 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 11018 to 11264 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 2735 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 683 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 754 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2134 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 3493 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 4347 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 3024 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2028 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 1908 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 1015 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 2743 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 3679 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 3317 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 6/10 (300/500 done)

🚀 Chunk 7/10 → Samples 300–350


Input ids are automatically padded from 1912 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 3520 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 962 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 565 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1450 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 10332 to 11264 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 192 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1636 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1804 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 2674 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 7625 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 6800 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 1833 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 3064 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 10967 to 11264 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 5356 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 15412 to 16384 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1477 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1991 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 4442 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 2651 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 3508 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 1697 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 1003 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 7907 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 1762 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 543 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 7756 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 3809 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 4746 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 1581 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 3042 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1688 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 1097 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2226 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 3189 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 4825 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2958 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 1897 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 1334 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2596 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 1240 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 7/10 (350/500 done)

🚀 Chunk 8/10 → Samples 350–400


Input ids are automatically padded from 4714 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 1/50]

Input ids are automatically padded from 5655 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 6475 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 1245 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 2401 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 4069 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1232 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 1989 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 7835 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1753 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 2021 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 1521 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 2336 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 2654 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 1641 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 2589 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 2938 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 909 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 747 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 3799 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 9320 to 10240 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 6538 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 4841 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 883 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 153 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 2705 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 3525 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 3310 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 5691 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 1900 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 2442 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 853 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 4029 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1959 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 3719 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 3008 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2380 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 2108 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 2490 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 1594 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2154 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 3151 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 2783 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 8183 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 898 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 1890 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 8/10 (400/500 done)

🚀 Chunk 9/10 → Samples 400–450
[Batch 1/50]

Input ids are automatically padded from 1602 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 2379 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 783 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 1106 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 234 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1614 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 1249 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 1255 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 7257 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 445 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 688 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 6033 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 1855 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 2009 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 949 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1716 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1257 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 3796 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 2237 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 7510 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 1365 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 3868 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 2112 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 514 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 2862 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 3407 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 1849 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 2412 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 917 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 3676 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 2851 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 2464 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 971 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 844 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 1089 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 1483 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 1028 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 8628 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 3209 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 3752 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 4218 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 1179 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2017 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 4272 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 2166 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 9/10 (450/500 done)


Input ids are automatically padded from 453 to 1024 to be a multiple of `config.attention_window`: 1024



🚀 Chunk 10/10 → Samples 450–500
[Batch 1/50]

Input ids are automatically padded from 1824 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1731 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 4255 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 1341 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 1963 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 3244 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 1777 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 1755 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 740 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1819 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 8247 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 458 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 2914 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 465 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 2444 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 2518 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1487 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1247 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 4241 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1595 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 2280 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 501 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 903 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 901 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 1313 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 1210 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 4449 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 2661 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 590 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 1556 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 2460 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 10382 to 11264 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 870 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 2061 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 3558 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 2478 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 2233 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 1834 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2024 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 1969 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 1784 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 1618 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 7727 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 5984 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 1203 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 10/10 (500/500 done)

🎉 Validation summarization complete! Saved to: /content/drive/MyDrive/summaries_led_multinews_val.csv


In [None]:
# =============================================================
# LED (Longformer Encoder–Decoder) Summarization - Custom Drive ZIP
# ✅ Loads val.src.cleaned and val.tgt from ZIP in Google Drive
# ✅ Works even if ZIP already extracted
# ✅ Saves summaries to Google Drive with resume support
# =============================================================

!pip install transformers datasets rouge-score accelerate --quiet

import torch, gc, os, pandas as pd, sys, zipfile, glob
from transformers import AutoTokenizer, LEDForConditionalGeneration

# =============================================================
# Mount Google Drive
# =============================================================
# Colab-only step: mounts Google Drive at /content/drive. ZIP_PATH and
# SAVE_PATH in the config section below both point inside this mount.
from google.colab import drive
drive.mount('/content/drive')

# =============================================================
# Config
# =============================================================
# --- Model / batching -------------------------------------------------------
MODEL_NAME = "allenai/led-large-16384"
BATCH_SIZE = 1      # documents per generate() call
CHUNK_SIZE = 50     # documents summarized between two CSV saves

# --- Data selection ---------------------------------------------------------
LIMIT = 1000        # upper bound on the number of samples processed
SPLIT = "val"       # filename prefix of the split to load (validation)

# --- Paths ------------------------------------------------------------------
ZIP_PATH = "/content/drive/MyDrive/multi_news.zip"
EXTRACT_DIR = "/content/data_multinews"
SAVE_PATH = f"/content/drive/MyDrive/summaries_led_multinews_{SPLIT}.csv"

# --- Device -----------------------------------------------------------------
# Use the GPU when one is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("✅ Using device:", device)

# =============================================================
# Load model + tokenizer
# =============================================================
# Drop cached GPU blocks and unreachable Python objects before loading.
torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision when a GPU is available, full precision otherwise.
if torch.cuda.is_available():
    load_dtype = torch.float16
else:
    load_dtype = torch.float32

try:
    model = LEDForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=load_dtype, low_cpu_mem_usage=True
    ).to(device)
except RuntimeError:
    # Any RuntimeError while loading / moving the model is handled as a GPU
    # out-of-memory condition: reload with default settings on the CPU and
    # point `device` at the CPU so later cells send inputs to the same place.
    print("⚠️ GPU OOM → switching to CPU")
    model = LEDForConditionalGeneration.from_pretrained(MODEL_NAME).to("cpu")
    device = torch.device("cpu")

# Switch the model to evaluation mode for inference.
model.eval()

# =============================================================
# Load dataset from ZIP (val.src.cleaned + val.tgt)
# =============================================================
# Extract the archive unless a previous run already populated EXTRACT_DIR.
# An existing-but-empty folder (e.g. left by an interrupted extraction) is
# treated as "not extracted" so the file lookup below does not fail on it.
if not os.path.isdir(EXTRACT_DIR) or not os.listdir(EXTRACT_DIR):
    print("📦 Extracting dataset from Drive ZIP...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("✅ Extraction complete!")
else:
    print("♻️ Using already extracted data folder.")

# Locate the source (<SPLIT>*src*.cleaned) and target (<SPLIT>*tgt*) files.
# All matches are collected first so the final choice does not depend on the
# directory traversal order when several files match the loose pattern.
src_exact, tgt_exact = f"{SPLIT}.src.cleaned", f"{SPLIT}.tgt"
src_candidates, tgt_candidates = [], []
for root, _dirs, files in os.walk(EXTRACT_DIR):
    for fname in files:
        if not fname.startswith(SPLIT):
            continue
        if "src" in fname and fname.endswith(".cleaned"):
            src_candidates.append(os.path.join(root, fname))
        elif "tgt" in fname:
            tgt_candidates.append(os.path.join(root, fname))

# Prefer the exact expected filename; otherwise take the alphabetically first
# loose match. `default=None` keeps the "not found" case detectable below.
src_file = min(
    src_candidates,
    key=lambda p: (os.path.basename(p) != src_exact, p),
    default=None,
)
tgt_file = min(
    tgt_candidates,
    key=lambda p: (os.path.basename(p) != tgt_exact, p),
    default=None,
)

if not src_file or not tgt_file:
    raise ValueError(f"❌ Could not find {src_exact} or {tgt_exact} in extracted ZIP!")

print(f"✅ Found source file: {src_file}")
print(f"✅ Found target file: {tgt_file}")

# Load one example per line, stripping surrounding whitespace.
with open(src_file, 'r', encoding='utf-8') as f:
    articles = [line.strip() for line in f]

with open(tgt_file, 'r', encoding='utf-8') as f:
    references = [line.strip() for line in f]

# Documents and references are paired by line number, so differing line counts
# usually mean the pairs are misaligned. Keep the original best-effort
# truncation, but make the mismatch visible instead of hiding it.
if len(articles) != len(references):
    print(
        f"⚠️ Line count mismatch: {len(articles)} source lines vs "
        f"{len(references)} target lines. Keeping the first "
        f"{min(len(articles), len(references))} pairs; check file alignment."
    )

# Clip to same length + limit
min_len = min(len(articles), len(references))
articles = articles[:min_len][:LIMIT]
references = references[:min_len][:LIMIT]

print(f"📚 Loaded {len(articles)} validation samples.")

# =============================================================
# Summarization helper
# =============================================================
def summarize_batch(texts):
    """Generate one summary per input document with the loaded LED model.

    Args:
        texts: list of document strings. An empty list returns ``[]``
            without calling the tokenizer or the model.

    Returns:
        List of decoded summary strings (special tokens removed), in the same
        order as ``texts``.

    Notes:
        Inputs are truncated to 16384 tokens and padded to the longest item
        in the batch. Generation uses beam search with 2 beams and a summary
        length between 80 and 512 tokens.
    """
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    inputs = tokenizer(
        texts,
        max_length=16384,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # LED attends only within a local window unless told otherwise. The
    # Hugging Face LED documentation recommends global attention on the first
    # token (<s>) for summarization, so mark position 0 of every sequence.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            global_attention_mask=global_attention_mask,
            max_length=512,
            min_length=80,
            num_beams=2,
            early_stopping=True
        )

    outputs = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    # Drop the tensors and release cached memory before the next batch.
    del inputs, global_attention_mask, summary_ids
    torch.cuda.empty_cache(); gc.collect()
    return outputs

# =============================================================
# Save progress
# =============================================================
def save_progress(articles, refs, summaries, save_path):
    """Append one chunk of results to the progress CSV.

    Args:
        articles: list of source documents in the chunk.
        refs: list of reference summaries, aligned with ``articles``.
        summaries: list of generated summaries, aligned with ``articles``.
        save_path: path of the CSV file to append to.

    The rows are written with columns ``document``, ``reference`` and
    ``summary``. The header row is written only when the file does not exist
    yet or is empty, so repeated calls build a single well-formed CSV.

    Raises:
        ValueError: raised by pandas when the three lists differ in length.
    """
    df_out = pd.DataFrame({
        "document": articles,
        "reference": refs,
        "summary": summaries
    })
    # A zero-byte file (e.g. left by an interrupted write) has no header yet;
    # without one, a later read would take the first data row as column names.
    needs_header = (not os.path.exists(save_path)) or os.path.getsize(save_path) == 0
    df_out.to_csv(save_path, mode='a', header=needs_header, index=False)

# =============================================================
# Resume support
# =============================================================
# Resume from the number of rows already saved. A zero-byte progress file
# (e.g. left by an interrupted write) holds no rows and would make
# pd.read_csv raise EmptyDataError, so it is treated as "nothing done yet".
start_idx = 0
if os.path.exists(SAVE_PATH) and os.path.getsize(SAVE_PATH) > 0:
    df_prev = pd.read_csv(SAVE_PATH)
    start_idx = len(df_prev)
    print(f"🔁 Resuming from index {start_idx}")

# =============================================================
# Main summarization loop
# =============================================================
# Number of chunks still to process (ceiling division over remaining samples).
total_chunks = (len(articles) - start_idx + CHUNK_SIZE - 1) // CHUNK_SIZE
chunk_no = 1

for i in range(start_idx, len(articles), CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, len(articles))
    print(f"\n🚀 Chunk {chunk_no}/{total_chunks} → Samples {i}–{end}")

    batch_articles = articles[i:end]
    batch_refs = references[i:end]
    all_summaries = []

    total_batches = (len(batch_articles) + BATCH_SIZE - 1) // BATCH_SIZE

    for j in range(0, len(batch_articles), BATCH_SIZE):
        sub_batch = batch_articles[j:j + BATCH_SIZE]
        batch_no = (j // BATCH_SIZE) + 1

        # Only record the OOM inside the handler and retry after leaving it:
        # while the except clause is active, the exception's traceback keeps
        # the failed call's tensors alive, so a retry made inside the handler
        # would start with that memory still allocated.
        hit_oom = False
        try:
            batch_summaries = summarize_batch(sub_batch)
        except torch.cuda.OutOfMemoryError:
            hit_oom = True

        if hit_oom:
            print("⚠️ OOM, retrying after clearing cache...")
            # Collect garbage first so freed tensors return their blocks to
            # the caching allocator before the cache itself is emptied.
            gc.collect(); torch.cuda.empty_cache()
            batch_summaries = summarize_batch(sub_batch)

        all_summaries.extend(batch_summaries)
        # Carriage return rewrites the same line to show in-chunk progress.
        sys.stdout.write(f"\r[Batch {batch_no}/{total_batches}]")
        sys.stdout.flush()

    print()
    # Persist the finished chunk so an interrupted run can resume after it.
    save_progress(batch_articles, batch_refs, all_summaries, SAVE_PATH)
    print(f"💾 Saved chunk {chunk_no}/{total_chunks} ({end}/{len(articles)} done)")

    chunk_no += 1
    torch.cuda.empty_cache(); gc.collect()

print(f"\n🎉 Validation summarization complete! Saved to: {SAVE_PATH}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Mounted at /content/drive
✅ Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

📦 Extracting dataset from Drive ZIP...
✅ Extraction complete!
✅ Found source file: /content/data_multinews/multi_news/val.src.cleaned
✅ Found target file: /content/data_multinews/multi_news/val.tgt
📚 Loaded 1000 validation samples.


Input ids are automatically padded from 813 to 1024 to be a multiple of `config.attention_window`: 1024


🔁 Resuming from index 500

🚀 Chunk 1/10 → Samples 500–550
[Batch 1/50]

Input ids are automatically padded from 13415 to 14336 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 4648 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 874 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 2677 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 2750 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 6051 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 2745 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 3392 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 5907 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1463 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 8000 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 3539 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 2041 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 2032 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 1716 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 676 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 6338 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1499 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 2346 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 588 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 3389 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 859 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 2653 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 1442 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 5785 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 3418 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 892 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 2727 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 9141 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 2237 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 2758 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 624 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 494 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1805 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 2133 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 73 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 679 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 1416 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 3069 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 2226 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2917 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 3853 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 7211 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 1027 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2639 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 3500 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 1497 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 1/10 (550/1000 done)


Input ids are automatically padded from 2357 to 3072 to be a multiple of `config.attention_window`: 1024



🚀 Chunk 2/10 → Samples 550–600
[Batch 1/50]

Input ids are automatically padded from 541 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1414 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 1677 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 764 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 11032 to 11264 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 4538 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 1866 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 2794 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 8847 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 8493 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 2308 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 2025 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 1008 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 4514 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 9018 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 1425 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1840 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 2108 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 1237 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1605 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1961 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 783 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 2250 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 628 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 2555 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 2413 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 2154 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 1939 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 3694 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 5505 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 14000 to 14336 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 2207 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 2561 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 4302 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 641 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1432 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 1245 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 4403 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 7124 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 3119 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 2372 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 4237 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 3158 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 3373 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 841 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 1814 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 1908 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 3464 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 2/10 (600/1000 done)


Input ids are automatically padded from 816 to 1024 to be a multiple of `config.attention_window`: 1024



🚀 Chunk 3/10 → Samples 600–650
[Batch 1/50]

Input ids are automatically padded from 4882 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 698 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 12507 to 13312 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 2541 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 4960 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 2587 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 1148 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 1802 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 261 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 547 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 1533 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 2082 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 6018 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 1839 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 594 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 3221 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1828 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 1590 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1308 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 378 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 2848 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 4099 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 2634 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 3554 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 3928 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 8177 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 2466 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 2139 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 2046 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 1143 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 2748 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 1287 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 976 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 3937 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1277 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 1598 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 1734 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 1693 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 1473 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 1709 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 1172 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 1396 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 3183 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 1829 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 1609 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 2505 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 6622 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 3/10 (650/1000 done)


Input ids are automatically padded from 3777 to 4096 to be a multiple of `config.attention_window`: 1024



🚀 Chunk 4/10 → Samples 650–700
[Batch 1/50]

Input ids are automatically padded from 987 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 7161 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 1002 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 1812 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 1790 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 1324 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 2914 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 3014 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1371 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 3148 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 1856 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 11319 to 12288 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 2255 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 1881 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 2069 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 2197 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1295 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1591 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 697 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 3171 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 3295 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 9894 to 10240 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 1522 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 1923 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 3575 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 881 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 481 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 5438 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 397 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 2331 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 635 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 1712 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 1935 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 1464 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 574 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 2071 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1279 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 1201 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2908 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 836 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 4321 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 4254 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2586 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 3871 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 1981 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 1256 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 3146 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 2675 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 4/10 (700/1000 done)


Input ids are automatically padded from 1588 to 2048 to be a multiple of `config.attention_window`: 1024



🚀 Chunk 5/10 → Samples 700–750
[Batch 1/50]

Input ids are automatically padded from 688 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1475 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 4224 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 2472 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 1179 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 935 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 5303 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 924 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 893 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1144 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 5103 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 444 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 2136 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 5808 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 294 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 2285 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 2890 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 4879 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 1470 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 1913 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 406 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 3498 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 4365 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 456 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 3003 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 2926 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 5056 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 324 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 2532 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 1731 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 2790 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 6788 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 503 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 318 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 5032 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 1589 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 1964 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 6584 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 1631 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 312 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 1327 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 3756 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 3331 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 1375 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 6612 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 5/10 (750/1000 done)

🚀 Chunk 6/10 → Samples 750–800

Input ids are automatically padded from 860 to 1024 to be a multiple of `config.attention_window`: 1024



[Batch 1/50]

Input ids are automatically padded from 5996 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1067 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 2297 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 4170 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 2235 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 2674 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 1710 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 4176 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 984 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1377 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 1535 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 1443 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 2583 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 2480 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 888 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 868 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1825 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 3273 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 1487 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1160 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 4003 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 2324 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 1810 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 3123 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 2292 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 1210 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 1938 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 4740 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 1047 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 1236 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 8494 to 9216 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 396 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1772 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1304 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 1214 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2965 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 7570 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 2808 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 4964 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2458 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 5021 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 11952 to 12288 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 1860 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 1843 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 1101 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 2509 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 6/10 (800/1000 done)

🚀 Chunk 7/10 → Samples 800–850


Input ids are automatically padded from 5398 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 1/50]

Input ids are automatically padded from 1893 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1347 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 3460 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 524 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 10004 to 10240 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 5524 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 814 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 326 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1259 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1476 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 741 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 4384 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 700 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 7106 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 3366 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 715 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 1225 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 10795 to 11264 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 236 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 3311 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 5210 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 1253 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 6555 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 911 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 15301 to 15360 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 1269 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 4638 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 1446 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 1054 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 3247 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 463 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 1775 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 461 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1630 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 972 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 476 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2607 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 1202 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 2706 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 875 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 2482 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 2192 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 3882 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 595 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2217 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 11828 to 12288 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 7/10 (850/1000 done)

🚀 Chunk 8/10 → Samples 850–900


Input ids are automatically padded from 3109 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 1/50]

Input ids are automatically padded from 2368 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 3621 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 398 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 2585 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 744 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 3806 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 2296 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 1626 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1231 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 673 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 209 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 4166 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 9393 to 10240 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 1448 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 3601 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 5082 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1003 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 1156 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 401 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 655 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 2042 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 1400 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 2919 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 2877 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 1907 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 1137 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 14511 to 15360 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 865 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 652 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 7274 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 3417 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 8183 to 8192 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 2700 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1713 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1997 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 6596 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 899 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 2425 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 4000 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 1951 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 6073 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 4235 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 955 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 1826 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 1319 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 625 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 8/10 (900/1000 done)

🚀 Chunk 9/10 → Samples 900–950


Input ids are automatically padded from 3997 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1831 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 2291 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 1539 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 6/50]

Input ids are automatically padded from 2288 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 2366 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 3824 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 3474 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1684 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 1682 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 1176 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 3966 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 371 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 3032 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 1496 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 5126 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 3218 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1845 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 22/50]

Input ids are automatically padded from 1426 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 1275 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 685 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 26/50]

Input ids are automatically padded from 6617 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 3444 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 100 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 1536 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 3755 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 3631 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 677 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 525 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 350 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1419 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 2116 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 2575 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 3112 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 743 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 3252 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 3434 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 10810 to 11264 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 832 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2003 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 1369 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 2120 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 9/10 (950/1000 done)

🚀 Chunk 10/10 → Samples 950–1000


Input ids are automatically padded from 13126 to 13312 to be a multiple of `config.attention_window`: 1024


[Batch 1/50]

Input ids are automatically padded from 646 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 2/50]

Input ids are automatically padded from 1112 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 3/50]

Input ids are automatically padded from 3551 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 4/50]

Input ids are automatically padded from 569 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 5/50]

Input ids are automatically padded from 4007 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 7/50]

Input ids are automatically padded from 4946 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 8/50]

Input ids are automatically padded from 3233 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 9/50]

Input ids are automatically padded from 404 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 10/50]

Input ids are automatically padded from 1136 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 11/50]

Input ids are automatically padded from 1779 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 12/50]

Input ids are automatically padded from 257 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 13/50]

Input ids are automatically padded from 2809 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 14/50]

Input ids are automatically padded from 2650 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 15/50]

Input ids are automatically padded from 1569 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 16/50]

Input ids are automatically padded from 3484 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 17/50]

Input ids are automatically padded from 787 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 18/50]

Input ids are automatically padded from 867 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 19/50]

Input ids are automatically padded from 2898 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 20/50]

Input ids are automatically padded from 666 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 21/50]

Input ids are automatically padded from 1488 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 23/50]

Input ids are automatically padded from 6312 to 7168 to be a multiple of `config.attention_window`: 1024


[Batch 24/50]

Input ids are automatically padded from 952 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 25/50]

Input ids are automatically padded from 5474 to 6144 to be a multiple of `config.attention_window`: 1024


[Batch 27/50]

Input ids are automatically padded from 1740 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 28/50]

Input ids are automatically padded from 1681 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 29/50]

Input ids are automatically padded from 2306 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 30/50]

Input ids are automatically padded from 1967 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 31/50]

Input ids are automatically padded from 3408 to 4096 to be a multiple of `config.attention_window`: 1024


[Batch 32/50]

Input ids are automatically padded from 1517 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 33/50]

Input ids are automatically padded from 1696 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 34/50]

Input ids are automatically padded from 1337 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 35/50]

Input ids are automatically padded from 2948 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 36/50]

Input ids are automatically padded from 1543 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 37/50]

Input ids are automatically padded from 1223 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 38/50]

Input ids are automatically padded from 1506 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 39/50]

Input ids are automatically padded from 1722 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 40/50]

Input ids are automatically padded from 2485 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 41/50]

Input ids are automatically padded from 2499 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 42/50]

Input ids are automatically padded from 2337 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 43/50]

Input ids are automatically padded from 855 to 1024 to be a multiple of `config.attention_window`: 1024


[Batch 44/50]

Input ids are automatically padded from 4950 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 45/50]

Input ids are automatically padded from 1465 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 46/50]

Input ids are automatically padded from 2656 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 47/50]

Input ids are automatically padded from 2602 to 3072 to be a multiple of `config.attention_window`: 1024


[Batch 48/50]

Input ids are automatically padded from 4486 to 5120 to be a multiple of `config.attention_window`: 1024


[Batch 49/50]

Input ids are automatically padded from 1034 to 2048 to be a multiple of `config.attention_window`: 1024


[Batch 50/50]
💾 Saved chunk 10/10 (1000/1000 done)

🎉 Validation summarization complete! Saved to: /content/drive/MyDrive/summaries_led_multinews_val.csv


In [None]:
# =============================================================
# LED (Longformer Encoder–Decoder) Summarization - Custom Drive ZIP
# ✅ Loads test.src.cleaned and test.tgt from ZIP in Google Drive
# ✅ Works even if ZIP already extracted
# ✅ Inline progress updates (same-line batch display)
# ✅ Saves summaries to Google Drive with resume support
# =============================================================

!pip install transformers datasets rouge-score accelerate --quiet

import torch, gc, os, pandas as pd, sys, zipfile
from transformers import AutoTokenizer, LEDForConditionalGeneration
import warnings, transformers

# =============================================================
# Mount Google Drive
# =============================================================
# ZIP_PATH and SAVE_PATH below both live under /content/drive, so the
# mount has to happen before anything in this cell touches them.
from google.colab import drive
drive.mount('/content/drive')

# =============================================================
# Config
# =============================================================
MODEL_NAME = "allenai/led-large-16384"
BATCH_SIZE = 1                     # Documents handed to summarize_batch() per call
CHUNK_SIZE = 50                    # Documents summarized between two CSV checkpoints
LIMIT = 500                        # Adjust for speed / testing
SPLIT = "test"                     # 👈 Focus only on TEST split
ZIP_PATH = "/content/drive/MyDrive/multi_news.zip"
EXTRACT_DIR = "/content/data_multinews"
SAVE_PATH = f"/content/drive/MyDrive/summaries_led_multinews_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", device)

# =============================================================
# Load model + tokenizer
# =============================================================
torch.cuda.empty_cache(); gc.collect()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

try:
    # Half precision when a GPU is available, full precision otherwise.
    model = LEDForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True
    ).to(device)
except RuntimeError as load_error:
    # Out-of-memory is the failure this fallback was written for, but ANY
    # RuntimeError lands here, so show the real cause next to the banner
    # instead of letting an unrelated error pass as an OOM.
    print("⚠️ GPU OOM → switching to CPU")
    print(f"   ↳ {type(load_error).__name__}: {load_error}")
    model = LEDForConditionalGeneration.from_pretrained(MODEL_NAME).to("cpu")
    device = torch.device("cpu")

model.eval()

# =============================================================
# Load dataset from ZIP (test.src.cleaned + test.tgt)
# =============================================================
# Extraction is skipped whenever EXTRACT_DIR already exists.
if not os.path.exists(EXTRACT_DIR):
    print("📦 Extracting dataset from Drive ZIP...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("✅ Extraction complete!")
else:
    print("♻️ Using already extracted data folder.")

# Find test.src.cleaned and test.tgt files
# If several files match a pattern, the one os.walk visits last wins.
src_file, tgt_file = None, None
for root, dirs, files in os.walk(EXTRACT_DIR):
    for f in files:
        if f.startswith(SPLIT) and "src" in f and f.endswith(".cleaned"):
            src_file = os.path.join(root, f)
        elif f.startswith(SPLIT) and "tgt" in f:
            tgt_file = os.path.join(root, f)

if not src_file or not tgt_file:
    raise ValueError(f"❌ Could not find {SPLIT}.src.cleaned or {SPLIT}.tgt in extracted ZIP!")

print(f"✅ Found source file: {src_file}")
print(f"✅ Found target file: {tgt_file}")

# Load lines from files: line k of the source is paired with line k of the target
with open(src_file, 'r', encoding='utf-8') as f:
    articles = [line.strip() for line in f]

with open(tgt_file, 'r', encoding='utf-8') as f:
    references = [line.strip() for line in f]

# Pairing is purely positional, so differing line counts mean the two files
# may not describe the same examples. Say so before clipping.
if len(articles) != len(references):
    print(
        f"⚠️ Line count mismatch: {len(articles)} source lines vs "
        f"{len(references)} target lines — clipping to the shorter file."
    )

# Clip to same length + limit
min_len = min(len(articles), len(references))
articles = articles[:min_len][:LIMIT]
references = references[:min_len][:LIMIT]

print(f"📚 Loaded {len(articles)} test samples.")

# =============================================================
# Summarization helper
# =============================================================
def summarize_batch(texts):
    """Summarize a list of documents with the notebook-level LED model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: List of document strings. Each one is truncated to at most
            16384 tokens and the batch is padded to its longest member.

    Returns:
        The list of strings produced by ``tokenizer.batch_decode`` on the
        generated ids, with special tokens removed.
    """
    inputs = tokenizer(
        texts,
        max_length=16384,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # LED uses windowed local attention unless told otherwise. The Hugging Face
    # LED documentation advises giving the first token (<s>) global attention
    # for summarization, so build that mask explicitly.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            global_attention_mask=global_attention_mask,
            max_length=512,
            min_length=80,
            num_beams=2,
            early_stopping=True
        )

    outputs = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    # Drop the tensor references before asking the allocator to release memory.
    del inputs, global_attention_mask, summary_ids
    torch.cuda.empty_cache(); gc.collect()
    return outputs

# =============================================================
# Save progress
# =============================================================
def save_progress(articles, refs, summaries, save_path):
    """Append one chunk of results to the checkpoint CSV at ``save_path``.

    Args:
        articles: Source documents of the chunk (written to ``document``).
        refs: Reference summaries of the chunk (written to ``reference``).
        summaries: Generated summaries of the chunk (written to ``summary``).
        save_path: CSV file to append to. The header row is written only
            when the file does not exist yet.
    """
    write_header = not os.path.exists(save_path)
    chunk_columns = {
        "document": articles,
        "reference": refs,
        "summary": summaries,
    }
    pd.DataFrame(chunk_columns).to_csv(
        save_path,
        mode='a',
        header=write_header,
        index=False,
    )

# =============================================================
# Resume support
# =============================================================
# The number of rows already in the checkpoint CSV is used as the index of
# the first article still to be summarized. This assumes the CSV was written
# by this same loop over the same article order.
start_idx = 0
if os.path.exists(SAVE_PATH):
    df_prev = pd.read_csv(SAVE_PATH)
    start_idx = len(df_prev)
    print(f"🔁 Resuming from index {start_idx}")

# =============================================================
# Clean up noisy LED padding warnings
# =============================================================
warnings.filterwarnings("ignore", message="Input ids are automatically padded")
transformers.logging.set_verbosity_error()

# =============================================================
# Main summarization loop (dynamic same-line batch updates)
# =============================================================
# Ceiling division over the articles that are still pending.
total_chunks = (len(articles) - start_idx + CHUNK_SIZE - 1) // CHUNK_SIZE
chunk_no = 1

for i in range(start_idx, len(articles), CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, len(articles))
    print(f"\n🚀 Chunk {chunk_no}/{total_chunks} → Samples {i}–{end}")

    batch_articles = articles[i:end]
    batch_refs = references[i:end]
    all_summaries = []

    total_batches = (len(batch_articles) + BATCH_SIZE - 1) // BATCH_SIZE

    for j in range(0, len(batch_articles), BATCH_SIZE):
        sub_batch = batch_articles[j:j + BATCH_SIZE]
        batch_no = (j // BATCH_SIZE) + 1

        hit_oom = False
        try:
            batch_summaries = summarize_batch(sub_batch)
        except torch.cuda.OutOfMemoryError:
            # Only record the failure here. While the handler is running, the
            # exception still references the failed call's stack frames (and
            # their tensors), so memory cannot be released until we leave it.
            hit_oom = True

        if hit_oom:
            # Retry exactly once, now that the failed call's tensors can be freed.
            torch.cuda.empty_cache(); gc.collect()
            batch_summaries = summarize_batch(sub_batch)

        all_summaries.extend(batch_summaries)

        # ✅ overwrite the same line (no newlines for each batch)
        sys.stdout.write(f"\r[Chunk {chunk_no}/{total_chunks}] [Batch {batch_no}/{total_batches}]")
        sys.stdout.flush()

    # move to new line after chunk finishes
    print()
    # Checkpoint once per chunk: an interruption loses at most the chunk in progress.
    save_progress(batch_articles, batch_refs, all_summaries, SAVE_PATH)
    print(f"💾 Saved chunk {chunk_no}/{total_chunks} ({end}/{len(articles)} done)")

    chunk_no += 1
    torch.cuda.empty_cache(); gc.collect()

print(f"\n🎉 Test summarization complete! Saved to: {SAVE_PATH}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Mounted at /content/drive
✅ Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

📦 Extracting dataset from Drive ZIP...
✅ Extraction complete!
✅ Found source file: /content/data_multinews/multi_news/test.src.cleaned
✅ Found target file: /content/data_multinews/multi_news/test.tgt
📚 Loaded 500 test samples.

🚀 Chunk 1/10 → Samples 0–50
[Chunk 1/10] [Batch 50/50]
💾 Saved chunk 1/10 (50/500 done)

🚀 Chunk 2/10 → Samples 50–100
[Chunk 2/10] [Batch 50/50]
💾 Saved chunk 2/10 (100/500 done)

🚀 Chunk 3/10 → Samples 100–150
[Chunk 3/10] [Batch 50/50]
💾 Saved chunk 3/10 (150/500 done)

🚀 Chunk 4/10 → Samples 150–200
[Chunk 4/10] [Batch 50/50]
💾 Saved chunk 4/10 (200/500 done)

🚀 Chunk 5/10 → Samples 200–250
[Chunk 5/10] [Batch 50/50]
💾 Saved chunk 5/10 (250/500 done)

🚀 Chunk 6/10 → Samples 250–300
[Chunk 6/10] [Batch 50/50]
💾 Saved chunk 6/10 (300/500 done)

🚀 Chunk 7/10 → Samples 300–350
[Chunk 7/10] [Batch 50/50]
💾 Saved chunk 7/10 (350/500 done)

🚀 Chunk 8/10 → Samples 350–400
[Chunk 8/10] [Batch 50/50]
💾 Saved chunk 8/10 (400/500 done)

🚀 Chunk 9/10 → Samples 400–450
[C

In [None]:
# =============================================================
# LED (Longformer Encoder–Decoder) Summarization - Custom Drive ZIP
# ✅ Loads test.src.cleaned and test.tgt from ZIP in Google Drive
# ✅ Works even if ZIP already extracted
# ✅ Inline progress updates (same-line batch display)
# ✅ Saves summaries to Google Drive with resume support
# =============================================================

!pip install transformers datasets rouge-score accelerate --quiet

import torch, gc, os, pandas as pd, sys, zipfile
from transformers import AutoTokenizer, LEDForConditionalGeneration
import warnings, transformers

# =============================================================
# Mount Google Drive
# =============================================================
# ZIP_PATH and SAVE_PATH below both live under /content/drive, so the
# mount has to happen before anything in this cell touches them.
from google.colab import drive
drive.mount('/content/drive')

# =============================================================
# Config
# =============================================================
MODEL_NAME = "allenai/led-large-16384"
BATCH_SIZE = 1                     # Documents handed to summarize_batch() per call
CHUNK_SIZE = 50                    # Documents summarized between two CSV checkpoints
LIMIT = 1000                        # Adjust for speed / testing
SPLIT = "test"                     # 👈 Focus only on TEST split
ZIP_PATH = "/content/drive/MyDrive/multi_news.zip"
EXTRACT_DIR = "/content/data_multinews"
SAVE_PATH = f"/content/drive/MyDrive/summaries_led_multinews_{SPLIT}.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", device)

# =============================================================
# Load model + tokenizer
# =============================================================
torch.cuda.empty_cache(); gc.collect()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

try:
    # Half precision when a GPU is available, full precision otherwise.
    model = LEDForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True
    ).to(device)
except RuntimeError as load_error:
    # Out-of-memory is the failure this fallback was written for, but ANY
    # RuntimeError lands here, so show the real cause next to the banner
    # instead of letting an unrelated error pass as an OOM.
    print("⚠️ GPU OOM → switching to CPU")
    print(f"   ↳ {type(load_error).__name__}: {load_error}")
    model = LEDForConditionalGeneration.from_pretrained(MODEL_NAME).to("cpu")
    device = torch.device("cpu")

model.eval()

# =============================================================
# Load dataset from ZIP (test.src.cleaned + test.tgt)
# =============================================================
# Extraction is skipped whenever EXTRACT_DIR already exists.
if not os.path.exists(EXTRACT_DIR):
    print("📦 Extracting dataset from Drive ZIP...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("✅ Extraction complete!")
else:
    print("♻️ Using already extracted data folder.")

# Find test.src.cleaned and test.tgt files
# If several files match a pattern, the one os.walk visits last wins.
src_file, tgt_file = None, None
for root, dirs, files in os.walk(EXTRACT_DIR):
    for f in files:
        if f.startswith(SPLIT) and "src" in f and f.endswith(".cleaned"):
            src_file = os.path.join(root, f)
        elif f.startswith(SPLIT) and "tgt" in f:
            tgt_file = os.path.join(root, f)

if not src_file or not tgt_file:
    raise ValueError(f"❌ Could not find {SPLIT}.src.cleaned or {SPLIT}.tgt in extracted ZIP!")

print(f"✅ Found source file: {src_file}")
print(f"✅ Found target file: {tgt_file}")

# Load lines from files: line k of the source is paired with line k of the target
with open(src_file, 'r', encoding='utf-8') as f:
    articles = [line.strip() for line in f]

with open(tgt_file, 'r', encoding='utf-8') as f:
    references = [line.strip() for line in f]

# Pairing is purely positional, so differing line counts mean the two files
# may not describe the same examples. Say so before clipping.
if len(articles) != len(references):
    print(
        f"⚠️ Line count mismatch: {len(articles)} source lines vs "
        f"{len(references)} target lines — clipping to the shorter file."
    )

# Clip to same length + limit
min_len = min(len(articles), len(references))
articles = articles[:min_len][:LIMIT]
references = references[:min_len][:LIMIT]

print(f"📚 Loaded {len(articles)} test samples.")

# =============================================================
# Summarization helper
# =============================================================
def summarize_batch(texts):
    """Summarize a list of documents with the notebook-level LED model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: List of document strings. Each one is truncated to at most
            16384 tokens and the batch is padded to its longest member.

    Returns:
        The list of strings produced by ``tokenizer.batch_decode`` on the
        generated ids, with special tokens removed.
    """
    inputs = tokenizer(
        texts,
        max_length=16384,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # LED uses windowed local attention unless told otherwise. The Hugging Face
    # LED documentation advises giving the first token (<s>) global attention
    # for summarization, so build that mask explicitly.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            global_attention_mask=global_attention_mask,
            max_length=512,
            min_length=80,
            num_beams=2,
            early_stopping=True
        )

    outputs = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    # Drop the tensor references before asking the allocator to release memory.
    del inputs, global_attention_mask, summary_ids
    torch.cuda.empty_cache(); gc.collect()
    return outputs

# =============================================================
# Save progress
# =============================================================
def save_progress(articles, refs, summaries, save_path):
    """Append one chunk of results to the checkpoint CSV at ``save_path``.

    Args:
        articles: Source documents of the chunk (written to ``document``).
        refs: Reference summaries of the chunk (written to ``reference``).
        summaries: Generated summaries of the chunk (written to ``summary``).
        save_path: CSV file to append to. The header row is written only
            when the file does not exist yet.
    """
    write_header = not os.path.exists(save_path)
    chunk_columns = {
        "document": articles,
        "reference": refs,
        "summary": summaries,
    }
    pd.DataFrame(chunk_columns).to_csv(
        save_path,
        mode='a',
        header=write_header,
        index=False,
    )

# =============================================================
# Resume support
# =============================================================
# The number of rows already in the checkpoint CSV is used as the index of
# the first article still to be summarized. This assumes the CSV was written
# by this same loop over the same article order.
start_idx = 0
if os.path.exists(SAVE_PATH):
    df_prev = pd.read_csv(SAVE_PATH)
    start_idx = len(df_prev)
    print(f"🔁 Resuming from index {start_idx}")

# =============================================================
# Clean up noisy LED padding warnings
# =============================================================
warnings.filterwarnings("ignore", message="Input ids are automatically padded")
transformers.logging.set_verbosity_error()

# =============================================================
# Main summarization loop (dynamic same-line batch updates)
# =============================================================
# Ceiling division over the articles that are still pending.
total_chunks = (len(articles) - start_idx + CHUNK_SIZE - 1) // CHUNK_SIZE
chunk_no = 1

for i in range(start_idx, len(articles), CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, len(articles))
    print(f"\n🚀 Chunk {chunk_no}/{total_chunks} → Samples {i}–{end}")

    batch_articles = articles[i:end]
    batch_refs = references[i:end]
    all_summaries = []

    total_batches = (len(batch_articles) + BATCH_SIZE - 1) // BATCH_SIZE

    for j in range(0, len(batch_articles), BATCH_SIZE):
        sub_batch = batch_articles[j:j + BATCH_SIZE]
        batch_no = (j // BATCH_SIZE) + 1

        hit_oom = False
        try:
            batch_summaries = summarize_batch(sub_batch)
        except torch.cuda.OutOfMemoryError:
            # Only record the failure here. While the handler is running, the
            # exception still references the failed call's stack frames (and
            # their tensors), so memory cannot be released until we leave it.
            hit_oom = True

        if hit_oom:
            # Retry exactly once, now that the failed call's tensors can be freed.
            torch.cuda.empty_cache(); gc.collect()
            batch_summaries = summarize_batch(sub_batch)

        all_summaries.extend(batch_summaries)

        # ✅ overwrite the same line (no newlines for each batch)
        sys.stdout.write(f"\r[Chunk {chunk_no}/{total_chunks}] [Batch {batch_no}/{total_batches}]")
        sys.stdout.flush()

    # move to new line after chunk finishes
    print()
    # Checkpoint once per chunk: an interruption loses at most the chunk in progress.
    save_progress(batch_articles, batch_refs, all_summaries, SAVE_PATH)
    print(f"💾 Saved chunk {chunk_no}/{total_chunks} ({end}/{len(articles)} done)")

    chunk_no += 1
    torch.cuda.empty_cache(); gc.collect()

print(f"\n🎉 Test summarization complete! Saved to: {SAVE_PATH}")


Mounted at /content/drive
✅ Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

📦 Extracting dataset from Drive ZIP...
✅ Extraction complete!
✅ Found source file: /content/data_multinews/multi_news/test.src.cleaned
✅ Found target file: /content/data_multinews/multi_news/test.tgt
📚 Loaded 1000 test samples.
🔁 Resuming from index 600

🚀 Chunk 1/8 → Samples 600–650
[Chunk 1/8] [Batch 50/50]
💾 Saved chunk 1/8 (650/1000 done)

🚀 Chunk 2/8 → Samples 650–700
[Chunk 2/8] [Batch 50/50]
💾 Saved chunk 2/8 (700/1000 done)

🚀 Chunk 3/8 → Samples 700–750
[Chunk 3/8] [Batch 50/50]
💾 Saved chunk 3/8 (750/1000 done)

🚀 Chunk 4/8 → Samples 750–800
[Chunk 4/8] [Batch 50/50]
💾 Saved chunk 4/8 (800/1000 done)

🚀 Chunk 5/8 → Samples 800–850
[Chunk 5/8] [Batch 50/50]
💾 Saved chunk 5/8 (850/1000 done)

🚀 Chunk 6/8 → Samples 850–900
[Chunk 6/8] [Batch 50/50]
💾 Saved chunk 6/8 (900/1000 done)

🚀 Chunk 7/8 → Samples 900–950
[Chunk 7/8] [Batch 50/50]
💾 Saved chunk 7/8 (950/1000 done)

🚀 Chunk 8/8 → Samples 950–1000
[Chunk 8/8] [Batch 50/50]
💾 Saved chunk 8/8 (1000/1000 done)

🎉 Test summariz

In [None]:
# =============================================================
# ROUGE Evaluation (Fixed for 'evaluate' library)
# =============================================================

!pip install evaluate --quiet

import evaluate
import pandas as pd
import numpy as np

# Load your saved CSV (replace with your actual path if needed)
# NOTE: SAVE_PATH is not defined in this cell. It is whatever the most recently
# executed cell assigned, so check it points at the split you mean to score.
df = pd.read_csv(SAVE_PATH)
# Score only the rows that have both a reference and a generated summary.
df = df.dropna(subset=["reference", "summary"])

# Initialize ROUGE metric
rouge = evaluate.load("rouge")

# Compute scores
results = rouge.compute(
    predictions=df["summary"].tolist(),
    references=df["reference"].tolist(),
    use_stemmer=True
)

# Extract F1 values (evaluate returns floats directly, not objects with .mid)
r1 = results["rouge1"]
r2 = results["rouge2"]
rL = results["rougeL"]

# Mean of the three ROUGE F1 scores. These are F-measures, not precisions,
# so the summary line is labelled accordingly.
avg_rouge_f1 = np.mean([r1, r2, rL])
avg_precision = avg_rouge_f1  # legacy name, kept in case other cells read it

# Show neatly
def pct(x):
    """Convert a 0-1 score to a percentage rounded to two decimals."""
    return round(x * 100, 2)

print("\n================ ROUGE RESULTS ================")
print(f"ROUGE-1: {pct(r1)}")
print(f"ROUGE-2: {pct(r2)}")
print(f"ROUGE-L: {pct(rL)}")
print(f"Average ROUGE F1: {pct(avg_rouge_f1)}")
print("===============================================")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

Downloading builder script: 0.00B [00:00, ?B/s]


ROUGE-1: 17.14
ROUGE-2: 4.85
ROUGE-L: 9.62
Average Precision: 10.54


In [None]:
# =============================================================
# 🔍 ROUGE Evaluation for Validation Split (Updated for evaluate)
# =============================================================

!pip install evaluate --quiet

import evaluate
import pandas as pd
import numpy as np
import os

# =============================================================
# Paths (adjust for your validation run)
# =============================================================
# NOTE: this rebinds the notebook-wide SPLIT / SAVE_PATH names, so any cell
# executed afterwards that reads them will see these values.
SPLIT = "val"  # 👈 change to "test" if needed
SAVE_PATH = f"/content/drive/MyDrive/summaries_led_multinews_{SPLIT}.csv"
EXTRACT_DIR = "/content/data_multinews"

# =============================================================
# Load generated summaries (your model outputs)
# =============================================================
if not os.path.exists(SAVE_PATH):
    raise FileNotFoundError(f"❌ No saved summaries found at {SAVE_PATH}")

# read_csv numbers rows 0..n-1 and dropna keeps those labels, so the index of
# every surviving row is still its original row number. The pairing step
# below depends on that.
df_gen = pd.read_csv(SAVE_PATH)
df_gen = df_gen.dropna(subset=["summary"])
print(f"📄 Loaded {len(df_gen)} generated {SPLIT} summaries.")

# =============================================================
# Load ground-truth validation references (e.g., val.tgt)
# =============================================================
# If several files match, the one os.walk visits last wins.
val_tgt = None
for root, dirs, files in os.walk(EXTRACT_DIR):
    for f in files:
        if f.startswith(SPLIT) and "tgt" in f:
            val_tgt = os.path.join(root, f)

if not val_tgt:
    raise ValueError(f"❌ Could not find {SPLIT}.tgt file in dataset folder!")

with open(val_tgt, 'r', encoding='utf-8') as f:
    references = [line.strip() for line in f]

# Pair each surviving summary with the reference line at its ORIGINAL row
# number. Slicing both lists to a common length after dropna would shift every
# summary that follows a dropped row onto the wrong reference.
paired = [
    (summary, references[row])
    for row, summary in zip(df_gen.index, df_gen["summary"])
    if row < len(references)
]
preds = [summary for summary, _ in paired]
refs = [reference for _, reference in paired]
min_len = len(preds)  # number of (prediction, reference) pairs actually scored

# =============================================================
# Compute ROUGE
# =============================================================
rouge = evaluate.load("rouge")

results = rouge.compute(
    predictions=preds,
    references=refs,
    use_stemmer=True
)

# Extract ROUGE scores (already F1-based)
r1 = results["rouge1"]
r2 = results["rouge2"]
rL = results["rougeL"]

# Mean of the three ROUGE F1 scores. These are F-measures, not precisions,
# so the summary line is labelled accordingly.
avg_rouge_f1 = np.mean([r1, r2, rL])
avg_precision = avg_rouge_f1  # legacy name, kept in case other cells read it

# =============================================================
# Display
# =============================================================
def pct(x):
    """Convert a 0-1 score to a percentage rounded to two decimals."""
    return round(x * 100, 2)

# The heading follows SPLIT so it stays truthful if SPLIT is switched to "test".
print(f"\n================ {SPLIT.upper()} ROUGE RESULTS ================")
print(f"ROUGE-1: {pct(r1)}")
print(f"ROUGE-2: {pct(r2)}")
print(f"ROUGE-L: {pct(rL)}")
print(f"Average ROUGE F1: {pct(avg_rouge_f1)}")
print("===================================================")


📄 Loaded 1000 generated val summaries.

ROUGE-1: 16.77
ROUGE-2: 4.56
ROUGE-L: 9.5
Average Precision: 10.27


In [None]:
# =============================================================
# LED (Longformer Encoder–Decoder) Summarization - Custom Drive ZIP
# ✅ Loads test.src.cleaned and test.tgt from ZIP in Google Drive
# ✅ Works even if ZIP already extracted
# ✅ Inline progress updates (same-line batch display)
# ✅ Saves summaries to Google Drive with resume support
# =============================================================

!pip install transformers datasets rouge-score accelerate --quiet

import torch, gc, os, pandas as pd, sys, zipfile
from transformers import AutoTokenizer, LEDForConditionalGeneration
import warnings, transformers

# =============================================================
# Mount Google Drive
# =============================================================
from google.colab import drive
drive.mount('/content/drive')

# =============================================================
# Config
# =============================================================
# Model and batching
MODEL_NAME = "allenai/led-large-16384"
BATCH_SIZE = 1       # documents per generate() call
CHUNK_SIZE = 50      # documents summarized between two CSV saves
LIMIT = 1000         # Adjust for speed / testing

# Data locations
SPLIT = "test"       # 👈 Focus only on TEST split
ZIP_PATH = "/content/drive/MyDrive/multi_news.zip"
EXTRACT_DIR = "/content/data_multinews"
SAVE_PATH = f"/content/drive/MyDrive/summaries_led_multinews_{SPLIT}.csv"

# Prefer the GPU when one is visible to torch, otherwise run on CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("✅ Using device:", device)

# =============================================================
# Load model + tokenizer
# =============================================================
# Release cached GPU memory and collect garbage before loading the weights
torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision when CUDA is available, full precision otherwise
if torch.cuda.is_available():
    load_dtype = torch.float16
else:
    load_dtype = torch.float32

try:
    model = LEDForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=load_dtype, low_cpu_mem_usage=True
    ).to(device)
except RuntimeError:
    # Any RuntimeError raised while loading/moving the model lands here and
    # is handled as an out-of-memory condition: reload on CPU with defaults.
    print("⚠️ GPU OOM → switching to CPU")
    model = LEDForConditionalGeneration.from_pretrained(MODEL_NAME).to("cpu")
    device = torch.device("cpu")

# Inference only: put the module in evaluation mode
model.eval()

# =============================================================
# Load dataset from ZIP (test.src.cleaned + test.tgt)
# =============================================================
# Extraction is skipped whenever the target folder already exists
if os.path.exists(EXTRACT_DIR):
    print("♻️ Using already extracted data folder.")
else:
    print("📦 Extracting dataset from Drive ZIP...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("✅ Extraction complete!")

# Find test.src.cleaned and test.tgt files.
# Only names starting with SPLIT are considered; if several files match,
# the one visited last by os.walk is kept.
src_file = tgt_file = None
for root, dirs, files in os.walk(EXTRACT_DIR):
    for f in files:
        if not f.startswith(SPLIT):
            continue
        if "src" in f and f.endswith(".cleaned"):
            src_file = os.path.join(root, f)
        elif "tgt" in f:
            tgt_file = os.path.join(root, f)

if not (src_file and tgt_file):
    raise ValueError(f"❌ Could not find {SPLIT}.src.cleaned or {SPLIT}.tgt in extracted ZIP!")

print(f"✅ Found source file: {src_file}")
print(f"✅ Found target file: {tgt_file}")

# Load lines from files: one stripped line per list entry
with open(src_file, 'r', encoding='utf-8') as f:
    articles = [line.strip() for line in f]

with open(tgt_file, 'r', encoding='utf-8') as f:
    references = [line.strip() for line in f]

# Clip both lists to the shorter file, then to LIMIT
min_len = min(len(articles), len(references))
articles = articles[:min_len][:LIMIT]
references = references[:min_len][:LIMIT]

print(f"📚 Loaded {len(articles)} test samples.")

# =============================================================
# Summarization helper
# =============================================================
def summarize_batch(texts):
    """Generate summaries for a batch of documents with the loaded LED model.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        texts: list of source documents (strings). Each one is tokenized with
            truncation at 16384 tokens and padded to the longest in the batch.

    Returns:
        The decoded generated sequences (special tokens stripped), as
        returned by `tokenizer.batch_decode`.
    """
    inputs = tokenizer(
        texts,
        max_length=16384,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # LED-specific: give the first token of every sequence global attention,
    # as recommended for summarization; all other tokens stay local-only.
    global_attention_mask = torch.zeros_like(inputs["attention_mask"])
    global_attention_mask[:, 0] = 1

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            global_attention_mask=global_attention_mask,
            max_length=512,
            min_length=80,
            num_beams=2,
            early_stopping=True
        )

    outputs = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    # Drop tensor references and release cached memory between batches
    del inputs, global_attention_mask, summary_ids
    torch.cuda.empty_cache(); gc.collect()
    return outputs

# =============================================================
# Save progress
# =============================================================
def save_progress(articles, refs, summaries, save_path):
    """Append one chunk of results to the CSV file at `save_path`.

    Args:
        articles: source documents of the chunk.
        refs: reference summaries, in the same order as `articles`.
        summaries: generated summaries, in the same order as `articles`.
        save_path: CSV file to append to; it is created if missing.

    The column header is written only when the file does not exist yet, so
    repeated calls build a single table with the columns
    `document`, `reference`, `summary`.
    """
    chunk_frame = pd.DataFrame(
        {"document": articles, "reference": refs, "summary": summaries}
    )
    write_header = not os.path.exists(save_path)
    chunk_frame.to_csv(save_path, mode='a', header=write_header, index=False)

# =============================================================
# Resume support
# =============================================================
# The number of rows already stored in the CSV is the index of the next
# article to summarize; without a CSV the run starts from the beginning.
if os.path.exists(SAVE_PATH):
    df_prev = pd.read_csv(SAVE_PATH)
    start_idx = len(df_prev)
    print(f"🔁 Resuming from index {start_idx}")
else:
    start_idx = 0

# =============================================================
# Clean up noisy LED padding warnings
# =============================================================
warnings.filterwarnings(
    "ignore",
    message="Input ids are automatically padded",
)
transformers.logging.set_verbosity_error()

# =============================================================
# Main summarization loop (dynamic same-line batch updates)
# =============================================================
# Ceiling division: number of chunks still to process after resuming
total_chunks = -(-(len(articles) - start_idx) // CHUNK_SIZE)
chunk_no = 1

for i in range(start_idx, len(articles), CHUNK_SIZE):
    end = min(i + CHUNK_SIZE, len(articles))
    print(f"\n🚀 Chunk {chunk_no}/{total_chunks} → Samples {i}–{end}")

    batch_articles, batch_refs = articles[i:end], references[i:end]
    all_summaries = []

    batch_starts = range(0, len(batch_articles), BATCH_SIZE)
    total_batches = len(batch_starts)

    for batch_no, j in enumerate(batch_starts, start=1):
        sub_batch = batch_articles[j:j + BATCH_SIZE]

        try:
            batch_summaries = summarize_batch(sub_batch)
        except torch.cuda.OutOfMemoryError:
            # Free cached memory and retry the same sub-batch once
            torch.cuda.empty_cache()
            gc.collect()
            batch_summaries = summarize_batch(sub_batch)

        all_summaries.extend(batch_summaries)

        # ✅ carriage return rewrites the same console line for every batch
        sys.stdout.write(f"\r[Chunk {chunk_no}/{total_chunks}] [Batch {batch_no}/{total_batches}]")
        sys.stdout.flush()

    # finish the progress line, then persist the whole chunk in one append
    print()
    save_progress(batch_articles, batch_refs, all_summaries, SAVE_PATH)
    print(f"💾 Saved chunk {chunk_no}/{total_chunks} ({end}/{len(articles)} done)")

    chunk_no += 1
    torch.cuda.empty_cache()
    gc.collect()

print(f"\n🎉 Test summarization complete! Saved to: {SAVE_PATH}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

📦 Extracting dataset from Drive ZIP...
✅ Extraction complete!
✅ Found source file: /content/data_multinews/multi_news/test.src.cleaned
✅ Found target file: /content/data_multinews/multi_news/test.tgt
📚 Loaded 1000 test samples.
🔁 Resuming from index 1000

🎉 Test summarization complete! Saved to: /content/drive/MyDrive/summaries_led_multinews_test.csv


In [None]:
# =============================================================
# 🔍 Word-Level Average Precision for Validation & Test Splits
# =============================================================

!pip install nltk --quiet
import pandas as pd
import numpy as np
import os
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import nltk

# -------------------------------------------------------------
# 🔧 Fix NLTK punkt issue
# -------------------------------------------------------------
nltk.download('punkt')
nltk.download('punkt_tab')

# =============================================================
# Function to compute Average Precision (word-overlap)
# =============================================================
def compute_avg_precision(split_name, base_dir, extract_dir):
    """Compute the mean word-overlap precision of generated summaries for a split.

    For every (reference, summary) pair the precision is
    `|unique summary tokens ∩ unique reference tokens| / |unique summary tokens|`,
    computed on lower-cased `word_tokenize` tokens. A summary that yields no
    tokens scores 0.

    Args:
        split_name: split prefix, e.g. "val" or "test". Used both in the CSV
            name (`summaries_led_multinews_{split_name}.csv`) and to locate
            the reference file (name starts with `split_name` and contains
            "tgt").
        base_dir: directory containing the generated-summaries CSV.
        extract_dir: directory tree searched for the reference file.

    Returns:
        The mean precision as a float in [0, 1], or None when the CSV or the
        reference file is missing or no summary/reference pair is available.
    """
    print(f"\n🔹 Evaluating split: {split_name.upper()}")

    # Load generated summaries
    save_path = os.path.join(base_dir, f"summaries_led_multinews_{split_name}.csv")
    if not os.path.exists(save_path):
        print(f"❌ Skipping — file not found: {save_path}")
        return None

    df_gen = pd.read_csv(save_path)
    n_generated = int(df_gen["summary"].notna().sum())
    print(f"📄 Loaded {n_generated} generated summaries for {split_name}.")

    # Load references (.tgt file): stop at the first directory holding a
    # match and take the alphabetically first candidate, so the result does
    # not depend on directory traversal order.
    tgt_path = None
    for root, _, files in os.walk(extract_dir):
        matches = sorted(f for f in files if f.startswith(split_name) and "tgt" in f)
        if matches:
            tgt_path = os.path.join(root, matches[0])
            break
    if not tgt_path:
        print(f"❌ No {split_name}.tgt file found.")
        return None

    with open(tgt_path, 'r', encoding='utf-8') as f:
        refs = [line.strip() for line in f]

    # Pair row i of the CSV with line i of the reference file BEFORE dropping
    # rows without a summary; dropping first would shift every later summary
    # onto the wrong reference.
    min_len = min(len(df_gen), len(refs))
    pairs = [
        (ref, str(summ))
        for ref, summ in zip(refs[:min_len], df_gen["summary"].iloc[:min_len])
        if pd.notna(summ)
    ]
    if not pairs:
        # Avoid np.mean([]) which would return nan with a warning
        print(f"❌ No summary/reference pairs to evaluate for {split_name}.")
        return None

    # Word-level precision
    precision_list = []
    for ref, summ in tqdm(pairs, total=len(pairs), desc=f"{split_name} precision"):
        summ_set = set(word_tokenize(summ.lower()))
        if not summ_set:
            precision_list.append(0)
            continue

        ref_set = set(word_tokenize(ref.lower()))
        precision_list.append(len(ref_set & summ_set) / len(summ_set))

    avg_precision = np.mean(precision_list)
    print(f"✅ {split_name.upper()} Average Precision (word overlap): {round(avg_precision * 100, 2)}%")
    return avg_precision


# =============================================================
# Run for both Validation and Test
# =============================================================
BASE_DIR = "/content/drive/MyDrive"
EXTRACT_DIR = "/content/data_multinews"

# Evaluate the splits in order: validation first, then test
val_precision, test_precision = (
    compute_avg_precision(split_name, BASE_DIR, EXTRACT_DIR)
    for split_name in ("val", "test")
)

# Build the report first, then print it in one go; a split whose evaluation
# returned None is simply left out of the report.
report_lines = ["\n================ FINAL PRECISION RESULTS ================"]
if val_precision is not None:
    report_lines.append(f"VAL Average Precision: {round(val_precision * 100, 2)}%")
if test_precision is not None:
    report_lines.append(f"TEST Average Precision: {round(test_precision * 100, 2)}%")
report_lines.append("=========================================================")
print("\n".join(report_lines))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



🔹 Evaluating split: VAL
📄 Loaded 1000 generated summaries for val.


val precision: 100%|██████████| 1000/1000 [00:14<00:00, 71.30it/s]


✅ VAL Average Precision (word overlap): 40.67%

🔹 Evaluating split: TEST
📄 Loaded 1000 generated summaries for test.


test precision: 100%|██████████| 1000/1000 [00:16<00:00, 60.13it/s] 

✅ TEST Average Precision (word overlap): 39.97%

VAL Average Precision: 40.67%
TEST Average Precision: 39.97%





In [None]:
def summarize_batch(texts):
    """Summarize a batch of documents with LED, using first-token global attention.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Args:
        texts: list of source documents (strings); tokenized with truncation
            at 16384 tokens and padded to the longest item of the batch.

    Returns:
        The decoded generated sequences with special tokens removed, as
        returned by `tokenizer.batch_decode`.
    """
    encoded = tokenizer(
        texts, max_length=16384, truncation=True, padding=True, return_tensors="pt"
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # LED-specific: only position 0 of each sequence attends globally
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    generation_options = dict(
        max_length=512,
        min_length=80,
        num_beams=2,
        early_stopping=True,
    )
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            **generation_options,
        )

    decoded = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    # Drop tensor references and release cached memory between batches
    del input_ids, attention_mask, global_attention_mask, summary_ids
    torch.cuda.empty_cache()
    gc.collect()
    return decoded
