<a href="https://colab.research.google.com/github/deviljerry/Encoder-Decoder-T5-BART-Text-Summarization/blob/main/Encoder%E2%80%93Decoder_(T5_BART)_%E2%80%94_Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ======================================================
# 🧩 Step 1: Install & Import Dependencies
# ======================================================
!pip install transformers datasets evaluate rouge_score gradio sentencepiece -q

import os
import torch
import pandas as pd
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import evaluate
from tqdm import tqdm
import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Device in use: {device}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
✅ Device in use: cuda


In [3]:
# ======================================================
# 📚 Step 2: Load Dataset
# ======================================================

# Load CNN/DailyMail dataset from Hugging Face (same as Kaggle one)
dataset = load_dataset("cnn_dailymail", "3.0.0")

print("✅ Dataset Loaded Successfully")
print(dataset)
print(dataset["train"][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

✅ Dataset Loaded Successfully
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})
{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian in

In [4]:
# ======================================================
# 🧹 Step 3: Preprocessing (Article → Summary)
# ======================================================

tokenizer = T5Tokenizer.from_pretrained("t5-small")

max_input_len = 512
max_target_len = 128

def preprocess_function(batch):
    inputs = ["summarize: " + doc for doc in batch["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True, padding="max_length")
    labels = tokenizer(text_target=batch["highlights"], max_length=max_target_len, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
print("✅ Preprocessing Complete")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

✅ Preprocessing Complete


In [5]:
# ======================================================
# ⚙️ Step 4: Model Setup
# ======================================================

model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# ======================================================
# 🧠 Step 5: Fine-Tuning Configuration (Final for ≥4.50)
# ======================================================

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./t5-summarizer",
    eval_strategy="epoch",               # replaces evaluation_strategy
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=1,                 # small for testing
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=100,
    report_to=[],                       # disable external loggers
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"].select(range(1000)),
    eval_dataset=tokenized_datasets["validation"].select(range(200)),
    processing_class=tokenizer           # ✅ replaces deprecated tokenizer arg
)

print("✅ Trainer initialized successfully with Transformers v4.50+")


✅ Trainer initialized successfully with Transformers v4.50+


In [7]:
# ======================================================
# 🚀 Step 6: Train the Model
# ======================================================

print("🚀 Starting fine-tuning on CNN/DailyMail subset...")

trainer.train()

# ✅ Save fine-tuned model
trainer.save_model("./t5-summarizer/final_model")
tokenizer.save_pretrained("./t5-summarizer/final_model")

print("✅ Fine-tuning complete! Model saved to ./t5-summarizer/final_model")


🚀 Starting fine-tuning on CNN/DailyMail subset...


Epoch,Training Loss,Validation Loss
1,1.1381,0.829533


✅ Fine-tuning complete! Model saved to ./t5-summarizer/final_model


In [8]:
# ======================================================
# 📊 Step 7: Evaluation (ROUGE Metrics)
# ======================================================

rouge = evaluate.load("rouge")

preds, refs = [], []

sample_data = dataset["test"].select(range(50))  # Evaluate on small sample

for row in tqdm(sample_data):
    inputs = tokenizer("summarize: " + row["article"], return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_new_tokens=150, num_beams=4)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    preds.append(pred)
    refs.append(row["highlights"])

results = rouge.compute(predictions=preds, references=refs)

print("\n✅ ROUGE Evaluation Results:")
for k, v in results.items():
    print(f"{k}: {v * 100:.2f}")


Downloading builder script: 0.00B [00:00, ?B/s]

100%|██████████| 50/50 [01:42<00:00,  2.04s/it]


✅ ROUGE Evaluation Results:
rouge1: 11.07
rouge2: 5.49
rougeL: 8.48
rougeLsum: 9.88





In [9]:
# ======================================================
# 🌐 Step 8: Gradio Interface for Real-Time Summarization
# ======================================================

def summarize_text(text):
    inputs = tokenizer("summarize: " + text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    outputs = model.generate(**inputs, max_new_tokens=150, num_beams=4)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

demo = gr.Interface(
    fn=summarize_text,
    inputs=gr.Textbox(lines=10, label="Enter News Article or Paragraph"),
    outputs=gr.Textbox(lines=5, label="Generated Summary"),
    title="📰 T5-based News Summarizer",
    description="Fine-tuned T5-small model for abstractive summarization using CNN/DailyMail dataset.",
)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4c27dfd2ee4430b94e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


