# Text Summarization

This notebook shows how a pretrained encoder-decoder model can be fine-tuned for a sequence-to-sequence task such as summarization. 
In the example below, we fine-tune the `facebook/bart-base` model on the [news-qa-summarization](https://huggingface.co/datasets/glnmario/news-qa-summarization) news dataset. 

In [7]:
import os
import random

import evaluate
import numpy as np
import torch
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
    set_seed,
)



In [5]:
# Run configuration: checkpoint name, sequence-length limits, hyperparameters and seed.
model_name = "facebook/bart-base"  # change to "facebook/bart-large-cnn" for better quality
max_input_length = 1024  # max tokens kept per input story when tokenizing (longer stories are truncated)
max_target_length = 128  # max tokens kept per target summary; also passed as the generation length cap
per_device_train_batch_size = 2  # per-device batch size for training
per_device_eval_batch_size = 2  # per-device batch size for evaluation
num_epochs = 2  # number of passes over the training split
learning_rate = 5e-5
seed = 42

# Seed the random number generators through transformers' helper so runs are repeatable.
set_seed(seed)



In [None]:
# Load dataset and inspect schema
# The printed schema shows a single "train" split, so validation/test splits
# have to be carved out of it manually later in the notebook.
raw_dataset = load_dataset("glnmario/news-qa-summarization")
print(raw_dataset)




Downloading readme: 100%|██████████| 715/715 [00:00<00:00, 2.67kB/s]
Downloading data: 100%|██████████| 40.7M/40.7M [00:06<00:00, 6.40MB/s]
Generating train split: 100%|██████████| 10388/10388 [00:00<00:00, 35749.54 examples/s]


DatasetDict({
    train: Dataset({
        features: ['story', 'questions', 'answers', 'summary'],
        num_rows: 10388
    })
})


In [9]:
# Peek at the first training record to see what the raw fields look like.
raw_dataset["train"][0]

{'story': '\'SINDH KALAY\', England (CNN) -- The aroma of freshly baking flatbread wafts through the air as a unit of British soldiers position themselves for a quick patrol around the village of Sindh Kalay. A British soldier on patrol in the mock Afghan village of Sindh Kalay. Market vendors hawk grapes and melons, as a group of village elders sit smoking water pipes and suspicious-looking men lurk beside battered motorcycles. What should the soldiers do? Conduct a weapons search? Approach the village elders first? In the complex political and cultural terrain of Afghanistan, what is the best course of action? Except this is not Afghanistan. It\'s Norfolk, England. Instead of the Hindu Kush mountains, it is the green ladscape and tidy farmhouses of the English countryside that stretch out behind them. Welcome to the British Army\'s state-of-the art training ground. It cost more than $20 million to build and every British soldier serving in Afghanistan will do his or her training here

In [8]:
# `raw_dataset` was loaded in the dataset-loading cell above and only ships a
# "train" split, so carve out roughly 80/10/10 train/validation/test splits.
# Both splits use the notebook-wide `seed` so they are reproducible.

# 1) Hold out 10% of all examples as the test set.
train_test = raw_dataset["train"].train_test_split(test_size=0.1, seed=seed)

# 2) Take 0.1111 of the remaining 90% (about 10% of the total) as validation.
train_val = train_test["train"].train_test_split(test_size=0.1111, seed=seed)

# 3) Rebuild a DatasetDict with 3 splits
dataset = DatasetDict({
    "train":      train_val["train"],
    "validation": train_val["test"],
    "test":       train_test["test"],
})

# Sanity check: the three splits together must account for every raw example.
assert sum(len(split) for split in dataset.values()) == len(raw_dataset["train"])

In [6]:
# Initialize tokenizer and model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Model params (M):", round(model.num_parameters() / 1e6, 2))



Tokenizer vocab size: 50265
Model params (M): 139.42


In [10]:
def preprocess_function(examples):
    """Tokenize a batch of stories and their reference summaries.

    Args:
        examples: Batch dict, as passed by ``Dataset.map(batched=True)``, with
            ``"story"`` and ``"summary"`` entries holding lists of strings.

    Returns:
        The tokenizer output for the stories, truncated to ``max_input_length``
        tokens, with an added ``"labels"`` entry holding the summary token ids
        truncated to ``max_target_length`` tokens.

    Sequences are deliberately left unpadded. Padding every example to the
    maximum length here would make each batch as large as the worst case; the
    ``DataCollatorForSeq2Seq`` used for training pads each batch to its own
    longest sequence instead and fills label padding with the ignore index,
    so no manual pad-to--100 replacement is needed.
    """
    # inputs: articles
    model_inputs = tokenizer(
        examples["story"],
        max_length=max_input_length,
        truncation=True,
    )

    # targets: summaries. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
# Tokenize every split in batches; the original text columns are dropped so
# only the fields returned by preprocess_function remain.
dataset_tokenized = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/8310 [00:00<?, ? examples/s]

Map: 100%|██████████| 8310/8310 [00:32<00:00, 252.40 examples/s]
Map: 100%|██████████| 1039/1039 [00:04<00:00, 238.51 examples/s]
Map: 100%|██████████| 1039/1039 [00:03<00:00, 269.41 examples/s]


In [13]:
# Collator that assembles tokenized examples into batches for the seq2seq model.
# padding="longest" pads each batch to its own longest sequence rather than to a fixed length.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
)

In [14]:
# Load the ROUGE metric from the `evaluate` library; compute_metrics below calls rouge.compute().
rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Strip leading and trailing whitespace from decoded texts.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of new lists; the inputs are left untouched.
    """
    def strip_each(texts):
        return [text.strip() for text in texts]

    return strip_each(preds), strip_each(labels)

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays handed over
            by the trainer. ``predictions`` may also arrive as a tuple whose
            first element holds the generated ids.

    Returns:
        Dict with every score returned by ``rouge.compute`` scaled to a
        percentage and rounded to two decimals, plus ``"gen_len"``, the mean
        number of non-pad tokens per prediction.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index, not a real token id, and decoding it fails.
    # Labels always carry it for padded positions; predictions can carry it as
    # well when the trainer pads generated sequences of different lengths
    # while gathering batches. Map it back to the pad token in both arrays.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(
        predictions, skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True
    )

    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds, decoded_labels
    )

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    # express every ROUGE score as a percentage with two decimals
    result = {k: round(v * 100, 2) for k, v in result.items()}

    # also track average generated length (pad tokens excluded)
    prediction_lens = [
        np.count_nonzero(p != pad_id) for p in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return result


Downloading builder script: 6.14kB [00:00, 3.02MB/s]


In [16]:
# Training configuration. Epochs, batch sizes and learning rate come from the
# configuration cell at the top of the notebook so that editing them there
# actually takes effect, instead of being overridden by literals here.
training_args = Seq2SeqTrainingArguments(
    output_dir="bart-newsqa-sum",
    # Evaluate once per epoch; a step-based `eval_steps` would be ignored
    # under this strategy, so it is not set.
    eval_strategy="epoch",
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    learning_rate=learning_rate,
    warmup_ratio=0.03,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # Generate summaries during evaluation so ROUGE can be computed on text.
    predict_with_generate=True,
    generation_max_length=max_target_length,
    # Accumulate gradients over 8 steps to enlarge the effective batch size.
    gradient_accumulation_steps=8,
    # fp16 mixed precision needs a CUDA device; fall back to full precision otherwise.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [None]:
# Build the trainer from the model, arguments, tokenized splits, collator and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["validation"],
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # emits a FutureWarning and is slated for removal in transformers v5.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


In [None]:

# Write the fine-tuned model and the tokenizer into the same directory so the
# checkpoint can be reloaded from a single path.
final_model_dir = "bart-newsqa-sum-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

In [None]:
# Reload the saved checkpoint into a summarization pipeline
# (`pipeline` is imported in the notebook's import cell).
summarizer = pipeline(
    "summarization",
    model="bart-newsqa-sum-final",
    tokenizer="bart-newsqa-sum-final",
    # Use the first GPU when one is available, otherwise run on CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
)

sample = raw_dataset["train"][0]["story"]
summary = summarizer(
    sample,
    max_length=128,
    min_length=20,
    do_sample=False,
    # Stories can exceed the model's maximum input length; truncate instead of failing.
    truncation=True,
)
print(summary[0]["summary_text"])


# Todo

1. Choose a different news story to summarize.
2. Compare the result with another encoder-decoder model such as T5. Which one performs better? Discuss your findings.