<a href="https://colab.research.google.com/github/dorobat-diana/AI-Laboratory/blob/main/T5Base_BBC_News_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install/upgrade the libraries this notebook uses.
# `%pip` (instead of `!pip`) installs into the environment of the running
# kernel, so the packages are importable by the cells below.
%pip install -U transformers
%pip install -U datasets
%pip install tensorboard
%pip install sentencepiece
%pip install accelerate
%pip install evaluate
%pip install rouge_score



## Imports

In [None]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [None]:
# Shared pretty-printer instance; the inference loop uses it to print summaries.
pp = pprint.PrettyPrinter()

## Prepare Dataset

In [None]:
# Load the 'train' split of the 'gopalkalpande/bbc-news-summary' dataset.
# It is divided into training and validation subsets in the next cell.
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')

In [None]:
# Hold out 20% of the rows for validation. The fixed seed makes the shuffled
# split (and therefore every number computed from it) reproducible across reruns.
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

In [None]:
# `train_test_split` names the held-out part 'test'; this notebook uses it as
# the validation set.
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

In [None]:
# Show the column names and row count of each split.
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


## Dataset Analysis

In [None]:
def find_longest_length(dataset):
    """
    Report word-count statistics for an iterable of strings.

    A "word" is any whitespace-separated token. The return value is the tuple
    (longest, over_4000, over_2000, over_1000, over_500): the largest word
    count seen, followed by how many texts exceed each threshold. The
    thresholds are cumulative, so a text with more than 4000 words is also
    counted in the 2000, 1000 and 500 buckets. An empty iterable yields
    (0, 0, 0, 0, 0).
    """
    word_counts = [len(text.split()) for text in dataset]

    longest = max(word_counts, default=0)
    over_4k = sum(1 for count in word_counts if count > 4000)
    over_2k = sum(1 for count in word_counts if count > 2000)
    over_1k = sum(1 for count in word_counts if count > 1000)
    over_500 = sum(1 for count in word_counts if count > 500)

    return longest, over_4k, over_2k, over_1k, over_500

# Word-count statistics for the training articles.
longest_article_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(dataset_train['Articles'])
print(f"Longest article length: {longest_article_length} words")
print(f"Articles larger than 4000 words: {counter_4k}")
print(f"Articles larger than 2000 words: {counter_2k}")
print(f"Articles larger than 1000 words: {counter_1k}")
print(f"Articles larger than 500 words: {counter_500}")
# Same statistics for the reference summaries; the counter_* names are reused,
# so after this cell they hold the summary counts.
longest_summary_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(dataset_train['Summaries'])
print(f"Longest summary length: {longest_summary_length} words")
print(f"Summaries larger than 4000 words: {counter_4k}")
print(f"Summaries larger than 2000 words: {counter_2k}")
print(f"Summaries larger than 1000 words: {counter_1k}")
print(f"Summaries larger than 500 words: {counter_500}")

Longest article length: 4377 words
Artciles larger than 4000 words: 1
Artciles larger than 2000 words: 6
Artciles larger than 1000 words: 17
Artciles larger than 500 words: 352
Longest summary length: 2073 words
Summaries larger than 4000 words: 0
Summaries larger than 2000 words: 1
Summaries larger than 1000 words: 6
Summaries larger than 500 words: 14


In [None]:
def find_avg_sentence_length(dataset):
    """
    Return the mean number of whitespace-separated words per text.

    Despite the name, no sentence splitting happens: every element of
    `dataset` (a whole article or summary) is counted as one unit.

    Args:
        dataset: iterable of strings.

    Returns:
        The average word count as a float, or 0.0 when `dataset` is empty
        (instead of failing with ZeroDivisionError).
    """
    word_counts = [len(text.split()) for text in dataset]
    if not word_counts:
        return 0.0
    return sum(word_counts) / len(word_counts)

# Mean word count per training article and per reference summary.
avg_article_length = find_avg_sentence_length(dataset_train['Articles'])
print(f"Average article length: {avg_article_length} words")
avg_summary_length = find_avg_sentence_length(dataset_train['Summaries'])
print(f"Average summary length: {avg_summary_length} words")

Average article length: 378.96795952782463 words
Averrage summary length: 165.33670601461495 words


## Configurations

In [None]:
# Central settings for the run; the cells below read these constants.
MODEL = 't5-base'  # Checkpoint name used to load both the tokenizer and the model.
BATCH_SIZE = 4  # Per-device batch size for training and for evaluation.
NUM_PROCS = 4  # Number of processes used by `Dataset.map` during tokenization.
EPOCHS = 10  # Number of training epochs.
OUT_DIR = 'results_t5base'  # Directory for checkpoints, TensorBoard logs and the saved tokenizer.
# The same limit is applied to the tokenized articles and the tokenized summaries.
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

## Tokenization

In [None]:
# Tokenizer matching the MODEL checkpoint; used for preprocessing and for
# decoding predictions in the metric function.
tokenizer = T5Tokenizer.from_pretrained(MODEL)

In [None]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    """
    Tokenize a batch of articles and summaries for sequence-to-sequence training.

    Args:
        examples: batch mapping with 'Articles' and 'Summaries' entries, each a
            list of strings (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles, with an
        extra "labels" entry holding the token ids of the summaries. Inputs and
        labels are both truncated and padded to MAX_LENGTH tokens.
    """
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Tokenize the targets with `text_target=`; it replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    targets = list(examples['Summaries'])
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # NOTE(review): padded label positions keep the pad token id rather than
    # -100, so they presumably still count towards the training loss - confirm
    # this is intended before changing it (the metric code decodes these ids).
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits. `batched=True` hands `preprocess_function` a batch of
# rows per call, and `num_proc` spreads the work over NUM_PROCS processes.
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

Map (num_proc=4):   0%|          | 0/1779 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/445 [00:00<?, ? examples/s]



## Model

In [None]:
# Pick the GPU when one is available, then load the pretrained weights onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained(MODEL)
model.to(device)

# Count all parameters, and separately those that receive gradients.
total_params = sum(param.numel() for param in model.parameters())
total_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

222,903,552 total parameters.
222,903,552 training parameters.


## ROUGE Metric

In [None]:
# ROUGE scorer from the `evaluate` library; `compute_metrics` calls it.
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """
    Compute ROUGE-1/2/L and the mean prediction length for one evaluation run.

    Args:
        eval_pred: object exposing `.predictions` and `.label_ids`.
            `.predictions` is expected to be the tuple produced by
            `preprocess_logits_for_metrics`, whose first element holds the
            predicted token ids.

    Returns:
        Dict with 'rouge1', 'rouge2', 'rougeL' and 'gen_len', each rounded to
        four decimals. 'gen_len' is the average number of non-pad tokens per
        prediction.
    """
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    # -100 is a fill/ignore value, not a real token id, and cannot be decoded.
    # Map it to the pad id in both arrays (previously only labels were cleaned).
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce the model output to predicted ids before it is kept for metrics.

    Only the first element of `logits` is used; its last axis is collapsed to
    the index of the largest value. Keeping these ids instead of the full
    score tensor is a workaround for the memory the Trainer would otherwise
    accumulate during evaluation.

    Returns:
        Tuple `(pred_ids, labels)`; `labels` is passed through unchanged.
    """
    token_scores = logits[0]
    return token_scores.argmax(dim=-1), labels

## Training

In [None]:
# Mount Google Drive at /content/drive (requires the `google.colab` package).
# NOTE(review): the saved output of this cell is "ValueError: mount failed" and
# no other visible cell reads from or writes to /content/drive - confirm the
# mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
# Training configuration: evaluate every 200 steps, write a checkpoint once per
# epoch (keeping only the two most recent) and log to TensorBoard in OUT_DIR.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    # `eval_strategy` is the current name of this option; the previous
    # `evaluation_strategy` spelling is deprecated, and this notebook installs
    # the latest Transformers release.
    eval_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=4
)

# `preprocess_logits_for_metrics` reduces the logits to token ids before they
# are accumulated, and `compute_metrics` turns those ids into ROUGE scores.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

history = trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.6924,0.380391,0.8997,0.8269,0.8817,232.2697
400,0.3787,0.33495,0.9081,0.8389,0.8909,233.2022
600,0.4164,0.317275,0.9104,0.8425,0.8942,233.2045
800,0.3701,0.309902,0.9127,0.8463,0.8965,233.2045
1000,0.3709,0.305184,0.9142,0.8488,0.8981,233.2045
1200,0.3922,0.301668,0.9152,0.8499,0.8991,233.2045
1400,0.4175,0.29809,0.9162,0.8517,0.8999,233.2045
1600,0.2688,0.297567,0.9162,0.8531,0.9007,233.2045
1800,0.3447,0.297181,0.917,0.8529,0.9011,233.2045
2000,0.2142,0.295671,0.9175,0.8552,0.9023,233.2045




Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.6924,0.380391,0.8997,0.8269,0.8817,232.2697
400,0.3787,0.33495,0.9081,0.8389,0.8909,233.2022
600,0.4164,0.317275,0.9104,0.8425,0.8942,233.2045
800,0.3701,0.309902,0.9127,0.8463,0.8965,233.2045
1000,0.3709,0.305184,0.9142,0.8488,0.8981,233.2045
1200,0.3922,0.301668,0.9152,0.8499,0.8991,233.2045
1400,0.4175,0.29809,0.9162,0.8517,0.8999,233.2045
1600,0.2688,0.297567,0.9162,0.8531,0.9007,233.2045
1800,0.3447,0.297181,0.917,0.8529,0.9011,233.2045
2000,0.2142,0.295671,0.9175,0.8552,0.9023,233.2045




In [None]:
# Save the tokenizer files into OUT_DIR; the inference section reloads the
# tokenizer from this directory.
tokenizer.save_pretrained(OUT_DIR)

('results_t5base/tokenizer_config.json',
 'results_t5base/special_tokens_map.json',
 'results_t5base/spiece.model',
 'results_t5base/added_tokens.json')

In [None]:
# Recursively archive the whole output directory (checkpoints, logs, tokenizer
# files) into a zip archive named after OUT_DIR.
!zip -r {OUT_DIR} {OUT_DIR}

  adding: results_t5base/ (stored 0%)
  adding: results_t5base/added_tokens.json (deflated 83%)
  adding: results_t5base/events.out.tfevents.1741630367.608fdc570701.1211.0 (deflated 70%)
  adding: results_t5base/special_tokens_map.json (deflated 85%)
  adding: results_t5base/spiece.model (deflated 48%)
  adding: results_t5base/checkpoint-4005/ (stored 0%)
  adding: results_t5base/checkpoint-4005/trainer_state.json (deflated 84%)
  adding: results_t5base/checkpoint-4005/scheduler.pt (deflated 55%)
  adding: results_t5base/checkpoint-4005/model.safetensors (deflated 8%)
  adding: results_t5base/checkpoint-4005/optimizer.pt (deflated 8%)
  adding: results_t5base/checkpoint-4005/training_args.bin (deflated 51%)
  adding: results_t5base/checkpoint-4005/generation_config.json (deflated 29%)
  adding: results_t5base/checkpoint-4005/rng_state.pth (deflated 25%)
  adding: results_t5base/checkpoint-4005/config.json (deflated 62%)
  adding: results_t5base/checkpoint-4450/ (stored 0%)
  adding: re

## Inference

In [None]:
# Download the inference data archive and save it as inference_data.zip.
!wget "https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1" -O inference_data.zip

In [None]:
# Extract the downloaded archive; the inference loop below reads
# `inference_data/*.txt`, so the archive presumably unpacks into that folder.
!unzip inference_data.zip

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

import glob

In [None]:
# Load the most recent checkpoint written during training. The step number in
# the directory name depends on the training configuration, so it is looked up
# rather than hard-coded.
checkpoint_dirs = sorted(
    glob.glob(f"{OUT_DIR}/checkpoint-*"),
    key=lambda path: int(path.rsplit('-', 1)[-1])
)
if not checkpoint_dirs:
    raise FileNotFoundError(f"No 'checkpoint-*' directory found in {OUT_DIR!r}")
model_path = checkpoint_dirs[-1]  # the path where you saved your model
model = T5ForConditionalGeneration.from_pretrained(model_path)
# The tokenizer was saved to OUT_DIR itself, not into the checkpoint folder.
tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)

In [None]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5, summary_max_length=50):
    """
    Generate a summary of `text` with a fine-tuned seq2seq model.

    Args:
        text: the document to summarize.
        model: model exposing `generate(...)` and a `device` attribute.
        tokenizer: tokenizer exposing `encode(...)` and `decode(...)`.
        max_length: maximum number of input tokens; longer inputs are truncated.
        num_beams: beam width used for generation.
        summary_max_length: maximum length of the generated summary in tokens.
            Defaults to 50, the value that used to be hard-coded; raise it if
            summaries are cut off mid-sentence.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Preprocess the text
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )
    # Keep the inputs on the same device as the model so generation also works
    # when the model has been moved to a GPU.
    input_ids = input_ids.to(model.device)

    # Generate the summary
    summary_ids = model.generate(
        input_ids,
        max_length=summary_max_length,
        num_beams=num_beams,
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Summarize every text file in the inference folder and print the result.
for file_path in glob.glob('inference_data/*.txt'):
    # The context manager closes each file as soon as it has been read; the
    # explicit encoding avoids depending on the platform default.
    with open(file_path, encoding='utf-8') as text_file:
        text = text_file.read()
    summary = summarize_text(text, model, tokenizer)
    pp.pprint(summary)
    print('-'*75)

('Sam Altman — the leader of one of the world’s most influential AI companies, '
 'OpenAI, and perhaps the most visible figure in the space — was fired Friday '
 'night by the startup’s board in a surprise move.')
---------------------------------------------------------------------------
('Microsoft has hired Sam Altman to power up its innovation in artificial '
 'intelligence after the co-founder of OpenAI was ousted as CEO in a chaotic '
 'boardroom coup on Friday. Brockmann quit as OpenAI president after Altman '
 'was fired')
---------------------------------------------------------------------------
