In [1]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.2.5-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.2.5-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.3/802.3 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<4.0,>=1.13.0->lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12

In [2]:
!pip install transformers



In [3]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import lightning as pl

In [4]:
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

In [5]:
from torch.utils.data import Dataset, DataLoader
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

from lightning.pytorch.loggers import TensorBoardLogger
from transformers import AdamW, T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer
from tqdm.auto import tqdm

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
pl.seed_everything(1234)

INFO: Seed set to 1234
INFO:lightning.fabric.utilities.seed:Seed set to 1234


1234

In [8]:
df = pd.read_csv("Final_training_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Content,Summarize
0,0,Microsoft has torn up the rules of big video g...,Call of Duty: Black Ops 6 will be available t...
1,1,"Game Pass, much like Sony's rival PlayStation ...","Game Pass, much like Sony's rival PlayStation..."
2,2,One element which is unclear in Microsoft's an...,"When Microsoft purchased Activision, it had t..."
3,3,A former takeaway worker found with Bitcoin wo...,"Jian Wen, 42, was involved in converting the ..."
4,4,Listen to the best of BBC Radio London on Soun...,BBC Radio London is a weekly feature on BBC R...


In [9]:
df = df[['Content', 'Summarize']]
df.head()

Unnamed: 0,Content,Summarize
0,Microsoft has torn up the rules of big video g...,Call of Duty: Black Ops 6 will be available t...
1,"Game Pass, much like Sony's rival PlayStation ...","Game Pass, much like Sony's rival PlayStation..."
2,One element which is unclear in Microsoft's an...,"When Microsoft purchased Activision, it had t..."
3,A former takeaway worker found with Bitcoin wo...,"Jian Wen, 42, was involved in converting the ..."
4,Listen to the best of BBC Radio London on Soun...,BBC Radio London is a weekly feature on BBC R...


In [10]:
df.shape

(2331, 2)

In [11]:
train_df, test_df = train_test_split(df, test_size=0.1)
print(f"Shape of the Train Set: {train_df.shape}\nShape of the Test Set: {test_df.shape}")

Shape of the Train Set: (2097, 2)
Shape of the Test Set: (234, 2)


In [12]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, text_max_token_len=512, summary_max_token_len=128):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        text = data_row["Content"]

        # Encode the text
        text_encoding = self.tokenizer(
            text,
            max_length=self.text_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Encode the summary
        summary_encoding = self.tokenizer(
            data_row["Summarize"],
            max_length=self.summary_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Modify the labels so that the model knows which tokens to predict
        labels = summary_encoding['input_ids']
        labels[labels == 0] = -100

        return {
            'text': text,
            'summary': data_row['Summarize'],
            'text_input_ids': text_encoding['input_ids'].flatten(),
            'text_attention_mask': text_encoding['attention_mask'].flatten(),
            'labels': labels.flatten(),
            'labels_attention_mask': summary_encoding["attention_mask"].flatten()
        }

In [13]:
class NewsDataModule(pl.LightningDataModule):
    def __init__(self,
                 train_df,
                 test_df,
                 tokenizer,
                 batch_size=8,
                 text_max_token_len=512,
                 summary_max_token_len=128):
        super().__init__()

        self.train_df = train_df
        self.test_df = test_df

        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def setup(self, stage=None):
        self.train_dataset = NewsDataset(
            self.train_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len)

        self.test_dataset = NewsDataset(
            self.test_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False
        )

This is a Python class named "NewsDataModule", which extends the PyTorch Lightning "LightningDataModule" class, and it takes as input two pandas DataFrame objects containing news articles data for the train and test sets, a tokenizer object, and maximum token lengths for both the text and the summary.

The class has five methods:

init(self, train_df, test_df, tokenizer, batch_size=8, text_max_token_len=152, summary_max_token_len=128): This method initializes the class object and sets the instance variables. It takes the following arguments:
1. train_df: a pandas DataFrame object containing news articles data for the training set, with columns "text" and "summary".
2. test_df: a pandas DataFrame object containing news articles data for the test set, with columns "text" and "summary".
3. tokenizer: a tokenizer object from the Hugging Face library that is used to tokenize the text and summary data.
4. batch_size: an optional integer that sets the batch size for the dataloaders.
5. text_max_token_len: an optional integer that sets the maximum number of tokens to use for the text data.
6. summary_max_token_len: an optional integer that sets the maximum number of tokens to use for the summary data.


setup(self, stage=None): This method sets up the datasets for the train, validation, and test sets:
1. Creates the train dataset by initializing a NewsDataset object with the train DataFrame, tokenizer, and maximum token lengths.
2. Creates the test dataset by initializing a NewsDataset object with the test DataFrame, tokenizer, and maximum token lengths.

train_dataloader(self): This method returns a DataLoader object for the train dataset:
1. Returns a DataLoader object with the train dataset, the batch size, and shuffles the data.

Others are same as train_dataloader(self)

In [14]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [15]:
text_token_counts = [len(tokenizer.encode(row["Content"])) for _, row in train_df.iterrows()]
summary_token_counts = [len(tokenizer.encode(row["Summarize"])) for _, row in train_df.iterrows()]

In the above code, calculate the number of tokens in each text and summary data in the training set using the provided tokenizer object, and stores the results in separate lists. These token counts can be used to determine appropriate maximum token lengths for the text and summary when setting up the NewsDataModule object.

* Create Two empty lists named text_token_counts and summary_token_counts. It then iterates over each row of the train_df DataFrame using the iterrows() method.

* For each row, it calculates the number of tokens in the "Content" column by calling the encode() method of the tokenizer object and passing in the text as an argument. The number of tokens is then appended to the text_token_counts list.

* Similarly, it calculates the number of tokens in the "Summarize" column by calling the encode() method of the tokenizer object and passing in the summary as an argument. The number of tokens is then appended to the summary_token_counts list.

In [16]:
N_EPOCHS = 5
BATCH_SIZE=8

data_module = NewsDataModule(
    train_df,
    test_df,
    tokenizer,
    batch_size=BATCH_SIZE,
    text_max_token_len = max(text_token_counts),
    summary_max_token_len = max(summary_token_counts),
)

In [17]:
class SummaryModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def shared_step(self, batch, batch_idx, stage):
        input_ids = batch['text_input_ids']
        attention_mask = batch["text_attention_mask"]
        labels = batch["labels"]
        labels_attention_mask = batch["labels_attention_mask"]

        loss, _ = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=labels_attention_mask,
            labels=labels
        )

        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.shared_step(batch, batch_idx, 'train')

    def validation_step(self, batch, batch_idx):
        return self.shared_step(batch, batch_idx, 'val')

    def test_step(self, batch, batch_idx):
        return self.shared_step(batch, batch_idx, 'test')

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

The init method initializes the T5 model by loading the pre-trained weights from the specified MODEL_NAME and setting return_dict=True to enable the module to return a dictionary of model outputs.

The forward method takes input_ids, attention_mask, decoder_attention_mask, and labels as input and performs forward propagation of the T5 model. The output of the method is a tuple of the loss and logits.

The shared_step method is a helper function that takes in a batch, batch index, and a stage (train, val, or test) and computes the loss for that batch. It uses the forward method to get the loss and logs the loss to the PyTorch Lightning logger using self.log(). The method returns the loss.

The training_step, validation_step, and test_step methods call the shared_step method with the appropriate stage and return the loss.

The configure_optimizers method sets up the optimizer used to update the model's parameters during training. In this case, it returns an AdamW optimizer with a learning rate of 0.0001.

In [18]:
model_1 = SummaryModel()

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [19]:
callbacks = ModelCheckpoint(
    dirpath="/kaggle/working/checkpoints",
    filename="base-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode='min'
)

logger = TensorBoardLogger("lightning_logs", name="news_summary")

trainer= Trainer(
    logger=logger,
    callbacks=callbacks,
    max_epochs=N_EPOCHS,
    accelerator = "auto"
)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [20]:
trainer.fit(model_1, data_module)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 0, global step 263: 'val_loss' reached 0.00026 (best 0.00026), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 0, global step 263: 'val_loss' reached 0.00026 (best 0.00026), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 1, global step 526: 'val_loss' reached 0.00010 (best 0.00010), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 1, global step 526: 'val_loss' reached 0.00010 (best 0.00010), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 2, global step 789: 'val_loss' reached 0.00006 (best 0.00006), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 2, global step 789: 'val_loss' reached 0.00006 (best 0.00006), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 3, global step 1052: 'val_loss' reached 0.00004 (best 0.00004), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 3, global step 1052: 'val_loss' reached 0.00004 (best 0.00004), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 4, global step 1315: 'val_loss' reached 0.00003 (best 0.00003), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 4, global step 1315: 'val_loss' reached 0.00003 (best 0.00003), saving model to '/kaggle/working/checkpoints/base-checkpoint.ckpt' as top 1
INFO: `Trainer.fit` stopped: `max_epochs=5` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [21]:
best_model = SummaryModel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)
best_model.freeze()

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_model.to(device)

SummaryModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_featur

In [23]:
def encode_text(text):
    # Encode the text using the tokenizer
    encoding = tokenizer.encode_plus(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    encoding.to(device)
    return encoding["input_ids"], encoding["attention_mask"]

def generate_summary(input_ids, attention_mask):
    # Generate a summary using the best model
    generated_ids = best_model.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )
    return generated_ids

def decode_summary(generated_ids):
    # Decode the generated summary
    summary = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
               for gen_id in generated_ids]
    return "".join(summary)

def summarize(text):
    input_ids, attention_mask = encode_text(text)
    generated_ids = generate_summary(input_ids, attention_mask)
    summary = decode_summary(generated_ids)
    return summary

In [24]:
sample_row = test_df.iloc[3]
text = sample_row["Content"]
model_summary = summarize(text)

In [25]:
text

'A care home is hoping to reawaken memories by cooking up nostalgic recipes for people with dementia.The Chase Care Home, in Huntingdon, Cambridgeshire, is putting together a cookbook of its residents\' favourite family recipes - from trifles to toad-in-the-hole to steamed puddings.In doing so, they hope to raise funds for the Alzheimer\'s Society.The care home\'s head chef, Juliana Martins, said inviting residents to share their memories and recipes and help in the kitchen ensured "these things are not forgotten".'

In [26]:
sample_row["Summarize"]

' Chase Care Home in Huntingdon, Cambridgeshire, is putting together a cookbook of its residents\' favourite family recipes . They hope to raise funds for the Alzheimer\'s Society in doing so . Head chef Juliana Martins: "These things are not forgotten"'

In [27]:
model_summary

'Chase Care Home in Huntingdon, Cambridgeshire, is putting together a cookbook of its residents\' favourite family recipes. They hope to raise funds for the Alzheimer\'s Society in doing so. Head chef Juliana Martins: "These things are not forgotten"'

In [31]:
text = "A Suffolk computer expert is predicting that artificial intelligence (AI) will replace human coders in 10 years' time.Creative Computing Club founder Matthew Applegate said: I'm teaching the last generation of coders. We are looking at being knocked out by AI very soon.Mr Applegate started the club in 2012 with the ethos 50% work 50% play.AI will creep into everything; pop music, writing... plumbers are safe! Technology is always an interesting ride, he said.His prediction comes as a House of Lords committee says that we should embrace the positives of AI rather than just focus on the risks.The Communications and Digital Committee's report looked at large language models (LLMs), which are what power generative AI tools like ChatGPT. Mr Applegate said: They trained AI on a thing called GitHub, which is an online repository of all the best examples of code, and some of the worst. So it was able to determine the best practices very early on.So we've bypassed a lot of those problems very quickly. We've got probably about ten more years of teaching code."

In [32]:
model_summary = summarize(text)
model_summary

"Creative Computing Club founder Matthew Applegate said: I'm teaching the last generation of coders. He started the club in 2012 with the ethos 50% work 50% play. AI will creep into everything; pop music, writing... plumbers are safe!"