In [292]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset




In [293]:
Data=pd.read_csv('news_summary.csv', encoding='latin-1', engine='python') # if any error occurs then it will not goes to next lines...

In [294]:
Data.head(5)

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [295]:
Data.shape

(4514, 6)

In [296]:
Data.columns

Index(['author', 'date', 'headlines', 'read_more', 'text', 'ctext'], dtype='object')

In [297]:
Data.dtypes

author       object
date         object
headlines    object
read_more    object
text         object
ctext        object
dtype: object

In [298]:
len(Data)

4514

In [299]:
Data=Data[['text', 'ctext']]
Data.columns=['Summary', 'Text']
Data.dropna()
Data.head(5)

Unnamed: 0,Summary,Text
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [300]:
Data.shape

(4514, 2)

In [301]:
Data['Text'][0]

'The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ? one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ? were issued by the Dama

In [302]:
len(Data['Text'][0])

2313

In [303]:
Data['Summary'][23]

'Investigators have released pictures showing how close an Air Canada flight came to crashing onto four planes at a US airport last month. Its pilots mistakenly descended towards a taxiway, where four planes were parked, instead of the runway before aborting the landing. Investigators said the incident came within a few feet of becoming one of the worst aviation disasters.'

In [304]:
len(Data['Summary'][23])

374

In [305]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [306]:
!pip install pytorch_lightning

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [307]:
import pytorch_lightning as pl 

In [308]:
import torch

In [309]:
from torch.utils.data import Dataset, DataLoader

import re

In [310]:
class NewsDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, source_len, target_len):
        self.source_texts=source_texts
        self.target_texts=target_texts
        self.tokenizer=tokenizer
        self.source_len=source_len
        self.target_len=target_len
    def __len__(self):
        return len(self.target_texts)-1
    def __getitem__(self, idx):
        whitespace_handlers=lambda k:re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
        text=' '.join(str(self.source_texts[idx]).split())
        summary=' '.join(str(self.target_texts[idx]).split())
        
        # handles whitespaces for text and padding will applied for max_legth(add's 0's), special characters are handled. returning tensors pytorch
        source=self.tokenizer.batch_encode_plus([whitespace_handlers(text)],
                                                max_length=self.source_len,
                                                padding='max_length',
                                                truncation=True,
                                                return_attention_mask=True,
                                                add_special_tokens=True,
                                                return_tensors='pt')
        
        target=self.tokenizer.batch_encode_plus([whitespace_handlers(summary)],
                                                max_length=self.target_len,
                                                padding='max_length',
                                                truncation=True,
                                                return_attention_mask=True,
                                                add_special_tokens=True,
                                                return_tensors='pt')
        
        labels=target['input_ids']
        labels[labels==0]=-100  # Replace labels==0 to -100
        #[[2,2]] -> after squeezing [2,2]
        return (source['input_ids'].squeeze(),
                source['attention_mask'].squeeze(),
                labels.squeeze(),
                target['attention_mask'].squeeze())
        
        
        
        

In [311]:
class NewsDataLoader(pl.LightningDataModule):
    def __init__(self, file_path, tokenizer, batch_size, val_split_size,
                 column_name, source_len=1024, target_len=128, corpus_size=1000):
        super().__init__()
        self.file_path=file_path
        self.tokenizer=tokenizer
        self.batch_size=batch_size
        self.split_size=val_split_size
        self.column_name=column_name
        self.source_len=source_len
        self.target_len=target_len
        self.nrows=corpus_size
        
    def prepare_data(self):
        data=pd.read_csv(self.file_path, encoding='latin-1', nrows=self.nrows)
        data=data[self.column_name]
        data=data.dropna()
        self.target_texts=data.iloc[:,0].values
        self.source_texts=data.iloc[:, -1].values
        
        
    def setup(self, stage=None):
        X_train, X_val, y_train, y_val=train_test_split(self.source_texts, self.target_texts, test_size=self.split_size)
        self.train_dataset=(X_train, y_train)
        self.val_dataset=(X_val, y_val)
        
    def train_dataloader(self):
        train_data=NewsDataset(source_texts=self.train_dataset[0],
                               target_texts=self.train_dataset[1],
                               tokenizer=self.tokenizer,
                               source_len=self.source_len,
                               target_len=self.target_len)
        return DataLoader(train_data, self.batch_size, num_workers=0, shuffle=True, pin_memory=True)
    
    def val_dataloader(self):
        val_data=NewsDataset(source_texts=self.val_dataset[0],
                             target_texts=self.val_dataset[1],
                             tokenizer=self.tokenizer,
                             source_len=self.source_len,
                             target_len=self.target_len)
        return DataLoader(val_data, self.batch_size, num_workers=0, pin_memory=True)
        

In [312]:
class fintuner(pl.LightningModule):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.training_losses = []   # store losses per batch
        self.validation_losses = [] # store val losses per batch

    def forward(self, input_ids, attention_mask, decoder_attention_mask=None, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )
        return outputs.loss 
    
    def _step(self, batch):
        source_input_ids, source_attention_mask, target_input_ids, target_attention_mask = batch
        loss = self(
            input_ids=source_input_ids,
            attention_mask=source_attention_mask,
            decoder_attention_mask=target_attention_mask,
            labels=target_input_ids
        )
        return loss
    
    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.training_losses.append(loss.detach())   # save batch loss
        return {"loss": loss}
    
    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.validation_losses.append(loss.detach()) # save batch val loss
        return {"val_loss": loss}
    
    def on_train_epoch_end(self):
        if self.training_losses:  # aggregate batch losses
            avg_loss = torch.stack(self.training_losses).mean()
            self.log('train_loss', avg_loss, prog_bar=True, logger=True)
            self.training_losses.clear()
    
    def on_validation_epoch_end(self):
        if self.validation_losses:
            avg_val_loss = torch.stack(self.validation_losses).mean()
            self.log('val_loss', avg_val_loss, prog_bar=True, logger=True)
            self.validation_losses.clear()
        
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer, mode='min', factor=0.1, patience=3
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss'
        }
   

In [313]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer=AutoTokenizer.from_pretrained('t5-small')

model=AutoModelForSeq2SeqLM.from_pretrained('t5-small')


In [314]:
dataloader=NewsDataLoader(tokenizer=tokenizer, file_path='news_summary.csv', batch_size=4,
                          val_split_size=0.3, column_name=['text', 'ctext'])

dataloader.prepare_data()
dataloader.setup()

In [315]:
model=fintuner(model=model, tokenizer=tokenizer)


In [316]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [317]:
checkpoint_callbacks=ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

logger=TensorBoardLogger('lightning_logs', name='summary')



In [318]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
early_stop_callback=EarlyStopping(monitor='val_loss', patience=5, verbose=False, mode='min')

In [319]:
trainer=pl.Trainer(check_val_every_n_epoch=1, max_epochs=1, accelerator='gpu', callbacks=[early_stop_callback, checkpoint_callbacks],
                   logger=logger)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [320]:
torch.cuda.empty_cache()
if __name__ == "__main__":   # ✅ add this guard
    trainer.fit(model, dataloader)


  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
0         Modules in train mode
277       Modules in eval mode


Epoch 0: 100%|██████████| 173/173 [11:55<00:00,  0.24it/s, v_num=4, val_loss=1.800]

Epoch 0, global step 173: 'val_loss' reached 1.80413 (best 1.80413), saving model to '/Users/siddanthapusandeep/Data Science/Projects/Text-Summarization/checkpoints/best-checkpoint.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 173/173 [12:01<00:00,  0.24it/s, v_num=4, val_loss=1.800]
