## DA5 - Text Summarization using BERT
Goutham Krishnan 21BAI1007

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
class NewsDataset(Dataset):
    
    def __init__(self, dataframe, tokenizer, input_length, summary_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.input_length = input_length
        self.summary_length = summary_length
        self.summary = self.data.text
        self.article = self.data.ctext

    def __len__(self):
        return len(self.summary)

    def __getitem__(self, idx):
        article = str(self.article[idx])
        article = ' '.join(article.split())

        summary = str(self.summary[idx])
        summary = ' '.join(summary.split())

        source = self.tokenizer.batch_encode_plus(
            [article], 
            max_length=self.input_length, 
            pad_to_max_length=True,
            return_tensors='pt'
        )
        
        target = self.tokenizer.batch_encode_plus(
            [summary], 
            max_length=self.summary_length, 
            pad_to_max_length=True,
            return_tensors='pt'
        )

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [3]:
def train_epoch(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for batch_idx, batch in enumerate(loader, 0):
        target_ids = batch['target_ids'].to(device, dtype=torch.long)
        input_ids = batch['source_ids'].to(device, dtype=torch.long)
        attention_mask = batch['source_mask'].to(device, dtype=torch.long)
        
        decoder_input = target_ids[:, :-1].contiguous()
        
        labels = target_ids[:, 1:].clone().detach()
        labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input,
            labels=labels
        )
        loss = outputs[0]
        
        if batch_idx % 500 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [4]:
def generate_summaries(epoch, tokenizer, model, device, loader):
    model.eval()
    generated_summaries = []
    reference_summaries = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader, 0):
            target_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention_mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            
            predictions = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) 
                         for g in generated_ids]
            actual = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                     for t in target_ids]
            
            if batch_idx % 100 == 0:
                print(f'Processed {batch_idx} batches')

            generated_summaries.extend(predictions)
            reference_summaries.extend(actual)
            
    return generated_summaries, reference_summaries

In [5]:
CONFIG = {
    'train_batch_size': 2,
    'valid_batch_size': 2,
    'train_epochs': 2,
    'val_epochs': 1,
    'learning_rate': 1e-4,
    'max_input_length': 512,
    'max_summary_length': 150,
    'random_seed': 42
}

torch.manual_seed(CONFIG['random_seed'])
np.random.seed(CONFIG['random_seed'])
torch.backends.cudnn.deterministic = True

In [6]:
# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
    
# Load and preprocess data
df = pd.read_csv('/kaggle/input/news-summary/news_summary.csv', encoding='latin-1')
df = df[['text', 'ctext']]
df.ctext = 'summarize: ' + df.ctext
    
# Split data
train_size = 0.8
train_data = df.sample(frac=train_size, random_state=CONFIG['random_seed']).reset_index(drop=True)
val_data = df.drop(train_data.index).reset_index(drop=True)
    
# Create datasets
train_dataset = NewsDataset(train_data, tokenizer, CONFIG['max_input_length'], CONFIG['max_summary_length'])
val_dataset = NewsDataset(val_data, tokenizer, CONFIG['max_input_length'], CONFIG['max_summary_length'])
    
# Create dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG['train_batch_size'],
    shuffle=True,
    num_workers=0
)
    
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG['valid_batch_size'],
    shuffle=False,
    num_workers=0
)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['learning_rate'])
    
# Training loop
print('Starting model fine-tuning...')
for epoch in range(CONFIG['train_epochs']):
    train_epoch(epoch, tokenizer, model, device, train_loader, optimizer)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Starting model fine-tuning...




Epoch: 0, Loss: 7.238336086273193
Epoch: 0, Loss: 1.5305217504501343
Epoch: 0, Loss: 1.5214146375656128
Epoch: 0, Loss: 1.441329002380371
Epoch: 1, Loss: 2.829712390899658
Epoch: 1, Loss: 1.1933395862579346
Epoch: 1, Loss: 1.338054895401001
Epoch: 1, Loss: 1.1567189693450928


In [8]:
## Save the model
torch.save(model.state_dict(), "summarization_mode.pth")

In [9]:
print('Generating summaries for validation set...')
for epoch in range(CONFIG['val_epochs']):
    predictions, actuals = generate_summaries(epoch, tokenizer, model, device, val_loader)
    results_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
    results_df.to_csv('predictions.csv')
    print('Results saved to predictions.csv')

Generating summaries for validation set...




Processed 0 batches
Processed 100 batches
Processed 200 batches
Processed 300 batches
Processed 400 batches
Results saved to predictions.csv


### Generate sample summaries

In [13]:
for i in range(10):
    print(f"Text:\n{results_df.loc[i, 'Actual Text']}")
    print(f"Summary:\n{results_df.loc[i, 'Generated Text']}")
    print() 

Text:
PM Narendra Modi on Thursday launched Ude Desh ka Aam Nagrik (UDAN) scheme for regional flight connectivity by flagging off the inaugural flight from Shimla to Delhi. Under UDAN, government will connect small towns by air with 50% plane seats' fare capped at?2,500 for a one-hour journey of 500 kilometres. UDAN will connect over 45 unserved and under-served airports.
Summary:
the first UDAN flight took off from Shimla on Monday after being flagged off by Prime Minister Narendra Modi. The flight will be operated by Alliance Air, the regional arm of Air India. The scheme seeks to make flying more affordable for the common people.

Text:
BJP chief Amit Shah on Wednesday slammed Delhi Chief Minister Arvind Kejriwal after AAP blamed EVMs for their loss in the MCD polls. "Let Kejriwal first tell the public how he won in Delhi in 2015 with these same EVMs..." Shah said. "UPA one and two had also won elections with the same EVMs," the BJP leader added.
Summary:
BJP chief Amit Shah on Wedn