Importing modules

In [3]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq torch --progress-bar off
!pip install -qqq tqdm --progress-bar off
!pip install -qqq transformers --progress-bar off
!pip install -qqq peft --progress-bar off
!pip install -qqq optuna --progress-bar off
!pip install -qqq sentencepiece --progress-bar off
!pip install -qqq rouge --progress-bar off
!pip install -qqq --upgrade jupyter --progress-bar off
!pip install -qqq jupyterlab==4.1.1 --progress-bar off

In [4]:
import pandas as pd
import torch
import os
from tqdm import tqdm
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, AdamW, PegasusForConditionalGeneration, PegasusTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge

Defining custom dataset class

In [6]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=1024, max_output_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["article"]
        summary = self.data.iloc[index]["highlights"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}


Loading the dataset

In [7]:
train_df = pd.read_csv("../../datasets/train_1k.csv")
test_df = pd.read_csv("../../datasets/test_100.csv")
val_df = pd.read_csv("../../datasets/validation_1k.csv")

Loading the pegasus tokenizer and model

In [8]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining the training and validation datasets

In [9]:
train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)

Defining collate function

In [10]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}


Defining the train and val dataloaders

In [11]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1,collate_fn=collate_fn)

Define the LoRA config

In [12]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["k_proj", "v_proj", "q_proj", "out_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ",
)

model = get_peft_model(model, config)

Defining the optimizer and learning rate scheduler

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=10000)

Guaging the steps per epoch as per the batch size

In [14]:
len(train_loader),len(val_loader)

(1000, 1000)

Fine tuning the model

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(10):
    train_loss = 0
    for step,batch in tqdm(enumerate(train_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for step,batch in tqdm(enumerate(val_loader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            val_loss += loss.item()
            
        val_loss /= len(val_loader)
    model.train()
    print(f"Epoch {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f}")

1000it [09:49,  1.70it/s]
1000it [04:17,  3.89it/s]


Epoch 1 train loss: 11.4501 val loss: 10.4625


1000it [09:51,  1.69it/s]
1000it [04:17,  3.89it/s]


Epoch 2 train loss: 10.0071 val loss: 9.4445


1000it [09:52,  1.69it/s]
1000it [04:17,  3.89it/s]


Epoch 3 train loss: 9.2176 val loss: 8.2180


1000it [09:52,  1.69it/s]
1000it [04:17,  3.89it/s]


Epoch 4 train loss: 8.3542 val loss: 7.7465


1000it [09:52,  1.69it/s]
1000it [04:17,  3.89it/s]


Epoch 5 train loss: 7.9649 val loss: 7.4103


1000it [09:52,  1.69it/s]
1000it [04:17,  3.89it/s]


Epoch 6 train loss: 7.6774 val loss: 7.1344


1000it [09:52,  1.69it/s]
1000it [04:17,  3.88it/s]


Epoch 7 train loss: 7.4424 val loss: 6.9117


1000it [09:52,  1.69it/s]
1000it [04:17,  3.89it/s]


Epoch 8 train loss: 7.2565 val loss: 6.7364


1000it [09:52,  1.69it/s]
1000it [04:17,  3.89it/s]


Epoch 9 train loss: 7.1359 val loss: 6.6338


1000it [09:52,  1.69it/s]
1000it [04:17,  3.89it/s]

Epoch 10 train loss: 7.0725 val loss: 6.6056





Saving the fine tuned model

In [16]:
model.save_pretrained("./serving_models/fine_tuned_pegasus")
tokenizer.save_pretrained("./serving_models/fine_tuned_pegasus")

('./serving_models/fine_tuned_pegasus/tokenizer_config.json',
 './serving_models/fine_tuned_pegasus/special_tokens_map.json',
 './serving_models/fine_tuned_pegasus/spiece.model',
 './serving_models/fine_tuned_pegasus/added_tokens.json')

Loading the fine tuned model and tokenizer

In [17]:
%pwd

'/root/Git_Repo/NLP/42850-nlp/models/PEGASUS'

In [18]:
model = PegasusForConditionalGeneration.from_pretrained("./serving_models/fine_tuned_pegasus")
tokenizer = PegasusTokenizer.from_pretrained("./serving_models/fine_tuned_pegasus")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining the test dataset

In [19]:
test_dataset = SummarizationDataset(test_df, tokenizer)

Defining the test dataloader

In [20]:
test_loader = DataLoader(test_dataset, batch_size=1,collate_fn=collate_fn)

Generating predictions on the test dataset

In [21]:
len(test_loader)

100

In [22]:
model.to(device)
model.eval()
predictions = []
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        output_ids = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_length=128, decoder_start_token_id=tokenizer.pad_token_id)
        batch_predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
        predictions.extend(batch_predictions)

100it [06:37,  3.98s/it]


In [23]:
len(predictions)

100

Writing predicted summary to a new test set

In [24]:
print(len(test_df))
test_df["predictions"] = predictions

100


In [27]:
test_df.to_csv("../../results/test_PEGASUS.csv", index=False)

Checking the generated summaries.

In [25]:
test_df

Unnamed: 0,id,article,highlights,predictions
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,Many economy seats on United Airlines have 30 ...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Intoxicated Rahul Kumar, 17, climbed into the ..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,New two-year deal to remain at Nottingham Fore...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,New Liverpool target Neto is also wanted by PS...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",Group of Jenner's step-children will be in Arm...
...,...,...,...,...
95,64ee7c9eb9f1efbb7da0ce80498434c623615b84,As Zlatan Ibrahimovic famously believes the Wo...,Zlatan Ibrahimovic will line up against former...,Settlement of the Champions League semi-final ...
96,5cf4682cd03238d5867027ce9492b626cd1ed011,"Jameela spent £3,000 on having all her amalgam...","Jameela Jamil, 29, is convinced dental work tr...",President of British Dental Association says t...
97,3815d19af18ff22be6ad6095722d7367bb7271af,A paramedic who pretended he was gay to get cl...,"Christopher Bridger, 25, attacked three women ...","Doctor Christopher Bridger, 25, sexually assau..."
98,fb207604ffa7e8371c622840445825db8993d4d2,"Paris Saint-Germain face Nice on Saturday, hop...",Paris Saint-Germain captain Thiago Silva suffe...,Group of key players are missing for Paris Sai...


Evaluating Performance via Rouge Metrics(ROUGE-1,ROUGE-2 and ROUGE-L)

In [26]:
# Calculate the ROUGE scores between the predicted summaries and the actual summaries
rouge = Rouge()
scores = rouge.get_scores(predictions, test_df["highlights"].tolist(), avg=True)

# Print the ROUGE scores
print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")

ROUGE-1: 0.3397
ROUGE-2: 0.1352
ROUGE-L: 0.3184
