In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
data = pd.read_csv('/kaggle/input/Reviews.csv')

data = data[['Text', 'Summary']].dropna()

data['Text'] = data['Text'].str.lower()
data['Summary'] = data['Summary'].str.lower()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
# Divide the dataset into training and testing (75:25)
data_used, waste_data = train_test_split(data, test_size=0.6, random_state=42)
train_data, test_data = train_test_split(data_used, test_size=0.25, random_state=42)

In [4]:
# Implement a custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=40):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        input_ids = self.tokenizer.encode(text, truncation=True, max_length=self.max_length, padding='max_length')
        labels = self.tokenizer.encode(summary, truncation=True, max_length=self.max_length, padding='max_length')
        
        return torch.tensor(input_ids), torch.tensor(labels)

# Initialize datasets and dataloaders
train_dataset = ReviewDataset(train_data['Text'], train_data['Summary'], tokenizer)
test_dataset = ReviewDataset(test_data['Text'], test_data['Summary'], tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)



In [5]:
torch.cuda.empty_cache()

In [6]:
# Fine-tune the GPT-2 model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
epochs = 3
tokenizer.pad_token = tokenizer.eos_token

torch.cuda.empty_cache()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    c=0
    for input_ids, labels in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        c+=1
        if c%500==0:
            print(c)
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        torch.cuda.empty_cache()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss}')

# Save the fine-tuned model
model.save_pretrained('fine_tuned_gpt2_model')


cuda


Epoch 1/3:   5%|▍         | 500/10658 [01:14<24:04,  7.03it/s]

500


Epoch 1/3:   9%|▉         | 1000/10658 [02:26<23:14,  6.93it/s]

1000


Epoch 1/3:  14%|█▍        | 1500/10658 [03:39<21:53,  6.97it/s]

1500


Epoch 1/3:  19%|█▉        | 2000/10658 [04:51<20:30,  7.04it/s]

2000


Epoch 1/3:  23%|██▎       | 2500/10658 [06:04<19:28,  6.98it/s]

2500


Epoch 1/3:  28%|██▊       | 3000/10658 [07:16<18:32,  6.88it/s]

3000


Epoch 1/3:  33%|███▎      | 3500/10658 [08:28<17:15,  6.91it/s]

3500


Epoch 1/3:  38%|███▊      | 4000/10658 [09:40<15:42,  7.07it/s]

4000


Epoch 1/3:  42%|████▏     | 4500/10658 [10:53<14:48,  6.93it/s]

4500


Epoch 1/3:  47%|████▋     | 5000/10658 [12:05<13:44,  6.86it/s]

5000


Epoch 1/3:  52%|█████▏    | 5500/10658 [13:17<12:25,  6.92it/s]

5500


Epoch 1/3:  56%|█████▋    | 6000/10658 [14:29<11:13,  6.91it/s]

6000


Epoch 1/3:  61%|██████    | 6500/10658 [15:41<10:05,  6.86it/s]

6500


Epoch 1/3:  66%|██████▌   | 7000/10658 [16:54<08:50,  6.89it/s]

7000


Epoch 1/3:  70%|███████   | 7500/10658 [18:06<07:29,  7.02it/s]

7500


Epoch 1/3:  75%|███████▌  | 8000/10658 [19:18<06:20,  6.98it/s]

8000


Epoch 1/3:  80%|███████▉  | 8500/10658 [20:30<05:13,  6.89it/s]

8500


Epoch 1/3:  84%|████████▍ | 9000/10658 [21:42<03:57,  6.97it/s]

9000


Epoch 1/3:  89%|████████▉ | 9500/10658 [22:55<02:46,  6.94it/s]

9500


Epoch 1/3:  94%|█████████▍| 10000/10658 [24:07<01:35,  6.92it/s]

10000


Epoch 1/3:  99%|█████████▊| 10500/10658 [25:19<00:22,  6.97it/s]

10500


Epoch 1/3: 100%|██████████| 10658/10658 [25:42<00:00,  6.91it/s]


Average training loss: 0.9591378422101324


Epoch 2/3:   5%|▍         | 500/10658 [01:11<24:12,  6.99it/s]

500


Epoch 2/3:   9%|▉         | 1000/10658 [02:23<22:51,  7.04it/s]

1000


Epoch 2/3:  14%|█▍        | 1500/10658 [03:34<21:41,  7.04it/s]

1500


Epoch 2/3:  19%|█▉        | 2000/10658 [04:47<20:27,  7.05it/s]

2000


Epoch 2/3:  23%|██▎       | 2500/10658 [05:58<19:22,  7.02it/s]

2500


Epoch 2/3:  28%|██▊       | 3000/10658 [07:10<18:33,  6.88it/s]

3000


Epoch 2/3:  33%|███▎      | 3500/10658 [08:21<17:14,  6.92it/s]

3500


Epoch 2/3:  38%|███▊      | 4000/10658 [09:33<15:38,  7.10it/s]

4000


Epoch 2/3:  42%|████▏     | 4500/10658 [10:45<14:51,  6.91it/s]

4500


Epoch 2/3:  47%|████▋     | 5000/10658 [11:56<13:20,  7.07it/s]

5000


Epoch 2/3:  52%|█████▏    | 5500/10658 [13:08<12:09,  7.07it/s]

5500


Epoch 2/3:  56%|█████▋    | 6000/10658 [14:20<11:07,  6.98it/s]

6000


Epoch 2/3:  61%|██████    | 6500/10658 [15:31<10:07,  6.85it/s]

6500


Epoch 2/3:  66%|██████▌   | 7000/10658 [16:43<08:40,  7.02it/s]

7000


Epoch 2/3:  70%|███████   | 7500/10658 [17:55<07:43,  6.82it/s]

7500


Epoch 2/3:  75%|███████▌  | 8000/10658 [19:07<06:16,  7.06it/s]

8000


Epoch 2/3:  80%|███████▉  | 8500/10658 [20:19<05:06,  7.04it/s]

8500


Epoch 2/3:  84%|████████▍ | 9000/10658 [21:30<04:03,  6.80it/s]

9000


Epoch 2/3:  89%|████████▉ | 9500/10658 [22:41<02:45,  6.99it/s]

9500


Epoch 2/3:  94%|█████████▍| 10000/10658 [23:53<01:35,  6.89it/s]

10000


Epoch 2/3:  99%|█████████▊| 10500/10658 [25:04<00:22,  6.94it/s]

10500


Epoch 2/3: 100%|██████████| 10658/10658 [25:27<00:00,  6.98it/s]


Average training loss: 0.9255770837517969


Epoch 3/3:   5%|▍         | 500/10658 [01:11<24:32,  6.90it/s]

500


Epoch 3/3:   9%|▉         | 1000/10658 [02:24<24:51,  6.48it/s]

1000


Epoch 3/3:  14%|█▍        | 1500/10658 [03:35<21:49,  6.99it/s]

1500


Epoch 3/3:  19%|█▉        | 2000/10658 [04:47<20:51,  6.92it/s]

2000


Epoch 3/3:  23%|██▎       | 2500/10658 [05:58<19:20,  7.03it/s]

2500


Epoch 3/3:  28%|██▊       | 3000/10658 [07:10<18:23,  6.94it/s]

3000


Epoch 3/3:  33%|███▎      | 3500/10658 [08:21<17:11,  6.94it/s]

3500


Epoch 3/3:  38%|███▊      | 4000/10658 [09:32<15:47,  7.03it/s]

4000


Epoch 3/3:  42%|████▏     | 4500/10658 [10:44<14:32,  7.06it/s]

4500


Epoch 3/3:  47%|████▋     | 5000/10658 [11:55<13:08,  7.17it/s]

5000


Epoch 3/3:  52%|█████▏    | 5500/10658 [13:06<12:06,  7.10it/s]

5500


Epoch 3/3:  56%|█████▋    | 6000/10658 [14:18<11:06,  6.99it/s]

6000


Epoch 3/3:  61%|██████    | 6500/10658 [15:29<09:59,  6.94it/s]

6500


Epoch 3/3:  66%|██████▌   | 7000/10658 [16:40<08:48,  6.93it/s]

7000


Epoch 3/3:  70%|███████   | 7500/10658 [17:52<07:34,  6.94it/s]

7500


Epoch 3/3:  75%|███████▌  | 8000/10658 [19:03<06:16,  7.06it/s]

8000


Epoch 3/3:  80%|███████▉  | 8500/10658 [20:14<05:07,  7.01it/s]

8500


Epoch 3/3:  84%|████████▍ | 9000/10658 [21:26<03:53,  7.11it/s]

9000


Epoch 3/3:  89%|████████▉ | 9500/10658 [22:37<02:47,  6.91it/s]

9500


Epoch 3/3:  94%|█████████▍| 10000/10658 [23:49<01:33,  7.06it/s]

10000


Epoch 3/3:  99%|█████████▊| 10500/10658 [25:00<00:22,  6.97it/s]

10500


Epoch 3/3: 100%|██████████| 10658/10658 [25:23<00:00,  7.00it/s]


Average training loss: 0.9055519239537643


In [7]:
model.save_pretrained('fine_tuned_gpt2_model_lr')

In [8]:
!pip install rouge
from rouge import Rouge

def calculate_rouge_scores(hypotheses, references):
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [9]:
from transformers.utils import logging
logging.set_verbosity_error()
list_of_rouge = []

if torch.cuda.is_available():
    torch.cuda.empty_cache()

# for input_ids, labels in tqdm(test_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
for input_ids, labels in tqdm(test_dataloader):
        
    input_ids = input_ids.to(device)
    labels = labels.to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)
    pad_token_id = tokenizer.eos_token_id
    
    outputs = model.generate(input_ids, attention_mask=attention_mask, pad_token_id=pad_token_id, max_new_tokens=10)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    labels_text = tokenizer.decode(labels[0], skip_special_tokens=True)
    roug_score = calculate_rouge_scores(output_text, labels_text)
    list_of_rouge.append(roug_score)
    torch.cuda.empty_cache()

100%|██████████| 3553/3553 [03:03<00:00, 19.31it/s]


In [10]:
final_rouge = {'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}
for rg in list_of_rouge:
    final_rouge['rouge-1']['r']+=rg['rouge-1']['r']
    final_rouge['rouge-1']['p']+=rg['rouge-1']['p']
    final_rouge['rouge-1']['f']+=rg['rouge-1']['f']
    final_rouge['rouge-2']['r']+=rg['rouge-2']['r']
    final_rouge['rouge-2']['p']+=rg['rouge-2']['p']
    final_rouge['rouge-2']['f']+=rg['rouge-2']['f']
    final_rouge['rouge-l']['r']+=rg['rouge-l']['r']
    final_rouge['rouge-l']['p']+=rg['rouge-l']['p']
    final_rouge['rouge-l']['f']+=rg['rouge-l']['f']

final_rouge['rouge-1']['r']/=len(list_of_rouge)
final_rouge['rouge-1']['p']/=len(list_of_rouge)
final_rouge['rouge-1']['f']/=len(list_of_rouge)
final_rouge['rouge-2']['r']/=len(list_of_rouge)
final_rouge['rouge-2']['p']/=len(list_of_rouge)
final_rouge['rouge-2']['f']/=len(list_of_rouge)
final_rouge['rouge-l']['r']/=len(list_of_rouge)
final_rouge['rouge-l']['p']/=len(list_of_rouge)
final_rouge['rouge-l']['f']/=len(list_of_rouge)

print(final_rouge)

{'rouge-1': {'f': 0.0841702417285644, 'p': 0.051084033100745045, 'r': 0.3079353399021223}, 'rouge-2': {'f': 0.020264212268463918, 'p': 0.012273728389982757, 'r': 0.0889187865602414}, 'rouge-l': {'f': 0.07986952382854193, 'p': 0.04838538159016817, 'r': 0.2949689895110542}}
