In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
data = pd.read_csv('Reviews.csv')

data = data[['Text', 'Summary']].dropna()

data['Text'] = data['Text'].str.lower()
data['Summary'] = data['Summary'].str.lower()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [3]:
# Divide the dataset into training and testing (75:25)
data_used, waste_data = train_test_split(data, test_size=0.6, random_state=42)
train_data, test_data = train_test_split(data_used, test_size=0.25, random_state=42)

In [7]:
# Implement a custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=40):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        input_ids = self.tokenizer.encode(text, truncation=True, max_length=self.max_length, padding='max_length')
        labels = self.tokenizer.encode(summary, truncation=True, max_length=self.max_length, padding='max_length')
        
        return torch.tensor(input_ids), torch.tensor(labels)

# Initialize datasets and dataloaders
train_dataset = ReviewDataset(train_data['Text'], train_data['Summary'], tokenizer)
test_dataset = ReviewDataset(test_data['Text'], test_data['Summary'], tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)



In [8]:
torch.cuda.empty_cache()

In [9]:
# Fine-tune the GPT-2 model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 1
tokenizer.pad_token = tokenizer.eos_token

torch.cuda.empty_cache()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    c=0
    for input_ids, labels in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        c+=1
        if c%500==0:
            print(c)
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        torch.cuda.empty_cache()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss}')

# Save the fine-tuned model
model.save_pretrained('fine_tuned_gpt2_model')


cuda


Epoch 1/1:   0%|          | 0/10658 [00:00<?, ?it/s]

Epoch 1/1:   5%|▍         | 499/10658 [02:13<45:22,  3.73it/s]

500


Epoch 1/1:   9%|▉         | 999/10658 [04:28<43:21,  3.71it/s]

1000


Epoch 1/1:  14%|█▍        | 1499/10658 [06:43<40:22,  3.78it/s]

1500


Epoch 1/1:  19%|█▉        | 1999/10658 [08:57<38:12,  3.78it/s]

2000


Epoch 1/1:  23%|██▎       | 2499/10658 [11:11<36:54,  3.68it/s]

2500


Epoch 1/1:  28%|██▊       | 2999/10658 [13:24<34:02,  3.75it/s]

3000


Epoch 1/1:  33%|███▎      | 3499/10658 [15:37<31:39,  3.77it/s]

3500


Epoch 1/1:  38%|███▊      | 3999/10658 [17:50<31:25,  3.53it/s]

4000


Epoch 1/1:  42%|████▏     | 4499/10658 [20:03<27:08,  3.78it/s]

4500


Epoch 1/1:  47%|████▋     | 4999/10658 [22:16<24:54,  3.79it/s]

5000


Epoch 1/1:  52%|█████▏    | 5499/10658 [24:28<22:46,  3.77it/s]

5500


Epoch 1/1:  56%|█████▋    | 5999/10658 [26:41<20:24,  3.80it/s]

6000


Epoch 1/1:  61%|██████    | 6499/10658 [28:53<18:18,  3.79it/s]

6500


Epoch 1/1:  66%|██████▌   | 6999/10658 [31:06<15:57,  3.82it/s]

7000


Epoch 1/1:  70%|███████   | 7499/10658 [33:18<13:51,  3.80it/s]

7500


Epoch 1/1:  75%|███████▌  | 7999/10658 [35:30<12:38,  3.51it/s]

8000


Epoch 1/1:  80%|███████▉  | 8499/10658 [37:43<09:28,  3.80it/s]

8500


Epoch 1/1:  84%|████████▍ | 8999/10658 [39:56<07:20,  3.77it/s]

9000


Epoch 1/1:  89%|████████▉ | 9499/10658 [42:08<05:09,  3.74it/s]

9500


Epoch 1/1:  94%|█████████▍| 9999/10658 [44:20<02:55,  3.76it/s]

10000


Epoch 1/1:  99%|█████████▊| 10499/10658 [46:34<00:43,  3.69it/s]

10500


Epoch 1/1: 100%|██████████| 10658/10658 [47:16<00:00,  3.76it/s]


Average training loss: 0.9562280519015571


In [10]:
model.save_pretrained('fine_tuned_gpt2_model_epoch')

In [11]:
!pip install rouge
from rouge import Rouge

def calculate_rouge_scores(hypotheses, references):
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores



In [12]:
from transformers.utils import logging
logging.set_verbosity_error()
list_of_rouge = []

if torch.cuda.is_available():
    torch.cuda.empty_cache()

# for input_ids, labels in tqdm(test_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
for input_ids, labels in tqdm(test_dataloader):
        
    input_ids = input_ids.to(device)
    labels = labels.to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)
    pad_token_id = tokenizer.eos_token_id
    
    outputs = model.generate(input_ids, attention_mask=attention_mask, pad_token_id=pad_token_id, max_new_tokens=10)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    labels_text = tokenizer.decode(labels[0], skip_special_tokens=True)
    roug_score = calculate_rouge_scores(output_text, labels_text)
    list_of_rouge.append(roug_score)
    torch.cuda.empty_cache()

100%|██████████| 3553/3553 [04:42<00:00, 12.58it/s]


In [13]:
final_rouge = {'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}
for rg in list_of_rouge:
    final_rouge['rouge-1']['r']+=rg['rouge-1']['r']
    final_rouge['rouge-1']['p']+=rg['rouge-1']['p']
    final_rouge['rouge-1']['f']+=rg['rouge-1']['f']
    final_rouge['rouge-2']['r']+=rg['rouge-2']['r']
    final_rouge['rouge-2']['p']+=rg['rouge-2']['p']
    final_rouge['rouge-2']['f']+=rg['rouge-2']['f']
    final_rouge['rouge-l']['r']+=rg['rouge-l']['r']
    final_rouge['rouge-l']['p']+=rg['rouge-l']['p']
    final_rouge['rouge-l']['f']+=rg['rouge-l']['f']

final_rouge['rouge-1']['r']/=len(list_of_rouge)
final_rouge['rouge-1']['p']/=len(list_of_rouge)
final_rouge['rouge-1']['f']/=len(list_of_rouge)
final_rouge['rouge-2']['r']/=len(list_of_rouge)
final_rouge['rouge-2']['p']/=len(list_of_rouge)
final_rouge['rouge-2']['f']/=len(list_of_rouge)
final_rouge['rouge-l']['r']/=len(list_of_rouge)
final_rouge['rouge-l']['p']/=len(list_of_rouge)
final_rouge['rouge-l']['f']/=len(list_of_rouge)

print(final_rouge)

{'rouge-1': {'f': 0.0841702417285644, 'p': 0.051084033100745045, 'r': 0.3079353399021223}, 'rouge-2': {'f': 0.020264212268463918, 'p': 0.012273728389982757, 'r': 0.0889187865602414}, 'rouge-l': {'f': 0.07986952382854193, 'p': 0.04838538159016817, 'r': 0.2949689895110542}}
