In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
data = pd.read_csv('/content/drive/MyDrive/Reviews.csv')

data = data[['Text', 'Summary']].dropna()

data['Text'] = data['Text'].str.lower()
data['Summary'] = data['Summary'].str.lower()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [8]:
# Divide the dataset into training and testing (75:25)
data_used, waste_data = train_test_split(data, test_size=0.6, random_state=42)
train_data, test_data = train_test_split(data_used, test_size=0.25, random_state=42)

In [9]:
# Implement a custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=40):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        input_ids = self.tokenizer.encode(text, truncation=True, max_length=self.max_length, padding='max_length')
        labels = self.tokenizer.encode(summary, truncation=True, max_length=self.max_length, padding='max_length')

        return torch.tensor(input_ids), torch.tensor(labels)

# Initialize datasets and dataloaders
train_dataset = ReviewDataset(train_data['Text'], train_data['Summary'], tokenizer)
test_dataset = ReviewDataset(test_data['Text'], test_data['Summary'], tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)



In [10]:
torch.cuda.empty_cache()

In [11]:
# Fine-tune the GPT-2 model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 3
tokenizer.pad_token = tokenizer.eos_token

torch.cuda.empty_cache()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    c=0
    for input_ids, labels in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        c+=1
        if c%500==0:
            print(c)
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        torch.cuda.empty_cache()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss}')

# Save the fine-tuned model
model.save_pretrained('fine_tuned_gpt2_model')


cuda


Epoch 1/3:   9%|▉         | 499/5329 [03:11<31:31,  2.55it/s]

500


Epoch 1/3:  19%|█▊        | 999/5329 [06:24<27:55,  2.58it/s]

1000


Epoch 1/3:  28%|██▊       | 1499/5329 [09:38<24:34,  2.60it/s]

1500


Epoch 1/3:  38%|███▊      | 1999/5329 [12:52<21:48,  2.54it/s]

2000


Epoch 1/3:  47%|████▋     | 2499/5329 [16:06<18:07,  2.60it/s]

2500


Epoch 1/3:  56%|█████▋    | 2999/5329 [19:20<15:03,  2.58it/s]

3000


Epoch 1/3:  66%|██████▌   | 3499/5329 [22:34<12:00,  2.54it/s]

3500


Epoch 1/3:  75%|███████▌  | 3999/5329 [25:48<08:27,  2.62it/s]

4000


Epoch 1/3:  84%|████████▍ | 4499/5329 [29:02<05:28,  2.53it/s]

4500


Epoch 1/3:  94%|█████████▍| 4999/5329 [32:15<02:07,  2.58it/s]

5000


Epoch 1/3: 100%|██████████| 5329/5329 [34:23<00:00,  2.58it/s]


Average training loss: 0.9952735262777014


Epoch 2/3:   9%|▉         | 499/5329 [03:13<30:56,  2.60it/s]

500


Epoch 2/3:  19%|█▊        | 999/5329 [06:26<28:24,  2.54it/s]

1000


Epoch 2/3:  28%|██▊       | 1499/5329 [09:40<24:27,  2.61it/s]

1500


Epoch 2/3:  38%|███▊      | 1999/5329 [12:54<21:16,  2.61it/s]

2000


Epoch 2/3:  47%|████▋     | 2499/5329 [16:07<18:35,  2.54it/s]

2500


Epoch 2/3:  56%|█████▋    | 2999/5329 [19:20<14:47,  2.62it/s]

3000


Epoch 2/3:  66%|██████▌   | 3499/5329 [22:34<11:46,  2.59it/s]

3500


Epoch 2/3:  75%|███████▌  | 3999/5329 [25:48<08:27,  2.62it/s]

4000


Epoch 2/3:  84%|████████▍ | 4499/5329 [29:01<05:16,  2.62it/s]

4500


Epoch 2/3:  94%|█████████▍| 4999/5329 [32:15<02:08,  2.56it/s]

5000


Epoch 2/3: 100%|██████████| 5329/5329 [34:23<00:00,  2.58it/s]


Average training loss: 0.9461870716937512


Epoch 3/3:   9%|▉         | 499/5329 [03:13<30:45,  2.62it/s]

500


Epoch 3/3:  19%|█▊        | 999/5329 [06:27<28:52,  2.50it/s]

1000


Epoch 3/3:  28%|██▊       | 1499/5329 [09:41<24:26,  2.61it/s]

1500


Epoch 3/3:  38%|███▊      | 1999/5329 [12:55<21:11,  2.62it/s]

2000


Epoch 3/3:  47%|████▋     | 2499/5329 [16:09<18:35,  2.54it/s]

2500


Epoch 3/3:  56%|█████▋    | 2999/5329 [19:23<14:52,  2.61it/s]

3000


Epoch 3/3:  66%|██████▌   | 3499/5329 [22:37<11:42,  2.61it/s]

3500


Epoch 3/3:  75%|███████▌  | 3999/5329 [25:51<08:58,  2.47it/s]

4000


Epoch 3/3:  84%|████████▍ | 4499/5329 [29:05<05:15,  2.63it/s]

4500


Epoch 3/3:  94%|█████████▍| 4999/5329 [32:19<02:06,  2.61it/s]

5000


Epoch 3/3: 100%|██████████| 5329/5329 [34:27<00:00,  2.58it/s]


Average training loss: 0.9356498283166826


In [12]:
model.save_pretrained('fine_tuned_gpt2_model_batch')

In [13]:
!pip install rouge
from rouge import Rouge

def calculate_rouge_scores(hypotheses, references):
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores



Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [14]:
from transformers.utils import logging
logging.set_verbosity_error()
list_of_rouge = []

if torch.cuda.is_available():
    torch.cuda.empty_cache()

# for input_ids, labels in tqdm(test_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
for input_ids, labels in tqdm(test_dataloader):

    input_ids = input_ids.to(device)
    labels = labels.to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)
    pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(input_ids, attention_mask=attention_mask, pad_token_id=pad_token_id, max_new_tokens=10)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    labels_text = tokenizer.decode(labels[0], skip_special_tokens=True)
    roug_score = calculate_rouge_scores(output_text, labels_text)
    list_of_rouge.append(roug_score)
    torch.cuda.empty_cache()

100%|██████████| 1777/1777 [03:29<00:00,  8.49it/s]


In [15]:
final_rouge = {'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}
for rg in list_of_rouge:
    final_rouge['rouge-1']['r']+=rg['rouge-1']['r']
    final_rouge['rouge-1']['p']+=rg['rouge-1']['p']
    final_rouge['rouge-1']['f']+=rg['rouge-1']['f']
    final_rouge['rouge-2']['r']+=rg['rouge-2']['r']
    final_rouge['rouge-2']['p']+=rg['rouge-2']['p']
    final_rouge['rouge-2']['f']+=rg['rouge-2']['f']
    final_rouge['rouge-l']['r']+=rg['rouge-l']['r']
    final_rouge['rouge-l']['p']+=rg['rouge-l']['p']
    final_rouge['rouge-l']['f']+=rg['rouge-l']['f']

final_rouge['rouge-1']['r']/=len(list_of_rouge)
final_rouge['rouge-1']['p']/=len(list_of_rouge)
final_rouge['rouge-1']['f']/=len(list_of_rouge)
final_rouge['rouge-2']['r']/=len(list_of_rouge)
final_rouge['rouge-2']['p']/=len(list_of_rouge)
final_rouge['rouge-2']['f']/=len(list_of_rouge)
final_rouge['rouge-l']['r']/=len(list_of_rouge)
final_rouge['rouge-l']['p']/=len(list_of_rouge)
final_rouge['rouge-l']['f']/=len(list_of_rouge)

print(final_rouge)

{'rouge-1': {'f': 0.08405307834185982, 'p': 0.051347044412385134, 'r': 0.3027681743296919}, 'rouge-2': {'f': 0.02033616108539318, 'p': 0.012603284370296461, 'r': 0.08554887596751025}, 'rouge-l': {'f': 0.07967384391155471, 'p': 0.04859775332820371, 'r': 0.2897710090147396}}
