In [None]:
import gc

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW

# File Path
dataset_path = "/content/total_hate_speech_rephrasing_dataset.csv"

# Upper bound on the number of rows used for fine-tuning; lower it for quicker
# experiments.
max_rows = 100_000

# Load the CSV and map its column headers onto the names RephraseDataset reads.
df = pd.read_csv(dataset_path)
df = df.rename(columns={
    "Hateful Sentence": "input_text",
    "Neutral Rephrased Sentence": "target_text",
})

# Keep the first `max_rows` rows, then drop rows whose source or target
# sentence is missing: a NaN cell would otherwise be formatted into the
# training text as the literal string "nan". This also fails fast with a
# KeyError if the expected columns are absent from the CSV.
df = df.head(max_rows).dropna(subset=["input_text", "target_text"])

# Define Custom Dataset
class RephraseDataset(Dataset):
    """Serve "rephrase: <input> -> <target>" strings tokenized to a fixed length.

    Args:
        data: Table exposing ``input_text`` and ``target_text`` columns and
            positional row access through ``.iloc``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``; its
            result must provide ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length=64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build and tokenize the training text for the row at position ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` entries, each
            with the tokenizer's leading axis removed.
        """
        # One positional lookup serves both columns.
        row = self.data.iloc[idx]
        source_text = f"rephrase: {row['input_text']} ->"
        input_text = f"{source_text} {row['target_text']}"

        encoded = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Assumes the tokenizer returns tensors shaped (1, max_length).
        # squeeze(0) removes only that leading axis; a bare squeeze() would
        # also collapse the sequence axis when max_length == 1.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        }

# Pre-trained checkpoint shared by the tokenizer and the language model
model_name = "gpt2"

# Tokenizer: reuse the end-of-sequence token as the padding token
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Model: record the same token id as the padding id in its config
model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id

# Wrap the dataframe in the custom dataset and serve it in shuffled batches
dataset = RephraseDataset(df, tokenizer)
batch_size = 16  # examples per batch
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # NOTE(review): the OOM recorded for this cell shows the GPU almost fully
    # allocated before a 148 MiB request, which presumably comes from objects
    # left over from earlier runs in the same session -- confirm. Collecting
    # garbage and releasing PyTorch's cached blocks returns whatever is no
    # longer referenced; memory still held by live objects is NOT freed, so
    # restart the runtime if the error persists.
    gc.collect()
    torch.cuda.empty_cache()

# Move the model to the target device before the optimizer is constructed so
# the optimizer is built from the parameters as they live on that device.
model.to(device)

# Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
epochs = 1  # number of full passes over the data
gradient_accumulation_steps = 2  # micro-batches accumulated per optimizer update
model.train()

# Batch count is fixed for the whole run; used to flush the final partial
# accumulation group.
num_batches = len(data_loader)

for epoch in range(epochs):
    loop = tqdm(data_loader, leave=True)
    loop.set_description(f"Epoch {epoch + 1}")  # constant within an epoch
    optimizer.zero_grad()

    for step, batch in enumerate(loop):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # The pad token is the EOS token, so passing input_ids straight through
        # as labels would train the model on every padded position. Label -100
        # is the index the language-modeling loss ignores, so masked positions
        # contribute nothing to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Scale so the accumulated gradient matches the mean over the group.
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        # Step once per accumulation group, and once more on the last batch so
        # a trailing partial group is not dropped.
        if (step + 1) % gradient_accumulation_steps == 0 or step == num_batches - 1:
            optimizer.step()
            optimizer.zero_grad()

        # Report the unscaled per-batch loss.
        loop.set_postfix(loss=loss.item() * gradient_accumulation_steps)

# Write the fine-tuned model and the tokenizer into the same output directory
model_save_path = "/content/gpt2_fine_tuned_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

print(f"Fine-tuned model saved at {model_save_path}")




OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 89.06 MiB is free. Process 6349 has 14.66 GiB memory in use. Of the allocated memory 14.28 GiB is allocated by PyTorch, and 256.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)