In [1]:
!pip install transformers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the labeled reviews CSV into `df`.
# NOTE(review): the following cell reassigns `df` from
# "combined_shuffled_reviews.csv", so later cells do not use this frame.
df = pd.read_csv(filepath_or_buffer="filtered_reviews.csv")


In [3]:
import pandas as pd

# Load the labeled dataset
df = pd.read_csv("combined_shuffled_reviews.csv")

# Strip surrounding whitespace so whitespace-only reviews become empty strings.
df['text'] = df['text'].str.strip()

# Drop rows with a missing text or label. A missing value is not equal to '',
# so the emptiness filter below would otherwise keep those rows, and they
# would later reach the tokenizer / int() conversion in the dataset class.
df = df.dropna(subset=['text', 'label'])

# Remove rows whose text is empty after stripping.
df = df[df['text'] != '']

# Confirm the changes
print(df.head())  # Check the first few rows of the cleaned frame


                                                text  label
0  This product is incredible! It works perfectly...      0
1  I bought this phone for my daughter and she sa...      0
2  I love my phone! I have taken it all over the ...      1
3  The mobile is Ok but in the description said t...      1
4  I had to return because it was not compatible ...      0


In [12]:
# Number of rows in the cleaned frame.
print(df.shape[0])

8068


In [4]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable across runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

In [5]:
# Load the pretrained tokenizer that matches the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path="roberta-base"
)

# Custom dataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes one dataframe row per requested item.

    Rows are addressed by position. Each item is built from the row's
    "text" and "label" columns; the text is passed to `tokenizer` with
    truncation and padding to `max_length`.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        # Only references are stored; tokenization happens in __getitem__.
        self.data, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Return flattened input_ids / attention_mask and the int label for row `index`."""
        row = self.data.iloc[index]
        tokenized = self.tokenizer(
            row["text"],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
            return_attention_mask=True,
        )
        # flatten() removes the leading dimension added by return_tensors="pt".
        item = {key: tokenized[key].flatten() for key in ("input_ids", "attention_mask")}
        item["labels"] = int(row["label"])
        return item

# Wrap both splits in CustomDataset and batch them for training / validation.
max_length = 128  # Adjust as needed
train_dataset = CustomDataset(dataframe=train_df, tokenizer=tokenizer, max_length=max_length)
val_dataset = CustomDataset(dataframe=val_df, tokenizer=tokenizer, max_length=max_length)

# Only the training loader shuffles; the validation loader keeps row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)


In [6]:
import os

import torch
from transformers import RobertaForSequenceClassification
import torch.nn as nn

# Initialize the RoBERTa model with a 2-label classification head and move it
# to the GPU when one is available.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up optimizer and loss function.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the defaults documented for the
# transformers implementation so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
# The training loop uses the loss returned by the model; `criterion` is kept
# for the evaluation cell, which computes the loss from the logits.
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10  # Adjust as needed
model_save_path = r'C:\Users\Fusion\Desktop\projectmain\checkpoints2\model_epoch_{}.pt'

# Make sure the checkpoint directory exists before the first save.
checkpoint_dir = os.path.dirname(model_save_path)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# Checkpoint 0 holds the weights before any fine-tuning step.
torch.save(model.state_dict(), model_save_path.format(0))

for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Save a state_dict checkpoint after each epoch, in the same format as
    # checkpoint 0, so every model_epoch_N.pt is a file torch.load can read.
    torch.save(model.state_dict(), model_save_path.format(epoch + 1))
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss / len(train_loader)}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Training Loss: 0.37037395725100497
Epoch 2/10, Training Loss: 0.23322859325787365
Epoch 3/10, Training Loss: 0.16838579456607747
Epoch 4/10, Training Loss: 0.13022782371637623
Epoch 5/10, Training Loss: 0.09539358664677469
Epoch 6/10, Training Loss: 0.07920610303771544
Epoch 7/10, Training Loss: 0.07205143959810703
Epoch 8/10, Training Loss: 0.05169751124704628
Epoch 9/10, Training Loss: 0.05068056273718173
Epoch 10/10, Training Loss: 0.049644925160446475


In [7]:
# Evaluate on the validation loader: accumulate the loss and collect the true
# and predicted labels for the metrics below.
val_loss = 0
all_labels, all_preds = [], []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        labels = batch["labels"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # No labels are passed here, so the loss comes from `criterion`.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        val_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

report = classification_report(all_labels, all_preds, target_names=["ChatGPT", "Human"])
accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Loss: {val_loss / len(val_loader)}")
print(f"Accuracy: {accuracy}")
print(report)


Validation Loss: 0.1936718777270462
Accuracy: 0.959727385377943
              precision    recall  f1-score   support

     ChatGPT       0.96      0.96      0.96       808
       Human       0.96      0.96      0.96       806

    accuracy                           0.96      1614
   macro avg       0.96      0.96      0.96      1614
weighted avg       0.96      0.96      0.96      1614



In [8]:
# Write the fine-tuned model to the "fine_tuned_roberta1" directory.
model.save_pretrained(save_directory="fine_tuned_roberta1")
