In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
from transformers import BertTokenizer, BertForPreTraining, BertForSequenceClassification
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
# Load the BERT tokenizer and a BERT sequence-classification model from the
# same checkpoint. The classification head is sized for two labels.
MODEL_NAME = "robzchhangte/10-bert-uncased-datav4"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [None]:
# Load the reviews CSV and encode the string labels as integer class ids.
LABEL_TO_ID = {'CG': 0, 'OR': 1}

df = pd.read_csv('fake_reviews_dataset.csv')

# Series.map turns every value that is missing from the mapping into NaN.
# Fail loudly here instead of letting NaN labels reach the loss function.
unknown_label_mask = ~df['label'].isin(list(LABEL_TO_ID))
if unknown_label_mask.any():
    raise ValueError(
        "Unexpected values in 'label' column: "
        f"{df.loc[unknown_label_mask, 'label'].unique().tolist()!r}"
    )

df['label'] = df['label'].map(LABEL_TO_ID)

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# List the distinct values in the 'label' column after the mapping above
df['label'].unique()

In [None]:
# The review text is read from the 'text_' column and the class ids from 'label'
texts = df['text_'].tolist()
labels = df['label'].tolist()

# Hold out 20% of the examples for evaluation; the fixed seed keeps the split
# identical across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Tokenize each split exactly once. (The previous version also tokenized and
# padded the full, unsplit corpus into a tensor that nothing used.)
train_inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors='pt')
test_inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')

In [None]:
# Create PyTorch datasets
class CustomDataset(Dataset):
    """Pair tokenized model inputs with their labels for use with a DataLoader.

    Args:
        inputs: Mapping from input name to an indexable container holding one
            entry per example (tensors or nested lists both work).
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for example ``idx``.

        ``features`` is a dict with the same keys as ``inputs``; every value
        and the label are returned as independent tensors.
        """
        features = {key: self._to_tensor(val[idx]) for key, val in self.inputs.items()}
        return features, self._to_tensor(self.labels[idx])

    @staticmethod
    def _to_tensor(value):
        """Copy ``value`` into a new tensor without the copy-construct warning."""
        # torch.tensor(existing_tensor) emits a UserWarning on every call;
        # clone().detach() is the recommended way to copy a tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

# Wrap each tokenized split together with its labels
train_dataset = CustomDataset(inputs=train_inputs, labels=train_labels)
test_dataset = CustomDataset(inputs=test_inputs, labels=test_labels)

# Batch both splits; only the training batches are shuffled
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)

In [None]:
import torch

# Report which GPU ids PyTorch can see, or say that we are on CPU
if not torch.cuda.is_available():
    print("CUDA is not available. Using CPU.")
else:
    # One id per visible CUDA device, counted from zero
    num_devices = torch.cuda.device_count()
    device_ids = [*range(num_devices)]

    print("Available GPU device IDs:", device_ids)


In [None]:
# GPUs requested for data-parallel training. Ids that do not exist on this
# machine are dropped, because torch.nn.DataParallel raises on an invalid
# device id. On a CPU-only machine the list ends up empty.
REQUESTED_DEVICE_IDS = [0, 1, 2, 3]
device_ids = [i for i in REQUESTED_DEVICE_IDS if i < torch.cuda.device_count()]
# The model is wrapped in torch.nn.DataParallel in the optimizer setup cell.

In [None]:
# Training configuration
num_epochs = 3  # Adjust as needed

# Choose the device that holds the batches and the primary copy of the model.
# Only GPU ids that exist on this machine are used; if none of the requested
# ids is valid, fall back to every visible GPU.
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    usable_device_ids = [i for i in device_ids if 0 <= i < gpu_count] or list(range(gpu_count))
    device = torch.device("cuda", usable_device_ids[0])
else:
    usable_device_ids = None
    device = torch.device("cpu")

# Unwrap first so that re-running this cell does not nest DataParallel wrappers.
if isinstance(model, torch.nn.DataParallel):
    model = model.module

# DataParallel requires the wrapped module's parameters to already live on
# the first device in device_ids, so move the model before wrapping it.
model.to(device)
model = torch.nn.DataParallel(model, device_ids=usable_device_ids)

# Set up optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_function = torch.nn.CrossEntropyLoss()

# Per-epoch history, filled in by the training loop and read by the plots
train_losses = []
test_accuracies = []

In [None]:
# Batches must be on the same device as the model's parameters. Reading the
# device from the model keeps this cell working without a separately defined
# `device` variable.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    epoch_train_losses = []

    for features, targets in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/ {num_epochs}"):
        inputs = {key: val.to(device) for key, val in features.items()}
        labels = targets.to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = loss_function(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        epoch_train_losses.append(loss.item())

    avg_train_loss = np.mean(epoch_train_losses)
    train_losses.append(avg_train_loss)

    # ---- Evaluation pass on the held-out split ----
    model.eval()
    test_preds = []
    test_true = []

    with torch.no_grad():
        for features, targets in tqdm(test_dataloader, desc=f"Epoch {epoch + 1}/ {num_epochs} - Testing"):
            inputs = {key: val.to(device) for key, val in features.items()}

            outputs = model(**inputs)
            preds = outputs.logits.argmax(dim=1)

            test_preds.extend(preds.cpu().numpy())
            # The labels are only compared on the host, so they are never
            # moved to the model's device.
            test_true.extend(targets.cpu().numpy())

    accuracy = accuracy_score(test_true, test_preds)
    f1 = f1_score(test_true, test_preds, average='weighted')
    test_accuracies.append(accuracy)

    # F1 is computed every epoch, so report it alongside the accuracy.
    print(
        f"Epoch {epoch + 1}/{num_epochs} - Training Loss: {avg_train_loss:.4f}, "
        f"Testing Accuracy: {accuracy:.4f}, Testing F1: {f1:.4f}"
    )

# Save the trained model if needed. When the model is wrapped in DataParallel,
# save the underlying module so the keys carry no "module." prefix.
# to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
# torch.save(to_save.state_dict(), 'bert_classification_model.pth')

In [None]:
# Visualize the loss and accuracy.
# The x-axes are derived from the recorded history rather than num_epochs, so
# the plot still works if the training cell was run more than once and the
# history lists hold more than num_epochs entries.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 5))

loss_ax.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
loss_ax.set_title('Training Loss Over Epochs')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(range(1, len(test_accuracies) + 1), test_accuracies, label='Testing Accuracy')
acc_ax.set_title('Testing Accuracy Over Epochs')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Weighted F1 score from the last evaluation pass of the training loop
f1