In [None]:
# Install the necessary libraries
!pip install transformers datasets

import torch
import time
from torch.utils.data import Dataset, DataLoader
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, AdamW)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# Load the dataset (replace the placeholder paths with your own CSV files).
# Both files must provide a 'text' column, which is used as the model input.
fake_df = pd.read_csv('/path/to/your/fake_news.csv')
true_df = pd.read_csv('/path/to/your/true_news.csv')

# Add labels (0 = fake, 1 = true)
fake_df['label'] = 0
true_df['label'] = 1

# Combine the datasets
df = pd.concat([fake_df, true_df], ignore_index=True)

# Drop rows without text: pandas reads empty CSV fields as NaN, and a NaN
# value cannot be tokenized later on.
df = df.dropna(subset=['text']).reset_index(drop=True)

# Split into train and test. Stratifying on the label keeps the fake/true
# ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Define the model to use: 'bert-base-uncased', 'roberta-base', or 'gpt2'
model_name = 'bert-base-uncased'  # Change this to 'roberta-base' or 'gpt2' for other models

# Load pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# If using GPT-2, set pad_token as eos_token since GPT-2 does not have a padding token
if 'gpt2' in model_name:
    tokenizer.pad_token = tokenizer.eos_token

# Define a custom dataset class
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Texts to classify; a pandas Series or any indexable sequence.
        labels: Integer class labels aligned position-by-position with
            ``texts``; a pandas Series or any indexable sequence.
        tokenizer: Callable tokenizer (for example a Hugging Face tokenizer)
            accepting the keyword arguments passed in ``__getitem__``.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):  # Adjust max_len if needed
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, index):
        """Return the element at position ``index`` of a Series or a plain sequence."""
        # A Series coming out of a shuffled split no longer has a 0..n-1 index,
        # so positional access must go through .iloc.
        if hasattr(values, 'iloc'):
            return values.iloc[index]
        return values[index]

    def __getitem__(self, index):
        """Tokenize the example at ``index``.

        Returns:
            A dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``label`` tensor of dtype ``torch.long``.
        """
        text = self._item_at(self.texts, index)
        label = self._item_at(self.labels, index)

        # Missing values (None, NaN, pd.NA) become an empty string and any
        # other non-string value is stringified, so the tokenizer always
        # receives a str.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch dimension; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(label), dtype=torch.long),
        }

# Create Dataset and DataLoader for training and testing
train_dataset = FakeNewsDataset(X_train, y_train, tokenizer, max_len=128)
test_dataset = FakeNewsDataset(X_test, y_test, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Adjust batch size for memory
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Load the pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Set the pad_token_id in the model configuration if using GPT-2, matching
# the pad token chosen for the tokenizer
if 'gpt2' in model_name:
    model.config.pad_token_id = tokenizer.eos_token_id

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up optimizer. torch.optim.AdamW is used instead of the deprecated
# AdamW class shipped with transformers; eps and weight_decay are pinned to
# that class's defaults so the optimization behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Initialize the GradScaler for mixed precision training. Loss scaling only
# applies on CUDA, so it is disabled (a pass-through) when running on CPU.
scaler = GradScaler(enabled=(device.type == "cuda"))

# Helper function to calculate evaluation metrics
def calculate_metrics(y_true, y_pred, y_score=None):
    """Compute classification metrics for a set of predictions.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        y_score: Optional continuous scores for the positive class (for
            example probabilities). When given, ROC-AUC is computed from
            these instead of from the hard labels in ``y_pred``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1, roc_auc)``. Precision,
        recall and F1 are weighted averages. ``roc_auc`` is ``nan`` when
        ``roc_auc_score`` rejects the input with a ``ValueError`` (for
        example when ``y_true`` contains a single class).
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same value as the default for classes that
    # are never predicted, but without emitting a warning on every call.
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    auc_input = y_pred if y_score is None else y_score
    try:
        roc_auc = roc_auc_score(y_true, auc_input)
    except ValueError:
        # ROC-AUC is undefined here; report NaN rather than abort the run
        # after the model has already been trained and evaluated.
        roc_auc = float('nan')
    return accuracy, precision, recall, f1, roc_auc

# Training function with mixed precision and gradient accumulation
def train(model, data_loader, optimizer, device, accumulation_steps=2):
    """Run one training epoch with mixed precision and gradient accumulation.

    Uses the module-level ``scaler`` for loss scaling.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        data_loader: Sized iterable of batches, each a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer updating the model's parameters.
        device: Device the batch tensors are moved to.
        accumulation_steps: Number of batches whose gradients are accumulated
            before each optimizer step.

    Returns:
        The mean per-batch loss over the epoch, or ``0.0`` if the loader
        yields no batches.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError(f"accumulation_steps must be >= 1, got {accumulation_steps}")

    model.train()
    total_loss = 0.0
    num_batches = len(data_loader)

    optimizer.zero_grad()
    for step, batch in enumerate(tqdm(data_loader, desc="Training"), start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass with mixed precision
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Backward pass; dividing by accumulation_steps makes the accumulated
        # gradient an average over the window instead of a sum.
        scaler.scale(loss / accumulation_steps).backward()

        # Step at the end of every accumulation window, and also on the last
        # batch so gradients from a trailing partial window are applied
        # instead of being silently discarded.
        if step % accumulation_steps == 0 or step == num_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    return total_loss / num_batches if num_batches else 0.0

# Evaluation function
def evaluate(model, data_loader, device):
    """Collect the model's class predictions and the true labels for a loader.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(predictions, true_labels)`` of lists, one entry per
        example, in the order the loader yielded them.
    """
    model.eval()
    predicted = []
    expected = []

    progress = tqdm(data_loader, desc="Evaluating")
    # Gradients are not needed for inference.
    with torch.no_grad():
        for batch in progress:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits

            # The predicted class is the index of the largest logit per row.
            batch_predictions = logits.argmax(dim=1)
            predicted.extend(batch_predictions.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    return predicted, expected

# Collect one result row per epoch
results = []

# The parameter count does not change during training, so compute it once
# instead of on every epoch.
num_params = sum(p.numel() for p in model.parameters())

# Training and evaluation loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Time the training process. perf_counter() is monotonic, so the measured
    # duration is not affected by system clock adjustments.
    start_train_time = time.perf_counter()
    train_loss = train(model, train_loader, optimizer, device)
    train_time = time.perf_counter() - start_train_time
    print(f"Training loss: {train_loss}")

    # Evaluate on the test set
    start_inference_time = time.perf_counter()
    test_predictions, test_true_labels = evaluate(model, test_loader, device)
    inference_time = time.perf_counter() - start_inference_time

    # Calculate metrics
    test_acc, test_prec, test_rec, test_f1, test_roc_auc = calculate_metrics(test_true_labels, test_predictions)

    # Collect results
    results.append({
        'Epoch': epoch + 1,
        'Num Parameters': num_params,
        'Train Time (s)': train_time,
        'Inference Time (s)': inference_time,
        'Test Accuracy': test_acc,
        'Test Precision': test_prec,
        'Test Recall': test_rec,
        'Test F1': test_f1,
        'Test ROC-AUC': test_roc_auc
    })

# Convert results to DataFrame and save (replace the placeholder path)
results_df = pd.DataFrame(results)
results_df.to_csv('/path/to/save/your_results.csv', index=False)

# Print the results
print(results_df)
