In [None]:
# Install necessary libraries
!pip install torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv

import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, GINConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np
from tqdm import tqdm

# Read both CSVs (replace with your dataset paths) and tag every row with its
# class as it is loaded: 0 = fake, 1 = true
fake_df = pd.read_csv('/path/to/your/fake_news.csv').assign(label=0)
true_df = pd.read_csv('/path/to/your/true_news.csv').assign(label=1)

# Stack the two frames into one table with a fresh 0..n-1 index
df = pd.concat([fake_df, true_df], ignore_index=True)

# Here you will need to define how you construct the graph,
# typically using TF-IDF or other features to build adjacency matrices for nodes (texts/articles)
# This step assumes you have already constructed node features and edge indices for your graph
# For illustration purposes, we will create some dummy features and edge indices
# Replace this with actual graph construction logic.

num_nodes = len(df)
node_features = np.random.rand(num_nodes, 128)  # Example feature vector for each node (128-dim)
edge_index = np.random.randint(0, num_nodes, (2, num_nodes * 5))  # Example random edges
labels = df['label'].values

# Split into train and test sets
train_idx, test_idx = train_test_split(np.arange(num_nodes), test_size=0.2, random_state=42)
train_labels, test_labels = labels[train_idx], labels[test_idx]


def _induced_edge_index(full_edge_index, node_idx, total_nodes):
    """Return the edges among ``node_idx``, renumbered to rows of the sliced features.

    Args:
        full_edge_index: Integer array of shape (2, num_edges) whose entries
            are node ids in ``0..total_nodes - 1``.
        node_idx: 1-D array of the node ids kept in this split; position ``i``
            in this array becomes node ``i`` of the subgraph.
        total_nodes: Number of nodes in the full graph.

    Returns:
        Integer array of shape (2, kept_edges) containing only the edges whose
        two endpoints are both in ``node_idx``, expressed in subgraph ids.
    """
    # position[g] is the row of global node g inside the split, or -1 if absent
    position = np.full(total_nodes, -1, dtype=np.int64)
    position[node_idx] = np.arange(len(node_idx))
    src = position[full_edge_index[0]]
    dst = position[full_edge_index[1]]
    keep = (src >= 0) & (dst >= 0)
    return np.stack([src[keep], dst[keep]])


# Create PyTorch geometric Data objects for train and test sets.
# Each split only carries its own rows of node_features, so its edges must be
# restricted to that split and renumbered; reusing the full-graph edge_index
# would index past the end of x.
train_edge_index = _induced_edge_index(edge_index, train_idx, num_nodes)
test_edge_index = _induced_edge_index(edge_index, test_idx, num_nodes)

train_data = Data(x=torch.tensor(node_features[train_idx], dtype=torch.float),
                  edge_index=torch.tensor(train_edge_index, dtype=torch.long),
                  y=torch.tensor(train_labels, dtype=torch.long))

test_data = Data(x=torch.tensor(node_features[test_idx], dtype=torch.float),
                 edge_index=torch.tensor(test_edge_index, dtype=torch.long),
                 y=torch.tensor(test_labels, dtype=torch.long))

train_loader = DataLoader([train_data], batch_size=1, shuffle=True)
test_loader = DataLoader([test_data], batch_size=1, shuffle=False)

# Define the GNN model class (supports GCN, GraphSAGE, GAT, GIN)
class GNN(torch.nn.Module):
    """Two-layer graph neural network that emits one logit vector per node.

    Args:
        model_type: Convolution family to use: 'gcn', 'graphsage', 'gat' or 'gin'.
        input_dim: Size of each input node feature vector.
        hidden_dim: Width of the hidden representation produced by both layers.
        output_dim: Number of classes (size of each output logit vector).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """

    _SUPPORTED_TYPES = ('gcn', 'graphsage', 'gat', 'gin')

    def __init__(self, model_type='gcn', input_dim=128, hidden_dim=64, output_dim=2):
        # Reject unknown names up front; otherwise conv1/conv2 would never be
        # created and the failure would only surface later inside forward().
        if model_type not in self._SUPPORTED_TYPES:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {self._SUPPORTED_TYPES}"
            )

        super().__init__()

        if model_type == 'gcn':
            self.conv1 = GCNConv(input_dim, hidden_dim)
            self.conv2 = GCNConv(hidden_dim, hidden_dim)
        elif model_type == 'graphsage':
            self.conv1 = SAGEConv(input_dim, hidden_dim)
            self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        elif model_type == 'gat':
            # The first layer is built with 8 heads, so the second layer is
            # given an input width of hidden_dim * 8.
            self.conv1 = GATConv(input_dim, hidden_dim, heads=8)
            self.conv2 = GATConv(hidden_dim * 8, hidden_dim)
        else:  # 'gin'
            self.conv1 = GINConv(self._mlp(input_dim, hidden_dim))
            self.conv2 = GINConv(self._mlp(hidden_dim, hidden_dim))

        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    @staticmethod
    def _mlp(in_dim, out_dim):
        """Build the Linear -> ReLU -> Linear network wrapped by each GIN layer."""
        return torch.nn.Sequential(torch.nn.Linear(in_dim, out_dim),
                                   torch.nn.ReLU(),
                                   torch.nn.Linear(out_dim, out_dim))

    def forward(self, data):
        """Compute class logits for every node of ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor with one row of ``output_dim`` logits per row of ``data.x``.
        """
        x, edge_index = data.x, data.edge_index

        # Two rounds of message passing, each followed by a ReLU
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))

        # No pooling over nodes: the labels are per node, so the classifier
        # must yield one prediction per node for the loss and the metrics to
        # line up with data.y.
        return self.fc(x)

# Choose the GNN model type: 'gcn', 'graphsage', 'gat', or 'gin'
model_type = 'gcn'  # Change this to 'graphsage', 'gat', or 'gin'

# Pick the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the network and place it on the chosen device in one step
model = GNN(model_type=model_type, input_dim=128, hidden_dim=64, output_dim=2).to(device)

# Adam over all of the model's parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Helper function to calculate evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Score hard class predictions against the ground truth.

    Args:
        y_true: Sequence of true class labels.
        y_pred: Sequence of predicted class labels, aligned with ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``. Precision,
        recall and F1 use the 'weighted' average. ``roc_auc`` is NaN when
        ``y_true`` contains fewer than two distinct classes.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    # ROC-AUC is undefined with a single class in y_true; report NaN instead
    # of letting the scorer abort and discard the metrics computed above.
    if len(set(y_true)) < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, roc_auc

# Training function
def train(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Network to update; called with each batch.
        data_loader: Iterable of batches exposing ``to(device)`` and ``y``.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of the per-batch cross-entropy losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []
    for batch in tqdm(data_loader, desc="Training"):
        batch = batch.to(device)

        # Clear stale gradients, then forward / backward / update
        optimizer.zero_grad()
        logits = model(batch)
        batch_loss = F.cross_entropy(logits, batch.y)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(data_loader)

# Evaluation function
def evaluate(model, data_loader, device):
    """Collect the model's class predictions and the matching labels.

    Args:
        model: Network to run in eval mode; called with each batch.
        data_loader: Iterable of batches exposing ``to(device)`` and ``y``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(predictions, true_labels)`` of two lists: the argmax over
        dim 1 of the model output for every batch, and the entries of each
        batch's ``y``, both accumulated in loader order.
    """
    model.eval()
    predictions = []
    true_labels = []
    # Gradients are not needed for scoring
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            batch = batch.to(device)
            logits = model(batch)
            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            true_labels.extend(batch.y.cpu().numpy())
    return predictions, true_labels

# One row of metrics per epoch is accumulated here
results = []

# The parameter count is the same for every epoch's row, so compute it once
num_params = sum(param.numel() for param in model.parameters())

# Training and evaluation loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # One optimisation pass over the training loader
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Training loss: {train_loss}")

    # Predict on the test loader and score the predictions
    test_predictions, test_true_labels = evaluate(model, test_loader, device)
    test_acc, test_prec, test_rec, test_f1, test_roc_auc = calculate_metrics(
        test_true_labels, test_predictions
    )

    # Record this epoch's row
    epoch_row = {
        'Epoch': epoch + 1,
        'Num Parameters': num_params,
        'Test Accuracy': test_acc,
        'Test Precision': test_prec,
        'Test Recall': test_rec,
        'Test F1': test_f1,
        'Test ROC-AUC': test_roc_auc,
    }
    results.append(epoch_row)

# Tabulate the rows, write them to disk without the index column, and show them
results_df = pd.DataFrame(results)
results_df.to_csv('/path/to/save/your_gnn_results.csv', index=False)
print(results_df)
