In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.2-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.2


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as graphnn
from sklearn.metrics import f1_score
from torch_geometric.loader import DataLoader
from torch_geometric.utils import scatter
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold
from itertools import product

In [3]:
from torch_geometric.datasets import TUDataset

dataset_en = TUDataset(root='', name='ENZYMES',use_node_attr = True)
dataset_rd = TUDataset(root='', name='REDDIT-BINARY')
dataset_pr = TUDataset(root='', name='PROTEINS')

Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/REDDIT-BINARY.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip
Processing...
Done!


In [4]:
print(len(dataset_en))
print(len(dataset_rd))
print(len(dataset_pr))

print(dataset_en.num_classes)
print(dataset_rd.num_classes)
print(dataset_pr.num_classes)

print(dataset_en.num_node_features)
print(dataset_rd.num_node_features)
print(dataset_pr.num_node_features)

600
2000
1113
6
2
2
21
0
3


In [5]:
from torch_geometric.loader import DataLoader
train_en = dataset_en[:int(len(dataset_en)*0.8)]
val_en = dataset_en[int(len(dataset_en)*0.8):int(len(dataset_en)*0.9)]
test_en = dataset_en[int(len(dataset_en)*0.9):]

train_en_loader = DataLoader(train_en, batch_size=32, shuffle=True)
val_en_loader = DataLoader(val_en, batch_size=32, shuffle=False)
test_en_loader = DataLoader(test_en, batch_size=32, shuffle=False)

In [36]:
class MLP1(torch.nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(MLP1, self).__init__()
        self.fc_global = nn.Linear(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, data):
      x = self.fc_global(data.x)
      x = self.relu(x)
      x = self.fc(x)
      x = graphnn.global_mean_pool(x, data.batch)
      return x

In [34]:
class MLP1(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP1, self).__init__()

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, data):
        x = self.fc1(data.x)
        x = graphnn.global_add_pool(x, data.batch)
        x = self.fc2(x)
        return x

In [16]:
def train(model, loss_fcn, device, optimizer, max_epochs, train_dataloader, val_dataloader, patience=30):
    best_val_score = 0
    patience_counter = 0
    metrics_history = {'train_loss': [], 'val_loss': [], 'f1_micro': [], 'f1_macro': [], 'accuracy': [], 'best_score':[]}

    for epoch in range(max_epochs):
        model.train()
        train_losses = []
        for batch in train_dataloader:
            if batch.x is None:
                raise ValueError("Node features are missing. Ensure data.x is correctly set.")
            batch = batch.to(device)
            optimizer.zero_grad()
            logits = model(batch)
            loss = loss_fcn(logits, batch.y)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        val_loss = evaluate_loss(model, loss_fcn, device, val_dataloader)
        f1_micro, f1_macro, accuracy = evaluate_metrics(model, device, val_dataloader)

        # Save metrics
        metrics_history['train_loss'].append(np.mean(train_losses))
        metrics_history['val_loss'].append(val_loss)
        metrics_history['f1_micro'].append(f1_micro)
        metrics_history['f1_macro'].append(f1_macro)
        metrics_history['accuracy'].append(accuracy)

        print(f"Epoch {epoch+1}, Train Loss: {np.mean(train_losses):.4f}, Val Loss: {val_loss:.4f}, F1 Micro: {f1_micro:.4f}, F1 Macro: {f1_macro:.4f}, Accuracy: {accuracy:.4f}")

        # Early stopping logic using f1_micro score
        if f1_micro > best_val_score:
            best_val_score = f1_micro
            metrics_history['best_score'] = best_val_score
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    return metrics_history

In [17]:
def evaluate_loss(model, loss_fcn, device, dataloader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            outputs = model(batch)
            loss = loss_fcn(outputs, batch.y)
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [18]:
def evaluate_metrics(model, device, dataloader):
    model.eval()
    total_preds = []
    total_targets = []

    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            outputs = model(batch)
            _, predicted = torch.max(outputs, 1)
            total_preds.extend(predicted.view(-1).cpu().numpy())
            total_targets.extend(batch.y.view(-1).cpu().numpy())

    f1_micro = f1_score(total_targets, total_preds, average='micro')
    f1_macro = f1_score(total_targets, total_preds, average='macro')
    accuracy = accuracy_score(total_targets, total_preds)
    return f1_micro, f1_macro, accuracy

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("\nDevice: ", device)

### Max number of epochs
max_epochs = 500
n_features = dataset_en.num_node_features
n_classes = dataset_en.num_classes


Device:  cuda


In [None]:
baseline = MLP1(
    input_size=n_features, hidden_size=256, output_size=n_classes
).to(device)

### DEFINE LOSS FUNCTION
loss_fcn = nn.CrossEntropyLoss()
### DEFINE OPTIMIZER
optimizer = torch.optim.Adam(baseline.parameters(), lr=0.005)

### TRAIN THE MODEL
metrics_history=train(
    baseline,
    loss_fcn,
    device,
    optimizer,
    max_epochs,
    train_en_loader,
    val_en_loader,
)

Epoch 1, Train Loss: 222.7368, Val Loss: 956.3276, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 2, Train Loss: 97.3466, Val Loss: 833.8297, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 3, Train Loss: 56.3178, Val Loss: 782.7109, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 4, Train Loss: 58.0872, Val Loss: 847.0623, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 5, Train Loss: 54.6008, Val Loss: 755.0434, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 6, Train Loss: 23.8182, Val Loss: 677.7715, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 7, Train Loss: 33.5637, Val Loss: 657.1805, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 8, Train Loss: 39.5702, Val Loss: 639.0893, F1 Micro: 0.1500, F1 Macro: 0.1364, Accuracy: 0.1500
Epoch 9, Train Loss: 27.9771, Val Loss: 581.0397, F1 Micro: 0.0000, F1 Macro: 0.0000, Accuracy: 0.0000
Epoch 10, Train Loss: 15.2881, Val Loss: 552.8737, F1 Micro: 0.1000, F1 

In [20]:
def plot_metrics(metrics_history):
    epochs = range(1, len(metrics_history['train_loss']) + 1)

    plt.figure(figsize=(14, 10))

    plt.subplot(2, 2, 1)
    plt.plot(epochs, metrics_history['train_loss'], label='Train Loss')
    plt.plot(epochs, metrics_history['val_loss'], label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(2, 2, 2)
    plt.plot(epochs, metrics_history['f1_micro'], label='F1 Score (Micro)')
    plt.plot(epochs, metrics_history['f1_macro'], label='F1 Score (Macro)')
    plt.title('F1 Scores')
    plt.xlabel('Epochs')
    plt.ylabel('F1 Score')
    plt.legend()

    plt.subplot(2, 2, 3)
    plt.plot(epochs, metrics_history['accuracy'], label='Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [37]:
from sklearn.model_selection import KFold
from itertools import product

# Outer k-fold cross-validation setup
outer_k_folds = 2
inner_k_folds = 2
num_epochs = 500

# Possible hyperparameters to tune
learning_rates = [0.01, 0.001]
batch_sizes = [8, 16]
patiences = [500]

# Set list to store the evaluation metrics
f1_micro_test_list = []
f1_macro_test_list = []
accuracy_test_list = []

# Prepare the outer k-fold cross-validation
outer_kf = KFold(n_splits=outer_k_folds, shuffle=True, random_state=42)

# Loop over each fold for the outer k-fold
for fold, (train_val_idx, test_idx) in enumerate(outer_kf.split(dataset_en)):
    print(f"Outer FOLD {fold}")
    print("--------------------------------")

    # Split dataset into train_val and test for the current outer fold
    train_val_dataset = dataset_en[train_val_idx]
    test_dataset = dataset_en[test_idx]

    # Initialize the best hyperparameter set and its performance score
    best_hyperparams = None
    best_score = 0

    # Inner k-fold cross-validation for hyperparameter tuning
    inner_kf = KFold(n_splits=inner_k_folds, shuffle=True, random_state=42)

    # Create all combinations of hyperparameters
    all_params = list(product(learning_rates, batch_sizes, patiences))

    # Loop over all combinations of hyperparameters
    for params in all_params:
        lr, batch_size, patience = params
        inner_scores = []

        # Perform inner k-fold cross-validation
        for inner_fold, (inner_train_idx, inner_val_idx) in enumerate(inner_kf.split(train_val_dataset)):
            print(f"Inner FOLD {inner_fold}")
            print(f"Hyperparameters: LR={lr}, Batch Size={batch_size}, Patience={patience}")

            # Split dataset into inner train and validation sets
            inner_train_dataset = train_val_dataset[inner_train_idx]
            inner_val_dataset = train_val_dataset[inner_val_idx]

            # Define train and validation dataloaders for the current inner fold
            inner_train_loader = DataLoader(inner_train_dataset, batch_size=batch_size, shuffle=True)
            inner_val_loader = DataLoader(inner_val_dataset, batch_size=batch_size, shuffle=False)

            # Initialize model and optimizer for the current inner fold
            model = MLP1(
                input_size=dataset_en.num_node_features,
                hidden_size=256,
                output_size=dataset_en.num_classes
            ).to(device)

            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            loss_fcn = torch.nn.CrossEntropyLoss()

            # Train the model for the current inner fold
            inner_metrics = train(model, loss_fcn, device, optimizer, num_epochs, inner_train_loader, inner_val_loader, patience)

            # Evaluate model performance, e.g., using validation F1 score
            # Save the model performance score for the current hyperparameter combination
            inner_scores.append(inner_metrics['best_score'])

        # Calculate the average performance over all inner folds for the current hyperparameter set
        average_score = np.mean(inner_scores)
        print(f"Average Score for hyperparameters {params}: {average_score}")

        # If the current hyperparameters outperform the previous ones, update the best_hyperparams
        if average_score > best_score:
            best_hyperparams = params
            best_score = average_score

    print(f"Best hyperparameters for Outer FOLD {fold}: {best_hyperparams} with score {best_score}")

    # Now retrain the model on the full train_val_dataset with the best_hyperparams

    # Extract best hyperparameters
    best_lr, best_batch_size, best_patience = best_hyperparams

    # DataLoader for the combined training and validation set
    train_val_loader = DataLoader(train_val_dataset, batch_size=best_batch_size, shuffle=True)

    # DataLoader for the test set
    test_loader = DataLoader(test_dataset, batch_size=best_batch_size, shuffle=False)

    # Initialize the model with the best hyperparameters
    model = MLP1(
        input_size=dataset_en.num_node_features,
        hidden_size=256,
        output_size=dataset_en.num_classes
    ).to(device)

    # Initialize the optimizer with the best learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)

    # Loss function
    loss_fcn = torch.nn.CrossEntropyLoss()

    # Retrain the model on the full train_val_dataset
    retrained_metrics = train(
        model,
        loss_fcn,
        device,
        optimizer,
        num_epochs,
        train_val_loader,
        test_loader,  # We're using the test_loader here to monitor the performance, but we do not use this for making decisions
        best_patience
    )

    # After retraining, evaluate on the test set
    f1_micro_test, f1_macro_test, accuracy_test = evaluate_metrics(model, device, test_loader)
    print(f"Test set evaluation - F1 Micro: {f1_micro_test:.4f}, F1 Macro: {f1_macro_test:.4f}, Accuracy: {accuracy_test:.4f}")
    f1_micro_test_list.append(f1_micro_test)
    f1_macro_test_list.append(f1_macro_test)
    accuracy_test_list.append(accuracy_test)
    # Optionally, save your retrained model
    torch.save(model.state_dict(), f'Basic_model_fold_{fold}.pth')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 26, Train Loss: 1.5857, Val Loss: 1.7741, F1 Micro: 0.2667, F1 Macro: 0.2525, Accuracy: 0.2667
Epoch 27, Train Loss: 1.5740, Val Loss: 1.8044, F1 Micro: 0.2600, F1 Macro: 0.2316, Accuracy: 0.2600
Epoch 28, Train Loss: 1.5613, Val Loss: 1.8102, F1 Micro: 0.2400, F1 Macro: 0.1872, Accuracy: 0.2400
Epoch 29, Train Loss: 1.5952, Val Loss: 1.7346, F1 Micro: 0.2533, F1 Macro: 0.2197, Accuracy: 0.2533
Epoch 30, Train Loss: 1.5544, Val Loss: 1.7548, F1 Micro: 0.2933, F1 Macro: 0.2392, Accuracy: 0.2933
Epoch 31, Train Loss: 1.5626, Val Loss: 1.7575, F1 Micro: 0.2667, F1 Macro: 0.2500, Accuracy: 0.2667
Epoch 32, Train Loss: 1.5711, Val Loss: 2.0149, F1 Micro: 0.2600, F1 Macro: 0.2043, Accuracy: 0.2600
Epoch 33, Train Loss: 1.6006, Val Loss: 1.8293, F1 Micro: 0.2433, F1 Macro: 0.2098, Accuracy: 0.2433
Epoch 34, Train Loss: 1.5645, Val Loss: 1.8483, F1 Micro: 0.2433, F1 Macro: 0.2133, Accuracy: 0.2433
Epoch 35, Train Loss: 1.56

In [38]:
print(np.mean(f1_micro_test_list))
print(np.mean(f1_macro_test_list))
print(np.mean(accuracy_test_list))

0.5016666666666667
0.5022956600878323
0.5016666666666667
