This notebook trains and evaluates a PyTorch Geometric graph neural network, running on CUDA when a GPU is available.

## Documentation
+ [ModuleList](https://docs.pytorch.org/docs/stable/generated/torch.nn.ModuleList.html)

In [None]:
# SECTION: Define-GNN

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
import torch.optim as optim
import numpy as np
import random
import matplotlib.pyplot as plt
import numpy as np

class model(nn.Module):
    """Two-layer GraphConv network that emits one sigmoid score per node.

    Args:
        hidden_channels: Number of channels produced by the first graph
            convolution and consumed by the second one.
    """

    def __init__(self,  hidden_channels=64):
        super().__init__()
        # 1 input feature per node -> hidden_channels -> 1 output per node.
        self.conv1 = GraphConv(1, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, 1)

    def forward(self, data):
        """Run both convolutions over the graph(s) stored in ``data``.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``edge_weights``.

        Returns:
            Sigmoid-activated output of the second convolution, i.e. values
            in (0, 1).
        """
        edge_index = data.edge_index
        edge_weight = data.edge_weights
        hidden = F.relu(self.conv1(data.x, edge_index, edge_weight))
        scores = self.conv2(hidden, edge_index, edge_weight)
        # NOTE(review): conv2 has a single output channel, so this is presumably
        # [num_nodes, 1] rather than [num_nodes] - confirm it matches the target
        # shape handed to the loss function.
        return torch.sigmoid(scores)


def seed_fn(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, and switches cuDNN to deterministic mode.

    Args:
        seed: Integer-like seed (a Python ``int`` or a NumPy integer scalar).
    """
    # Coerce to a plain int: NumPy integer scalars are not ``int`` subclasses
    # and ``random.seed`` rejects unsupported seed types on recent Pythons.
    seed = int(seed)
    torch.manual_seed(seed)                     # Seed CPU
    torch.cuda.manual_seed_all(seed)            # Seed every GPU (ignored without CUDA)
    np.random.seed(seed)                        # Seed numpy
    random.seed(seed)                           # Seed python random
    torch.backends.cudnn.deterministic = True   # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False      # Disable non-deterministic autotuning


def loss_plotter(loss_epochs = None, epochs = None):
    """Plot the loss recorded at each epoch and return the figure.

    Args:
        loss_epochs: Sequence with one loss value per epoch.
        epochs: Number of epochs. When omitted it is inferred from
            ``len(loss_epochs)``; when given it must match that length.

    Returns:
        The matplotlib ``Figure`` holding the loss curve.

    Raises:
        ValueError: If ``loss_epochs`` is missing/empty or its length does not
            match ``epochs``.
    """
    if loss_epochs is None or len(loss_epochs) == 0:
        raise ValueError("loss_epochs must contain at least one loss value")
    y = np.round(np.asarray(loss_epochs, dtype=float), 10)
    if epochs is None:
        epochs = len(y)
    elif epochs != len(y):
        raise ValueError(
            f"epochs ({epochs}) does not match the number of loss values ({len(y)})"
        )
    x = np.arange(epochs)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(x, y, alpha=0.5)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.set_title('Loss over epochs')
    ax.grid(True, alpha=0.3)
    # Pin the axes to the data range, but skip degenerate (zero-width) limits,
    # which matplotlib would otherwise expand with a warning.
    y_max = float(np.nanmax(y))
    if y_max > 0:
        ax.set_ylim(0, y_max)
    if epochs > 1:
        ax.set_xlim(0, epochs - 1)
    fig.tight_layout()
    return fig



# Simple Model training

This model has only two `GraphConv` layers (an input layer and an output layer). It is the model used throughout the rest of this notebook.

In [None]:
import glob
from torch_geometric.loader import DataLoader
import time
import random
import torch 

def training_loop(model, dat_batched, loss_fn, optimizer, epochs=100, device=None):
    """Train ``model`` on graphs stored in a list of files.

    Every epoch each file in ``dat_batched`` is loaded, wrapped in a shuffled
    ``DataLoader`` (batch size 50) and used for one optimisation pass.

    Args:
        model: Network to train (switched to training mode).
        dat_batched: Iterable of file paths readable by ``torch.load``.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of passes over all files.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        Tuple ``(loss_history, best_loss, best_epoch)``: the summed batch loss
        of every epoch, the smallest of those sums, and the 1-based epoch that
        reached it (the latest one on ties). With ``epochs=0`` this is
        ``([], inf, None)``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    loss_history = []
    total_elapsed = 0
    # Initialised up front so the return value is defined even when no epoch
    # runs or the first epoch loss is NaN.
    best_loss, best_epoch = float('inf'), None
    for epoch in range(1, epochs + 1):
        start = time.time()
        epoch_loss = 0
        for path in dat_batched:
            # NOTE: weights_only=False unpickles arbitrary objects - only load
            # files from a trusted source.
            data = torch.load(path, weights_only=False)
            loader = DataLoader(data, batch_size=50, shuffle=True)
            for batch in loader:
                batch = batch.to(device)
                optimizer.zero_grad()
                out = model(batch)
                loss = loss_fn(out, batch.y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()   # Accumulate loss
        # Track the best epoch; "<=" keeps the latest epoch on ties.
        if epoch_loss <= best_loss:
            best_loss, best_epoch = epoch_loss, epoch
        loss_history.append(epoch_loss)
        elapsed = time.time() - start
        total_elapsed += elapsed
        # Report the accumulated epoch loss, not the last batch's loss tensor.
        print(f"Epoch {epoch}: Loss = {epoch_loss},  Elapsed time: {elapsed:.2f}")
    # Summary
    print(f'>> the total elapsed time with {epochs} epochs is {total_elapsed:.2f} seconds ( {total_elapsed/60:.2f} minutes)')
    return loss_history, best_loss, best_epoch


## Simple model validation

In [None]:

from torch_geometric.data import Batch

def validation_fn(model, files, loss_fn, device):
    """Evaluate ``model`` on every file and collect per-graph argmax positions.

    Each file is loaded with ``torch.load``, all of its graphs are merged into
    one batch, and a single forward pass is run without gradients.

    Args:
        model: Network to evaluate (switched to eval mode).
        files: Iterable of file paths, each holding a list of graphs.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar tensor.
        device: Device the merged batch is moved to.

    Returns:
        Tuple ``(true_idx, pred_idx, val_loss, total_graphs)``: the 1-based
        position of the largest target and of the largest prediction within
        each graph, the loss summed over files, and the number of graphs seen.
    """
    model.eval()
    target_positions = []
    predicted_positions = []
    running_loss = 0
    graph_count = 0
    with torch.no_grad():
        for file_path in files:
            graphs = torch.load(file_path, weights_only=False)
            graph_count += len(graphs)
            # One big batch per file: a single forward pass covers all graphs.
            merged = Batch.from_data_list(graphs).to(device)
            scores = model(merged)
            running_loss += loss_fn(scores, merged.y).item()
            # Split the concatenated nodes back into their graphs.
            for graph_id in range(merged.num_graphs):
                members = merged.batch == graph_id
                # argmax is 0-based; "+ 1" turns it into a 1-based position.
                target_positions.append(torch.argmax(merged.y[members]).item() + 1)
                predicted_positions.append(torch.argmax(scores[members]).item() + 1)
    return target_positions, predicted_positions, running_loss, graph_count

In [None]:
def accuracy(true, preds):
    """Format the share of positions where ``preds`` equals ``true``.

    Args:
        true: Sequence of reference labels.
        preds: Sequence of predicted labels, same length as ``true``.

    Returns:
        A string such as ``'>> Validation Accuracy: 66.67% (2/3)'``. Empty
        input is reported as ``0.00% (0/0)``.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # Use the arguments themselves; the notebook-level true_idx / pred_idx
    # variables must not override what the caller passed in.
    if len(true) != len(preds):
        raise ValueError(
            f"true and preds must have the same length ({len(true)} != {len(preds)})"
        )
    total = len(true)
    correct = sum(1 for expected, predicted in zip(true, preds) if expected == predicted)
    percentage = (correct / total) * 100 if total else 0.0
    return f'>> Validation Accuracy: {percentage:.2f}% ({correct}/{total})'

# Declare data paths. glob returns files in arbitrary order, so sort them to
# keep the training order (and therefore the seeded run) reproducible.
directory = '/home/mriveraceron/data/exp_20251125'
train_files = sorted(glob.glob(f'{directory}/TrainBatch_*.pt'))
valid_files = sorted(glob.glob(f'{directory}/ValBatch_*.pt'))

# Loss function and device
loss_fn = nn.MSELoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Number of epochs, shared by the training call and the loss plot
n_epochs = 200

# Set seed and build the network. The instance gets its own name so the
# `model` class is not overwritten and this cell can be re-run.
seed_fn(38)
gnn = model(hidden_channels=60).to(device)
optimizer = optim.Adam(gnn.parameters(), lr=0.01)

mod1_loss, mod1_BestLoss, mod1_BestEpoch = training_loop(gnn, train_files, loss_fn, optimizer, epochs=n_epochs)
# Validate on the held-out validation files rather than on the training files
true_idx, pred_idx, mod1_ValLoss, total_graphs = validation_fn(gnn, valid_files, loss_fn, device)
pf1 = accuracy(true_idx, pred_idx)
print(pf1)

# Plot the loss over epochs
fig = loss_plotter(mod1_loss, epochs = n_epochs)
fig.savefig('/home/mriveraceron/glv-research/plots/3December-loss.png', dpi=150, bbox_inches='tight')

# Understanding pytorch Dataloader


The following code demonstrates how to get the maximum value of the y tensor for each individual sample (graph) when using DataLoader for batching in PyTorch Geometric.

## Problem 1
When graphs are batched together using DataLoader, all node features and labels are concatenated. The `batch` vector indicates which nodes belong to which graph, but we need to extract the maximum y value for each original graph separately.

In [None]:
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader
import torch
from torch_geometric.nn import global_max_pool

# Original Data objects: five graphs with 10 nodes each. Graph i stores the
# value (i+1) * 10**(i+1) in its last y entry; y also contains a fixed 100.
dataset = []
for i in range(5):
    data = Data(x=torch.randn(10, 1), y=torch.tensor([1, 1, 100, 1, 1, 1, 1, 1, 1, (i+1)*(10**(i+1))]))
    dataset.append(data)

# Create DataLoader
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# How batches work
batch = Batch.from_data_list(dataset)                   # Create batch
print(f"batch.batch: {batch.batch}")                    # To which graph each node belongs
print(f"batch.y: {batch.y}")                            # All y values concatenated
print(f"Total nodes: {batch.num_nodes}")                # Number of nodes in the batch
print(f"Number of graphs: {batch.num_graphs}")          # Number of graphs in the batch

# Iterate through batches. The loop variable has its own name so the full
# `batch` built above is not overwritten.
vector = []
for mini_batch in loader:
    print(f"Graphs per batch: {mini_batch.num_graphs}")
    print(f"Total nodes in batch: {mini_batch.num_nodes}")   # x are the nodes
    print(f"x (nodes) shape: {mini_batch.x.shape}")
    print(f"y shape: {mini_batch.y.shape}")
    print(f"batch vector: {mini_batch.batch}")
    # global_max_pool expects a feature dimension, so add one and drop it again
    max_y_per_graph = global_max_pool(mini_batch.y.unsqueeze(-1), mini_batch.batch).squeeze(-1)
    vector.extend( max_y_per_graph.tolist() )
    print(f"Max y per graph: {max_y_per_graph}")
    print("---")

# A bare expression in the middle of a cell is not displayed, so print it.
print(f"Sorted max y per graph: {sorted(vector)}")


# How does unsqueeze work?
# Each size is printed explicitly: only the last expression of a cell is
# displayed automatically, so the intermediate sizes would otherwise be lost.
x = torch.tensor([1, 2, 3, 4])
print(x.size())              # torch.Size([4])
x = torch.unsqueeze(x, -1)   # add a trailing dimension of length 1
print(x.size())              # torch.Size([4, 1])
x = torch.squeeze(x, -1)     # remove that trailing dimension again
print(x.size())              # torch.Size([4])

## Problem 2

When batching graphs, all node features and targets are concatenated into a single large tensor. To aggregate node-level predictions back to graph-level predictions, we use the `scatter` function, which groups nodes by their graph assignment using the `batch.batch` vector.

In [None]:
# `scatter` was never imported in this notebook; bring it in from PyG.
from torch_geometric.utils import scatter

# index[k] names the group that data[k] belongs to
index = torch.tensor([0, 1, 0, 2, 1, 2])
data = torch.tensor([10, 20, 30, 40, 50, 60])
# Group 0 -> max(10, 30), group 1 -> max(20, 50), group 2 -> max(40, 60)
scatter(data, index, dim=0, reduce='max')  # Should be [30, 50, 60]

## Problem 3

We want the index of the element holding the maximum value in a tensor, which `torch.argmax` provides. On an individual tensor it can be called directly, but for batches produced by `DataLoader`/`Batch` the nodes of all graphs are concatenated, so we first select the nodes of each graph and then apply `torch.argmax` to each graph separately.

In [None]:
import torch
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader

# Use on individual tensor
a = torch.randn(4)
print(f'argmax of a: {torch.argmax(a)}')

# Usage on batched data: five graphs with 10 nodes / 10 targets each
dataset = []
for i in range(5):
    data = Data(x=torch.randn(10, 1), y=torch.randn(10) )
    dataset.append(data)

# Merge all graphs into a single Batch (no DataLoader involved here)
batch = Batch.from_data_list(dataset).to(device)
print(f'the number of graphs are {batch.num_graphs}')

# Iterate through the graphs of the batch
vector = []
for i in range(batch.num_graphs):
    mask = batch.batch == i  # nodes belonging to graph i
    graph_y = batch.y[mask]  # 10 targets per graph
    idx_max = torch.argmax(graph_y).item()
    vector.append( idx_max + 1 )  # store the 1-based position
print(f'maximum index per graph (batched): {vector}')

# Cross-check against the unbatched graphs: the positions should match
for d in dataset:
    idx_max = torch.argmax(d.y).item()
    print(f'maximum index : {idx_max + 1}' )

# Multiple seeds testing

Because the initial weights are drawn at random, the result of training depends on the seed. To measure this, we generate several seeds, train one model per seed, and compare training loss and validation metrics across seeds to see how much the choice of seed matters.


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def seed_fn(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, and switches cuDNN to deterministic mode.
    NOTE(review): this redefines the ``seed_fn`` from the first code cell.

    Args:
        seed: Integer-like seed (a Python ``int`` or a NumPy integer scalar).
    """
    # Coerce to a plain int: NumPy integer scalars are not ``int`` subclasses
    # and ``random.seed`` rejects unsupported seed types on recent Pythons.
    seed = int(seed)
    torch.manual_seed(seed)                     # Seed CPU
    torch.cuda.manual_seed_all(seed)            # Seed every GPU (ignored without CUDA)
    np.random.seed(seed)                        # Seed numpy
    random.seed(seed)                           # Seed python random
    torch.backends.cudnn.deterministic = True   # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False      # Disable non-deterministic autotuning

def test_seeds(nseeds=10, model_cls=None, epochs=100):
    """Train one model per random seed and collect validation metrics.

    Relies on the notebook-level ``train_files``, ``valid_files``, ``loss_fn``
    and ``device``.

    Args:
        nseeds: Number of random seeds to try.
        model_cls: Class of the network to build; called as
            ``model_cls(hidden_channels=60)``. Defaults to the notebook's
            ``model`` class.
        epochs: Training epochs per seed.

    Returns:
        A list with one dict per seed holding ``seed``, ``val_mse`` and
        ``val_r2`` (both computed on the 1-based argmax positions returned by
        ``validation_fn``) and ``mean_trloss`` (mean of the per-epoch training
        losses).
    """
    if model_cls is None:
        # `model` names the GNN class; if that name has been rebound to an
        # instance in the meantime, recover the class from the instance.
        model_cls = model if isinstance(model, type) else type(model)
    # Draw `nseeds` random integers between 1 and 10^5
    random_nums = np.random.randint(1, int(1e5) + 1, size=nseeds)
    print(random_nums)
    results = []
    for s in random_nums:
        seed = int(s)  # plain int: NumPy scalars are not valid for random.seed
        seed_fn(seed)
        net = model_cls(hidden_channels=60).to(device)
        optimizer = optim.Adam(net.parameters(), lr=0.01)
        # training_loop returns (loss_history, best_loss, best_epoch)
        loss_train, _best_loss, _best_epoch = training_loop(net, train_files, loss_fn, optimizer, epochs=epochs)
        # validation_fn returns (true_idx, pred_idx, val_loss, total_graphs)
        val_true, val_pred, _val_loss, _n_graphs = validation_fn(net, valid_files, loss_fn, device)
        # Performance metrics (sklearn expects y_true first, y_pred second)
        mse = mean_squared_error(val_true, val_pred)
        r2 = r2_score(val_true, val_pred)
        # Append results
        results.append({'seed': seed,
                        'val_mse': round(mse, 4),
                        'val_r2': round(r2, 4),
                        'mean_trloss': round(np.mean(loss_train), 4)
                })
        # Clean up GPU memory
        del net
        del optimizer
        torch.cuda.empty_cache()  # Clear GPU cache
    return results

# Run the seed sweep with 5 seeds; the list of per-seed result dicts is kept in `tmp`.
tmp = test_seeds(nseeds=5)      


In [None]:
def preds_plotter(preds = None, tgts = None, path = None ):
    """Scatter predictions against targets with a y = x reference line.

    Args:
        preds: Sequence of arrays with predictions (concatenated internally).
        tgts: Sequence of arrays with the matching true values.
        path: Optional file path; when given, the figure is also saved there.

    Returns:
        The matplotlib ``Figure`` containing the scatter plot.
    """
    preds = np.concatenate(preds)  # predictions
    tgts = np.concatenate(tgts)    # targets
    # Size both axes from predictions AND targets so that predictions larger
    # than the largest target are not clipped out of view.
    upper = float(max(np.max(preds), np.max(tgts)))

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(preds, tgts, alpha=0.5)
    # Perfect prediction line (y = x)
    ax.plot([0, upper], [0, upper], 'r--', label='Perfect prediction')
    ax.set_xlabel('Predictions')
    ax.set_ylabel('True Values')
    ax.set_title('Predictions vs True Values')
    ax.legend()
    ax.grid(True, alpha=0.3)
    if upper > 0:  # skip degenerate zero-width limits
        ax.set_xlim(0, upper)
        ax.set_ylim(0, upper)
    fig.tight_layout()
    # Honour `path`, which was previously accepted but ignored.
    if path is not None:
        fig.savefig(path, dpi=150, bbox_inches='tight')
    return fig