This notebook trains a PyTorch Geometric graph neural network, running on CUDA when a GPU is available.

In [None]:
# SECTION: Define-GNN

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
import torch.optim as optim
import numpy as np
import random

class simple_gnn_gcn(nn.Module):
    """Two stacked ``GraphConv`` layers followed by a sigmoid.

    The network maps one input feature per node to ``hidden_channels``
    features and then back to a single output channel per node.

    Args:
        hidden_channels: Number of features produced by the first layer.
    """

    def __init__(self,  hidden_channels=64):
        super().__init__()
        # 1 input feature -> hidden_channels -> 1 output channel
        self.conv1 = GraphConv(in_channels=1, out_channels=hidden_channels)
        self.conv2 = GraphConv(in_channels=hidden_channels, out_channels=1)

    def forward(self, data):
        """Run both convolutions on ``data`` and squash the result to (0, 1).

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``edge_weights``.

        Returns:
            Sigmoid-activated output of ``conv2``. Its last dimension has size
            1 because ``conv2`` has one output channel (presumably
            ``[num_nodes, 1]`` - confirm against the data).
        """
        node_features = data.x
        edges = data.edge_index
        weights = data.edge_weights
        hidden = F.relu(self.conv1(node_features, edges, weights))
        scores = self.conv2(hidden, edges, weights)
        return torch.sigmoid(scores)

def seed_fn(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer-like seed. NumPy integer scalars (for example values
            drawn with ``np.random.randint``) are accepted and converted to a
            built-in ``int``.
    """
    # random.seed() only accepts None/int/float/str/bytes/bytearray on
    # Python >= 3.11, so a NumPy integer would raise TypeError there.
    seed = int(seed)
    random.seed(seed)                           # Python RNG
    np.random.seed(seed)                        # NumPy RNG
    torch.manual_seed(seed)                     # PyTorch CPU RNG
    torch.cuda.manual_seed_all(seed)            # every GPU, not only the current one
    torch.backends.cudnn.deterministic = True   # deterministic cuDNN kernels
    torch.backends.cudnn.benchmark = False      # no autotuned (run-dependent) kernels



# Training mode

To train the model we use the following function.

In [None]:
import os
import glob
from tqdm import tqdm
from torch_geometric.loader import DataLoader
import time
import pandas as pd
import numpy as np
import random
import torch 

# Declare data paths
directory = '/home/mriveraceron/data/exp_20251125'
# glob returns matches in a file-system dependent order; sort them so that the
# order in which files are visited (and therefore a seeded run) is reproducible.
train_files = sorted(glob.glob(f'{directory}/TrainBatch_*.pt'))


# Testing lines: load the second training file and wrap it in a loader.
# NOTE: this needs at least two TrainBatch_*.pt files in `directory`.
path = train_files[1]
data = torch.load(path, weights_only=False)
loader = DataLoader(data, batch_size=32, shuffle=False)

def training_loop(model, dat_batched, loss_fn, optimizer, epochs=100):
    """Train ``model`` for ``epochs`` passes over the files in ``dat_batched``.

    On every epoch each file is loaded with ``torch.load`` and served through
    a shuffling ``DataLoader`` with ``batch_size=50``; one optimiser step is
    taken per batch. Progress is printed once per epoch.

    Args:
        model: Module to optimise. Batches are moved to the device its
            parameters live on (CPU if it has no parameters).
        dat_batched: Re-iterable collection of paths to ``.pt`` files whose
            content can be passed to ``DataLoader``.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        optimizer: Optimiser that updates ``model``'s parameters.
        epochs: Number of passes over all files.

    Returns:
        tuple: ``(x_train, y_train, loss_epochs)``. ``x_train`` and
        ``y_train`` are NumPy arrays holding the predictions and targets seen
        during the final epoch (empty arrays if no batch was processed);
        ``loss_epochs`` is the list of summed batch losses, one per epoch.
    """
    model.train()
    # Follow the model's own device instead of relying on a module-level
    # `device` variable being defined before this function is called.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # Predictions / targets of the last epoch, and the loss of every epoch
    x_train, y_train, loss_epochs = [], [], []
    total_elapsed = 0
    for epoch in range(1, epochs + 1):
        start = time.time()
        total_loss = 0
        for path in dat_batched:
            data = torch.load(path, weights_only=False)
            loader = DataLoader(data, batch_size=50, shuffle=True)
            for batch in loader:
                batch = batch.to(run_device)
                optimizer.zero_grad()
                out = model(batch)
                target = batch.y
                # A [N, 1] prediction against a [N] target would broadcast to
                # [N, N] inside the loss and silently give a wrong value.
                # Align the shapes when both hold the same number of elements.
                if target.shape != out.shape and target.numel() == out.numel():
                    target = target.reshape(out.shape)
                loss = loss_fn(out, target)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()   # Accumulate loss
                if epoch == epochs:
                    # Only the final epoch's outputs are returned to the caller
                    x_train.append(out.detach().cpu().numpy())
                    y_train.append(batch.y.detach().cpu().numpy())
        loss_epochs.append(total_loss)
        elapsed = time.time() - start
        total_elapsed += elapsed
        print(f"Epoch {epoch}: Loss = {total_loss:.4f},  Elapsed time: {elapsed:.2f}")
    print(f'>> the total elapsed time with {epochs} epochs is {total_elapsed:.2f} seconds ( {total_elapsed/60:.2f} minutes)')
    # Concatenate all predictions; np.concatenate raises on an empty list, so
    # fall back to empty arrays when there was nothing to train on.
    x_train = np.concatenate(x_train, axis=0) if x_train else np.empty(0)
    y_train = np.concatenate(y_train, axis=0) if y_train else np.empty(0)
    return x_train, y_train, loss_epochs


# Train the network. Seeding happens first so the weight initialisation below
# is reproducible.
seed_fn(38)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = simple_gnn_gcn(hidden_channels=60)
model = model.to(device)
loss_fn = nn.MSELoss()  # mean squared error, i.e. a regression loss
optimizer = optim.Adam(model.parameters(), lr=0.01)
xtr, ytr, loss_train = training_loop(
    model,
    train_files,
    loss_fn,
    optimizer,
    epochs=1000,
)




## Plot loss over epochs

# Validation mode

## Maximum keystoneness for each community vs prediction

This validation process evaluates the performance of a previously trained model by measuring its ability to predict maximum y values across different communities. The validation metric is expressed as a percentage accuracy.

In [None]:
# Get validation files (sorted: glob's own ordering depends on the file system)
valid_files = sorted(glob.glob(f'{directory}/ValBatch_*.pt'))

def validation_fn(model, files, loss_fn, device, verbose=True):
    """Evaluate ``model`` and collect the per-graph maximum of target and prediction.

    Every file in ``files`` is loaded with ``torch.load`` and batched with a
    non-shuffling ``DataLoader`` (``batch_size=50``). For each graph the
    largest target value and the largest predicted value are recorded, both
    rounded to 5 decimals, so the two returned lists line up one-to-one.

    Args:
        model: Trained module; it is switched to evaluation mode.
        files: Iterable of paths to ``.pt`` files holding lists of graphs.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        device: Device the batches are moved to before the forward pass.
        verbose: Print per-batch diagnostics when true (the default).

    Returns:
        tuple: ``(true_max, predicted_max, total_loss)`` - two lists with one
        entry per graph and the sum of all batch losses.
    """
    model.eval()  # Set to evaluation mode
    total_loss = 0
    true_max, predicted_max = [], []
    with torch.no_grad():  # Disable gradient computation
        for path in files:
            data_list = torch.load(path, weights_only=False)
            # True max per graph
            true_max.extend(round(graph.y.max().item(), 5) for graph in data_list)
            loader = DataLoader(data_list, batch_size=50, shuffle=False)
            for batch in loader:
                data = batch.to(device)
                out = model(data)  # Forward pass only
                target = data.y
                # Prevent silent [N, 1] vs [N] broadcasting inside the loss
                if target.shape != out.shape and target.numel() == out.numel():
                    target = target.reshape(out.shape)
                total_loss += loss_fn(out, target).item()
                # Nodes of graph i occupy rows ptr[i]:ptr[i + 1] of the batch,
                # so reduce over those rows to get one maximum per graph
                # (a max over dim=1 would yield one value per node instead).
                ptr = data.ptr.tolist()
                max_values = torch.stack(
                    [out[lo:hi].max() for lo, hi in zip(ptr[:-1], ptr[1:])]
                )
                predicted_max.extend(round(v, 5) for v in max_values.cpu().tolist())
                if verbose:
                    print(f"Graphs per batch: {batch.num_graphs}")
                    print(f"Total nodes in batch: {batch.num_nodes}")   # x are the nodes
                    print(f"x (nodes) shape: {batch.x.shape}")
                    print(f"y shape: {batch.y.shape}")
                    print(f"batch vector: {batch.batch}")
                    print(f'Shape of max outs {max_values.shape}')
                    print("---")
    return true_max, predicted_max, total_loss

# Reminder of how zip pairs two lists element by element:
#   a = [0.5, 0.80, 1.20]
#   b = [0, 0.60, 0.80]
#   x = [a - b for a, b in zip(a, b)]

# Run the validation pass, then take the element-wise gap between the true
# and the predicted maxima.
true_max, predicted_max, total_loss = validation_fn(
    model, valid_files, loss_fn, device
)
diff = [truth - guess for truth, guess in zip(true_max, predicted_max)]
 
    
    


# Some other code below

In [None]:
# Sorted so the order of the collected values does not depend on the order in
# which the file system happens to list the files.
valid_files = sorted(glob.glob(f'{directory}/ValBatch_*.pt'))

# Get maximum y values (keystone) for each community, grouped per file
max_y_values = []
for path in valid_files:
    # Load data from file
    data_list = torch.load(path, weights_only=False)
    # Max y value of every community in this file, rounded to 5 decimals
    max_y_values.append([round(data.y.max().item(), 5) for data in data_list])
# Flatten the per-file lists into one list
max_y_flatten = [item for sublist in max_y_values for item in sublist]



In [None]:
# Validation files, sorted because glob's ordering is file-system dependent
valid_files = sorted(glob.glob(f'{directory}/ValBatch_*.pt'))

def validation_fn(model, files, loss_fn, device, verbose=True):
    """Evaluate ``model`` on the graphs stored in ``files``.

    Every file is loaded with ``torch.load`` and batched with a non-shuffling
    ``DataLoader`` (``batch_size=50``). Raw predictions and targets are
    collected together with the per-graph maximum of each.

    Args:
        model: Trained module; it is switched to evaluation mode.
        files: Iterable of paths to ``.pt`` files holding lists of graphs.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        device: Device the batches are moved to before the forward pass.
        verbose: Print per-batch diagnostics when true (the default).

    Returns:
        tuple: ``(true_max, preds_max, predicted_values, true_values,
        total_loss)``. The first four are NumPy arrays (empty when no data
        was processed): largest target per graph (rounded to 5 decimals),
        largest prediction per graph, all predictions and all targets.
        ``total_loss`` is the sum of the batch losses.
    """
    def _concat(chunks):
        # np.concatenate raises on an empty list; return an empty array instead
        return np.concatenate(chunks, axis=0) if chunks else np.empty(0)

    model.eval()  # Set to evaluation mode
    total_loss = 0
    predicted_values, true_values, true_max, preds_max = [], [], [], []
    with torch.no_grad():  # Disable gradient computation
        for path in files:
            data_list = torch.load(path, weights_only=False)
            # Largest target of every graph in this file
            true_max.append(
                np.array([round(graph.y.max().item(), 5) for graph in data_list])
            )
            loader = DataLoader(data_list, batch_size=50, shuffle=False)
            for batch in loader:
                batch = batch.to(device)
                out = model(batch)  # Forward pass only
                target = batch.y
                # Prevent silent [N, 1] vs [N] broadcasting inside the loss
                if target.shape != out.shape and target.numel() == out.numel():
                    target = target.reshape(out.shape)
                total_loss += loss_fn(out, target).item()
                # Nodes of graph i occupy rows ptr[i]:ptr[i + 1] of the batch;
                # reduce over them to obtain one maximum per graph.
                ptr = batch.ptr.tolist()
                batch_max = torch.stack(
                    [out[lo:hi].max() for lo, hi in zip(ptr[:-1], ptr[1:])]
                )
                preds_max.append(batch_max.cpu().numpy())
                predicted_values.append(out.cpu().numpy())    # Predictions
                true_values.append(batch.y.cpu().numpy())     # Targets
                if verbose:
                    print(f"Batch size: {batch.num_graphs}")  # For PyG Data objects
                    print(f"Output shape: {out.shape}")
                    print(f"Output values:\n{out}")
                    print(f"Max preds shape: {batch_max.shape}")
                    print(f"Max preds values: {batch_max}")
    # Concatenate everything that was collected
    true_max = _concat(true_max)
    preds_max = _concat(preds_max)
    predicted_values = _concat(predicted_values)
    true_values = _concat(true_values)
    return true_max, preds_max, predicted_values, true_values, total_loss

# validation_fn returns five values (per-graph true maxima, per-graph predicted
# maxima, predictions, targets, loss); x_val / y_val are predictions / targets.
val_true_max, val_preds_max, x_val, y_val, loss_val = validation_fn(model, valid_files, loss_fn, device)

# Testing lines: walk through every validation file and compare the true
# per-graph maxima with the predicted ones.
# The accumulators live outside the loop so they cover all files instead of
# being reset for each one.
total_loss = 0
predicted_values, true_values, true_max, preds_max = [], [], [], []
for path in valid_files:
    data_list = torch.load(path, weights_only=False)
    # Get max y value for each community
    values = [round(data.y.max().item(), 4) for data in data_list]
    true_max.extend(values)
    print(f'The true max values are: {len(values)}')
    # Batch the graphs that were just loaded
    loader = DataLoader(data_list, batch_size=50, shuffle=False)
    for batch in loader:
        batch = batch.to(device)
        # Forward pass only
        out = model(batch)
        loss = loss_fn(out, batch.y)
        total_loss += loss.item()
        # Max over the feature dimension (one value per node)
        tmp = out.max(dim=1)[0]
        # Nodes of graph i occupy rows ptr[i]:ptr[i + 1] of the batch
        ptr = batch.ptr.tolist()
        for start_idx, end_idx in zip(ptr[:-1], ptr[1:]):
            # Predictions of this graph and their maximum
            graph_preds = out[start_idx:end_idx]
            max_pred = round(graph_preds.max().item(), 4)
            preds_max.append(max_pred)
        print(f'The number of graphs in the batch is: {batch.num_graphs}')
        print(f"Output shape: {out.shape}")
        print(f"tmp shape: {tmp.shape}")


# Scratch loop: write the integers 0..49, one per line.
for i in range(0, 50):
    print(f"{i}")

# Scratch version of the validation pass over the most recently loaded file.
# Uses `data_list` (the validation graphs) and the current `batch`; feeding the
# training `data` list to the model would fail.
loader = DataLoader(data_list, batch_size=50, shuffle=False)
for batch in loader:
    batch = batch.to(device)
    # Forward pass only
    out = model(batch)
    loss = loss_fn(out, batch.y)
    total_loss += loss.item()
    # Max over the feature dimension (one value per node)
    tmp = out.max(dim=1)[0]
    preds_max.append(tmp.cpu().detach().numpy())
    predicted_values.append(out.cpu().detach().numpy())    # Predictions
    true_values.append(batch.y.cpu().detach().numpy())     # Targets
    print(f"Batch size: {batch.num_graphs}")  # For PyG Data objects
    print(f"Output shape: {out.shape}")
    print(f"Output values:\n{out}")
    # preds_max is a Python list and has no .shape; report the tensor's shape
    print(f"Max preds shape: {tmp.shape}")
    print(f"Max preds values: {preds_max}")

# Understanding the PyTorch Geometric DataLoader


The following code demonstrates how to get the maximum value of the y tensor for each individual sample (graph) when using DataLoader for batching in PyTorch Geometric.

## Problem
When graphs are batched together using DataLoader, all node features and labels are concatenated. The `batch` vector indicates which nodes belong to which graph, but we need to extract the maximum y value for each original graph separately.

In [None]:
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader
import torch
from torch_geometric.nn import global_max_pool

# Toy data: five graphs with ten nodes each. Nine labels are 1 and the last
# one is (i + 1) * 10 ** (i + 1), so every graph has a distinct maximum.
dataset = []
for i in range(5):
    data = Data(
        x=torch.randn(10, 1),
        y=torch.tensor([1] * 9 + [(i + 1) * 10 ** (i + 1)]),
    )
    dataset.append(data)

# Serve the graphs two at a time, in random order
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# What a batch looks like when all five graphs are merged into one
batch = Batch.from_data_list(dataset)
print(f"batch.batch: {batch.batch}")            # graph index of every node
print(f"batch.y: {batch.y}")                    # labels of all graphs, concatenated
print(f"Total nodes: {batch.num_nodes}")        # node count of the batch
print(f"Number of graphs: {batch.num_graphs}")  # graph count of the batch

# Walk over the loader and collect the largest label of every graph
vector = []
for batch in loader:
    print(f"Graphs per batch: {batch.num_graphs}")
    print(f"Total nodes in batch: {batch.num_nodes}")   # x are the nodes
    print(f"x (nodes) shape: {batch.x.shape}")
    print(f"y shape: {batch.y.shape}")
    print(f"batch vector: {batch.batch}")
    # global_max_pool is given y as a column and returns one row per graph;
    # the single column is then dropped again.
    max_y_per_graph = global_max_pool(batch.y[:, None], batch.batch)[:, 0]
    vector += max_y_per_graph.tolist()
    print(f"Max y per graph: {max_y_per_graph}")
    print("---")

sorted(vector)

# unsqueeze(-1) appends a trailing dimension of size 1: [4] becomes [4, 1]
x = torch.arange(1, 5)
x.shape
x.unsqueeze(-1).shape

# Multiple seeds testing

Because the initial weight matrices are drawn at random, the outcome of training depends on the seed. We therefore generate several seeds, train a model with each one, and compare how the loss improves in order to choose the best seed.


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def seed_fn(seed=42):
    """Make a run reproducible by seeding all random number generators.

    Covers Python's ``random`` module, NumPy and PyTorch (CPU plus every CUDA
    device) and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer-like seed; NumPy integer scalars are converted to a
            built-in ``int`` first.
    """
    # On Python >= 3.11 random.seed() raises TypeError for NumPy integers,
    # which is exactly what np.random.randint hands out.
    seed = int(seed)
    random.seed(seed)                           # Seed python random
    np.random.seed(seed)                        # Seed numpy
    torch.manual_seed(seed)                     # Seed CPU
    torch.cuda.manual_seed_all(seed)            # Seed all GPUs
    torch.backends.cudnn.deterministic = True   # Ensure deterministic behavior
    torch.backends.cudnn.benchmark = False      # Disable run-dependent autotuning

def test_seeds(nseeds=10):
    # Generate 10 random integers between 1 and 10^10
    random_nums = np.random.randint(1, int(1e5) + 1, size=nseeds)
    print(random_nums)
    results = []
    for s in random_nums:
        seed_fn(s)
        model = simple_gnn_gcn(hidden_channels=60).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.01) 
        x_train, y_train, loss_train = training_loop(model, train_files, loss_fn, optimizer, epochs=100)
        x_val, y_val, loss_val = validation_fn(model, valid_files, loss_fn, device)
        # Performance metrics
        mse = mean_squared_error(y_val, x_val)
        r2 = r2_score(y_val, x_val)
        # Append results
        results.append({'seed': s, 
                        'val_mse': round(mse, 4),
                        'val_r2': round(r2, 4),
                        'mean_trloss': round(np.mean(loss_train), 4)
                })
        # Clean up GPU memory
        del model
        del optimizer
        torch.cuda.empty_cache()  # Clear GPU cache
    return results

# Try five random seeds; `tmp` receives the list returned by test_seeds.
tmp = test_seeds(nseeds=5)


## Plotting


In [None]:
# Section: Plotting
import matplotlib.pyplot as plt
import numpy as np

def loss_plotter(loss_epochs = None, epochs = None):
    """Draw the training loss per epoch as a line plot.

    Args:
        loss_epochs: Sequence with one loss value per epoch.
        epochs: Number of points on the x-axis (0 .. epochs - 1). Defaults to
            ``len(loss_epochs)``.

    Returns:
        The matplotlib figure that holds the plot.

    Raises:
        ValueError: If ``loss_epochs`` is ``None`` or empty.
    """
    if loss_epochs is None or len(loss_epochs) == 0:
        raise ValueError("loss_epochs must contain at least one loss value")
    y = np.round(np.asarray(loss_epochs, dtype=float), 3)
    if epochs is None:
        epochs = len(y)
    x = np.arange(epochs)
    # Line plot of the loss against the epoch index
    fig = plt.figure(figsize=(8, 8))
    plt.plot(x, y, alpha=0.5)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss over epochs')
    plt.grid(True, alpha=0.3)
    # Fit the axes to the data; identical lower and upper limits are skipped
    # because matplotlib cannot use a zero-width range.
    y_low, y_high = y.min(), y.max()
    if y_high > y_low:
        plt.ylim(y_low, y_high)
    if epochs > 1:
        plt.xlim(0, epochs - 1)
    plt.tight_layout()
    return fig

# The per-epoch losses returned by training_loop are stored in `loss_train`;
# there is one value per epoch, so its length is the number of epochs.
fig = loss_plotter(loss_train, len(loss_train))
fig.savefig('/mnt/data/sur/users/mrivera/Plots/4379fd40-9f0a-loss.png', dpi=150, bbox_inches='tight')

In [None]:
def preds_plotter(preds = None, tgts = None, path = None ):
    """Scatter predictions against true values with a ``y = x`` reference line.

    Args:
        preds: Predictions, either one array or a sequence of arrays (for
            example one per batch). Flattened to one dimension.
        tgts: True values in the same layout as ``preds``.
        path: Not used by this function; kept so existing calls keep working.

    Returns:
        The matplotlib figure that holds the plot.

    Raises:
        ValueError: If ``preds`` or ``tgts`` is ``None`` or empty.
    """
    def _flatten(values):
        # A single array is flattened directly; a sequence of arrays (or of
        # scalars) is flattened piece by piece and joined.
        if isinstance(values, np.ndarray):
            return values.ravel()
        pieces = [np.ravel(v) for v in values]
        return np.concatenate(pieces) if pieces else np.empty(0)

    if preds is None or tgts is None:
        raise ValueError("preds and tgts are both required")
    preds = _flatten(preds)  # predictions
    tgts = _flatten(tgts)    # targets
    if preds.size == 0 or tgts.size == 0:
        raise ValueError("preds and tgts must not be empty")

    upper = float(np.max(tgts))
    # Create scatter plot
    fig = plt.figure(figsize=(8, 8))
    plt.scatter(preds, tgts, alpha=0.5)
    # Add perfect prediction line (y=x)
    plt.plot([0, upper], [0, upper], 'r--', label='Perfect prediction')
    plt.xlabel('Predictions')
    plt.ylabel('True Values')
    plt.title('Predictions vs True Values')
    plt.legend()
    plt.grid(True, alpha=0.3)
    # Both axes span 0 .. max(target); skipped when that range is empty
    if upper > 0:
        plt.ylim(0, upper)
        plt.xlim(0, upper)
    plt.tight_layout()
    return fig