In [None]:
#----------------------------------------------------------
# Section: Mount-cluster
import os
import subprocess

# Paths
remote = "/mnt/data/sur/users/mrivera"
mount_p = "/home/mriveraceron/fenix_mount"

# Mount only if not already mounted
if os.path.ismount(mount_p):
    print(">> Cluster is already mounted")
else:
    # Mount the remote directory read-only over SSH. Output is captured so
    # that a failure can be reported instead of being silently ignored.
    mount_result = subprocess.run([
        'sshfs',
        '-o', 'ro',   # <-- read-only option
        f'mrivera@fenix.lavis.unam.mx:{remote}',
        mount_p
    ], capture_output=True, text=True)
    # Only claim success when sshfs actually succeeded; every later cell
    # reads from this mount point, so failing early is the useful behaviour.
    if mount_result.returncode != 0:
        raise RuntimeError(
            f">> sshfs failed (exit code {mount_result.returncode}): "
            f"{mount_result.stderr.strip()}"
        )
    print(">> Cluster mounted")

We define a seeding function for reproducibility.

In [None]:
import torch 
import numpy as np 
import random 

# Define function for seeding
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``torch.manual_seed`` and ``torch.cuda.manual_seed``), then sets the
    cuDNN flags to deterministic mode with benchmarking disabled.

    Parameters
    ----------
    seed : int
        Value passed to each seeding call.
    """
    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators (CPU, then GPU)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # cuDNN flags
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all generators once, before the shuffling and model creation further down.
set_seed(seed=54)  # Ensure reproducibility

Define paths.

In [None]:

#---------------------------------------
import pandas as pd
import numpy as np
import torch 
import os
from datetime import datetime

# Section: Generate-paths
# Target-path
exp = "c748247a-8dc2"
# Alternative per-experiment layout, kept for reference:
# A_dir = os.path.join(mount_p, f"Experiments/{exp}/A-mat")
# tgt_dir = os.path.join(mount_p, f"Experiments/{exp}/Replica2/GNN-targets")
# data_path = os.path.join(mount_p, f"Data/{exp}.tsv")

# Build every path from the mount point defined in the Mount-cluster cell,
# so the mount location is configured in one place only.
exp_dir = os.path.join(mount_p, "Train-sims", "4379fd40-9f0a")
A_dir = os.path.join(exp_dir, "A-mat")                    # A_<id>.feather files
tgt_dir = os.path.join(exp_dir, "GNN-targets")            # tgt_<id>.feather files
odes_path = os.path.join(exp_dir, "raw-ODEs")
data_path = os.path.join(exp_dir, "parameters-sims.tsv")

# Generate ID for training (date stamp, e.g. "Y2025M01D31").
timeID = datetime.now().strftime("Y%YM%mD%d")

#  Load-data
data = pd.read_csv(data_path, sep="\t")             # Tab-separated parameter table
data_ids = data['id']                               # Simulation IDs used in the file names

First, we define a loader function that takes the simulation ID, reads the interaction matrix, and converts it into edge weights. Then, it reads the targets, converts them into a tensor, and finally returns a Data object.

In [None]:
# SECTION: Load-function
from torch_geometric.data import Data
import pyarrow.feather as feather

def load_single_data(id, A_dir, tgt_path):
    """Build one ``torch_geometric`` ``Data`` graph for a simulation.

    Parameters
    ----------
    id
        Simulation identifier used in the file names ``A_<id>.feather`` and
        ``tgt_<id>.feather``.
    A_dir
        Directory containing the interaction-matrix feather files.
    tgt_path
        Directory containing the target feather files. The parameter name is
        kept for backward compatibility; the visible callers pass a directory.

    Returns
    -------
    Data
        ``x`` is a column of ones with one row per matrix row, ``edge_index``
        holds the row/column indices of the non-zero matrix entries,
        ``edge_weights`` holds those entries, and ``y`` is the ``K_s`` target
        column as float32.
    """
    # Load adjacency matrix
    A_path = os.path.join(A_dir, f"A_{id}.feather")
    A = pd.read_feather(A_path).to_numpy(dtype=np.float32)
    # Every non-zero entry becomes an edge; its value is the edge weight
    row_idx, col_idx = np.nonzero(A)
    edge_index = torch.from_numpy(np.vstack([row_idx, col_idx]).astype(np.int64))
    edge_weights = torch.from_numpy(A[row_idx, col_idx])
    # Load target features from the directory that was passed in. Previously
    # the argument was ignored and the notebook-global `tgt_dir` was read.
    tgt_file = os.path.join(tgt_path, f"tgt_{id}.feather")
    tgt_table = feather.read_table(tgt_file, columns=['K_s'])
    y_tensor = torch.from_numpy(tgt_table.to_pandas().to_numpy(dtype=np.float32))
    # Node features - simple ones vector
    x_tensor = torch.ones(A.shape[0], 1, dtype=torch.float32)
    # Create Data object
    return Data(
        x=x_tensor,
        edge_weights=edge_weights,
        edge_index=edge_index,
        y=y_tensor
    )

Divide the data into training and validation sets.
Testing: Can loading time of data be improved?

In [None]:
# SECTION: Divide-data
import random

# Shuffle the positions of all samples. Positions are 0-based, so start at 0;
# the previous range(1, ...) silently left the first sample out of both sets.
indices = list(range(len(data_ids)))
random.shuffle(indices)  # Uses Python's random module (already seeded)

# 80/20 split of the shuffled positions
indx = round(len(indices) * .8)
train_indices = indices[:indx]            # First 80% of the shuffled indices
val_indices = indices[indx:]              # Remaining 20% of the shuffled indices

In [None]:
# Section: Declare new function
from concurrent.futures import ThreadPoolExecutor
import time
import tqdm

def generate_data_parallel(idx, A_dir, tgt_dir, num_workers=4):  # idx is a list
    """Load several simulations concurrently with a thread pool.

    Parameters
    ----------
    idx
        Iterable of simulation IDs (a list, a pandas Series, ...).
    A_dir, tgt_dir
        Directories forwarded unchanged to ``load_single_data`` for every ID.
    num_workers : int
        Maximum number of worker threads.

    Returns
    -------
    list
        One ``load_single_data`` result per ID, in the same order as ``idx``.
    """
    def load_one(sample_id):
        # Bind the two directories so only the ID varies per task; this avoids
        # building repeated-argument lists and the need for len(idx).
        return load_single_data(sample_id, A_dir, tgt_dir)

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # executor.map yields results in input order
        return list(executor.map(load_one, idx))

#-----------------------------------------------------------
# Generate data with method 1 (sequential)
x = train_indices[:1000]

start = time.time()
data = [load_single_data(data_ids[sample_idx], A_dir, tgt_dir) for sample_idx in x]
not_par_time = time.time() - start

# Generate data with method 2 (thread pool, in chunks of `batch_size`)
# NOTE(review): method 2 reads the same files method 1 has just read, so
# filesystem caching may favour it - TODO confirm with a cold run.
start = time.time()
batch_size = 100
batching_ids = data_ids[x]
num_batches = (len(batching_ids) + batch_size - 1) // batch_size
for i in range(num_batches):
    # Consecutive, non-overlapping chunks: chunk i starts at i * batch_size.
    # Slicing with [i:i + batch_size] produced windows shifted by one element.
    batch_start = i * batch_size
    batch_ids = batching_ids.iloc[batch_start:batch_start + batch_size]
    data = generate_data_parallel(batch_ids, A_dir, tgt_dir, num_workers=4)
    # Short progress line only: printing every loaded graph floods the output
    # and adds printing cost to the time being measured.
    print(f"Batching {i} completed...")

par_time = time.time() - start
print(f">> The not parallelized time is of: {not_par_time:.2f}, while the parallel time is of: {par_time:.2f}")


Next, we define the GraphConv model along with the optimizer and loss function.

In [None]:

# SECTION: Define-GNN
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
import torch.optim as optim

class simple_gnn_gcn(nn.Module):
    """Two-layer GraphConv network with a sigmoid on the per-node output.

    Parameters
    ----------
    num_node_features : int
        Size of each input node feature vector.
    hidden_channels : int
        Width of the hidden layer.
    num_predictions : int
        Number of output values per node.
    """
    def __init__(self, num_node_features=1, hidden_channels=64,  num_predictions=1):
        super().__init__()
        # Layer sizes follow the constructor arguments; they used to be
        # hard-coded to 1, which made the arguments silently ineffective.
        self.conv1 = GraphConv(num_node_features, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, num_predictions)

    def forward(self, data):
        """Apply both convolutions to ``data`` (reads ``x``, ``edge_index``, ``edge_weights``)."""
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weights
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_weight)
        x = torch.sigmoid(x)  # Outputs between 0-1
        return x  # [num_nodes, num_predictions]
    

# Select the compute device, then build the model, the loss and the optimizer
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = simple_gnn_gcn(num_node_features=1, hidden_channels=72, num_predictions=1)
model = model.to(device)
loss_fn = nn.MSELoss()  # Mean-squared error, used here as the regression loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

We then read the training samples in batches and train the GraphConv model with them.

In [None]:
# Modified code just for testing
from tqdm import tqdm
import time
from torch_geometric.loader import DataLoader

tidx = train_indices[:500]   # Small subset kept for quick tests; not used below
model.train()

# Load data method 2
start = time.time()
batch_size = 500             # Simulations read per parallel loading call
batching_ids = data_ids.iloc[train_indices]
num_batches = (len(batching_ids) + batch_size - 1) // batch_size
data = []                    # Accumulates every loaded graph
for i in range(num_batches):
    # Consecutive, non-overlapping chunks: chunk i starts at i * batch_size.
    # Slicing with [i:i + batch_size] produced windows shifted by one element.
    batch_start = i * batch_size
    batch_ids = batching_ids.iloc[batch_start:batch_start + batch_size]
    data.extend(generate_data_parallel(batch_ids, A_dir, tgt_dir, num_workers=6))
    print(f"Batching {i}/{num_batches} completed...")

# Build ONE loader over everything that was loaded. Rebuilding it inside the
# loop meant only the last chunk was ever trained on. The mini-batch size stays
# at one tenth of a loading chunk, with a floor of 1 so it can never be 0.
loader_batch_size = max(1, round(min(batch_size, len(data)) / 10))
Dloader = DataLoader(data, batch_size=loader_batch_size, shuffle=True)

elpased_batching = time.time() - start
print(f"Time elapsed ({elpased_batching:.2f}) for loading {len(train_indices)} simulations.")

# Empty lists for predictions, targets, loss at each epoch
epochs = 100
x_train, y_train, loss_epochs  = [], [], []
for epoch in tqdm(range(0, epochs), total=epochs, desc="Training model:"):
    total_loss = 0   # Reset per epoch so loss_epochs holds per-epoch values, not a running sum
    for batch in Dloader:
        optimizer.zero_grad()
        batch = batch.to(device)
        out = model(batch)
        loss = loss_fn(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()   # Accumulate loss
        x_train.append(out.cpu().detach().numpy())
        y_train.append(batch.y.cpu().detach().numpy())
    loss_epochs.append(total_loss)

# `start` was taken before loading, so this covers loading plus training
elapsed_time = time.time() - start
print(f"Elapsed time for batching and training with batch: {elapsed_time:.2f}")


In [None]:
from tqdm import tqdm
import time
from torch_geometric.loader import DataLoader

def train_batches(tidx, batch_size = 1000, epochs=500):
    """Train the notebook's ``model`` chunk by chunk over the samples in ``tidx``.

    For each chunk of ``batch_size`` entries of ``tidx`` the graphs are loaded
    with ``load_single_data``, wrapped in a shuffling ``DataLoader`` whose
    mini-batch size is one tenth of the chunk, and the model is trained on
    that chunk for ``epochs`` epochs before the next chunk is loaded.

    Relies on notebook globals: ``model``, ``optimizer``, ``loss_fn``,
    ``device``, ``data_ids``, ``A_dir`` and ``tgt_dir``.

    Parameters
    ----------
    tidx : list
        Indices into ``data_ids`` of the samples to train on.
    batch_size : int
        Number of samples loaded from disk per chunk.
    epochs : int
        Number of passes over each chunk.

    Returns
    -------
    tuple of (list, list, list)
        ``x_train`` and ``y_train`` hold the predictions and targets of every
        mini-batch in the final epoch of each chunk, as NumPy arrays.
        ``loss_epochs`` holds the summed mini-batch loss of every epoch of
        every chunk.
    """
    num_batches = (len(tidx) + batch_size - 1) // batch_size
    start = time.time()
    model.train()
    # Empty lists for predictions, targets, loss at each epoch
    x_train, y_train, loss_epochs  = [], [], []
    last_epoch = epochs - 1
    for i in tqdm(range(0, len(tidx), batch_size), total=num_batches, desc="Loading batches"):
        chunk = [load_single_data(data_ids[pos], A_dir, tgt_dir) for pos in tidx[i:i + batch_size]]
        # One tenth of the chunk per mini-batch, with a floor of 1: round() gives
        # 0 for a short final chunk and DataLoader rejects a batch size of 0.
        loader = DataLoader(chunk, batch_size=max(1, round(len(chunk) / 10)), shuffle=True)
        for epoch in range(epochs):
            total_loss = 0
            for batch in loader:
                optimizer.zero_grad()
                batch = batch.to(device)
                out = model(batch)
                loss = loss_fn(out, batch.y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()   # Accumulate loss
                # Keep predictions and targets only for the final epoch of the chunk
                if epoch == last_epoch:
                    x_train.append(out.cpu().detach().numpy())
                    y_train.append(batch.y.cpu().detach().numpy())
            loss_epochs.append(total_loss)
            if epoch % 200 == 0:
                print(f"Epoch {epoch}: Loss = {total_loss:.4f}")
    elapsed_time = time.time() - start
    print(f"Elapsed time for batching and training with batch: {elapsed_time:.2f}")
    # Hand the collected results back; previously they were dropped on return
    return x_train, y_train, loss_epochs