In [1]:
import math
import numpy as np
import wandb
import random

import torch
import torch_geometric
from torch_geometric.data import Data

import sys
import os
from tqdm import tqdm

# Add the 'scripts' directory to the Python path
scripts_path = os.path.abspath(os.path.join('..'))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
    
import joblib

# Now you can import the gnn_io module
import gnn_io as gio

import gnn_architectures as garch

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
import os 
# 
# Create base directory for the run
base_dir = '../../data/runs_1234_bliblablu/'
unique_run_dir = os.path.join(base_dir, "this_is_it")
os.makedirs(unique_run_dir, exist_ok=True)

# Define the paths here
def get_paths(base_dir, unique_model_description):
    data_path = os.path.join(base_dir, unique_model_description)
    os.makedirs(data_path, exist_ok=True)
    model_save_path = os.path.join(data_path, 'trained_model/model.pth')
    path_to_save_dataloader = os.path.join(data_path, 'data_created_during_training/')
    config_save_path = os.path.join(data_path, 'trained_models/config.json')
    os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
    os.makedirs(path_to_save_dataloader, exist_ok=True)
    data_dict_list = torch.load('../../data/train_data/dataset_1pm_0-4400.pt')
    return data_dict_list, model_save_path, config_save_path, path_to_save_dataloader

def create_dataloaders_and_save_test_set(dataset_normalized, batch_size, path_to_save_dataloader):
    train_dl, valid_dl, test_dl = gio.create_dataloaders(batch_size=batch_size, dataset=dataset_normalized, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)
    gio.save_dataloader(test_dl, path_to_save_dataloader + 'test_dl.pt')
    gio.save_dataloader_params(test_dl, path_to_save_dataloader + 'test_loader_params.json')
    return train_dl, valid_dl

def prepare_data(data_dict_list, indices_of_datasets_to_use, path_to_save_dataloader):
    datalist = [Data(x=d['x'], edge_index=d['edge_index'], pos=d['pos'], y=d['y']) for d in data_dict_list]
    dataset_only_relevant_dimensions = gio.cut_dimensions(dataset=datalist, indices_of_dimensions_to_keep=indices_of_datasets_to_use)
    dataset_normalized = gio.normalize_dataset(dataset_only_relevant_dimensions, y_scalar=None, x_scalar_list=None, pos_scalar=None, directory_path=path_to_save_dataloader)
    return dataset_normalized

data_dict_list, model_save_path, config_save_path, path_to_save_dataloader = get_paths(base_dir, "this_is_it")
dataset_normalized = prepare_data(data_dict_list, [0,1,3,4], path_to_save_dataloader)

# train_dl, valid_dl = create_dataloaders_and_save_test_set(dataset_normalized, 16, path_to_save_dataloader)

# config = setup_wandb(params['project_name'], {
#     "epochs": params['num_epochs'],
#     "batch_size": params['batch_size'],
#     "lr": params['lr'],
#     "gradient_accumulation_steps": params['gradient_accumulation_steps'],
#     "early_stopping_patience": params['early_stopping_patience'],
#     "hidden_layers_base_for_point_net_conv": params['hidden_layers_base_for_point_net_conv'],
#     "hidden_layer_structure": params['hidden_layer_structure'],
#     "indices_to_use": params['indices_of_datasets_to_use'],
#     "dataset_length": len(dataset_normalized), 
#     "in_channels": params['in_channels'],
#     "out_channels": params['out_channels'],
# })

# gnn_instance = garch.MyGnn(in_channels=6, out_channels=1, hidden_layers_base_for_point_net_conv=64, hidden_layer_structure=[64,128])
# model = gnn_instance.to(device)
loss_fct = torch.nn.MSELoss()


Scaler created for x values: StandardScaler()
Scaler created for x values: StandardScaler()
Scaler created for x values: StandardScaler()
Scaler created for x values: StandardScaler()
Scaler created for pos features: StandardScaler()
Scaler created for y values: StandardScaler()


In [5]:
baseline_loss_mean_target = gio.compute_baseline_of_mean_target(dataset=dataset_normalized, loss_fct=loss_fct)
baseline_loss = gio.compute_baseline_of_no_policies(dataset=dataset_normalized, loss_fct=loss_fct)
print("baseline loss " + str(baseline_loss_mean_target) )
print("baeline loss no policies " + str(baseline_loss) )

mean_y_normalized: 
5.1006186e-09
median_y_normalized: 
-0.06859979
Mean y normalized tensor: 
torch.Size([133854208, 1])
tensor([[ 0.3793],
        [-0.6240],
        [ 1.3825],
        [ 0.8092],
        [ 0.8450],
        [-0.7673],
        [-0.7673],
        [-0.0686],
        [-0.1223],
        [-1.8601]])
Target tensor: 
torch.Size([133854208, 1])
tensor([[5.1006e-09],
        [5.1006e-09],
        [5.1006e-09],
        [5.1006e-09],
        [5.1006e-09],
        [5.1006e-09],
        [5.1006e-09],
        [5.1006e-09],
        [5.1006e-09],
        [5.1006e-09]])
no policies 
torch.Size([133854208, 1])
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
torch.Size([133854208, 1])
tensor([[ 0.3793],
        [-0.6240],
        [ 1.3825],
        [ 0.8092],
        [ 0.8450],
        [-0.7673],
        [-0.7673],
        [-0.0686],
        [-0.1223],
        [-1.8601]])
baseline loss 1.0
baelin

In [7]:
actual_difference_vol_car = np.concatenate([data.y for data in dataset_normalized])

target_tensor = 1.5 * np.ones(actual_difference_vol_car.shape) # presume no difference in vol car due to policy

target_tensor = torch.tensor(target_tensor, dtype=torch.float32)
actual_difference_vol_car = torch.tensor(actual_difference_vol_car, dtype=torch.float32)

print('no policies ')
print(target_tensor.shape)
print(target_tensor[:10])
print(actual_difference_vol_car.shape)
print(actual_difference_vol_car[:10])

# Compute the loss
loss = loss_fct(actual_difference_vol_car, target_tensor)
print(loss)

no policies 
torch.Size([133854208, 1])
tensor([[1.5000],
        [1.5000],
        [1.5000],
        [1.5000],
        [1.5000],
        [1.5000],
        [1.5000],
        [1.5000],
        [1.5000],
        [1.5000]])
torch.Size([133854208, 1])
tensor([[ 0.3793],
        [-0.6240],
        [ 1.3825],
        [ 0.8092],
        [ 0.8450],
        [-0.7673],
        [-0.7673],
        [-0.0686],
        [-0.1223],
        [-1.8601]])
tensor(3.2500)


: 

## 1. Define model and parameters

In [4]:
gio.compute_baseline_of_mean_target(dataset=dataset_normalized, )

TypeError: compute_baseline_of_mean_target() missing 1 required positional argument: 'loss_fct'

In [None]:
def define_hidden_layer_structure(list_of_halfs_and_duplicates: list, hidden_layer_size: int):
    """
    Generates a list of hidden layer sizes based on an initial size and a list of instructions.

    Parameters:
    list_of_halfs_and_duplicates (list): List of instructions where 1 means double the size,
                                         0 means the same size, and -1 means half the size.
    hidden_layer_size (int): The initial size of the hidden layer.

    Returns:
    list: A list of integers representing the sizes of the hidden layers.
    """
    if not all(isinstance(i, int) and i in [-1, 0, 1] for i in list_of_halfs_and_duplicates):
        raise ValueError("list_of_halfs_and_duplicates must contain only -1, 0, or 1.")
    if hidden_layer_size <= 0:
        raise ValueError("hidden_layer_size must be a positive integer.")
    
    result_list = [hidden_layer_size]
    for i in list_of_halfs_and_duplicates:
        if i == 1:
            result_list.append(int(result_list[-1] * 2))
        elif i == 0:
            result_list.append(result_list[-1])
        elif i == -1:
            result_list.append(int(result_list[-1] / 2))
    result_list.append(hidden_layer_size)
    return result_list
        
hidden_layer_structure = define_hidden_layer_structure([1, -1, 0, 1, 0], 16)

def define_layers(hidden_layer_structure: list[int], gat_and_conv_structure: list[int]) -> list:
    """
    Generates a list of GNN layers and ReLU activations based on the provided hidden layer structure.

    Parameters:
    hidden_layer_structure (list[int]): A list of integers representing the sizes of the hidden layers.
    gat_and_conv_structure (list[int]): A list specifying the type of GNN layer to use. 
        Use 1 for 'GATConv' and -1 for 'GCNConv'.
        Note that the size of hidden_layer_structure must be the size of gat_and_conv_structure + 1.

    Returns:
    list: A list of tuples and ReLU activations, where each tuple contains a GNN layer and a string describing the data flow.
    
    Raises:
    ValueError: If an invalid layer type is specified or if the input lengths are incompatible.
    """
    if len(hidden_layer_structure) != len(gat_and_conv_structure) + 1:
        raise ValueError("The size of hidden_layer_structure must be the size of gat_and_conv_structure + 1.")

    # Mapping layer types to their corresponding classes
    layer_types = {
        1: torch_geometric.nn.GATConv,
        -1: torch_geometric.nn.GCNConv
    }

    layers = []
    for idx in range(len(hidden_layer_structure) - 1):
        layer_type = gat_and_conv_structure[idx]
        if layer_type in layer_types:
            layer_class = layer_types[layer_type]
            layers.append((layer_class(hidden_layer_structure[idx], hidden_layer_structure[idx + 1]), 'x, edge_index -> x'))
        else:
            raise ValueError("Invalid layer_type. Choose 1 for 'GATConv' or -1 for 'GCNConv'.")
        layers.append(torch.nn.ReLU(inplace=True))
    
    return layers

layers = define_layers(hidden_layer_structure=hidden_layer_structure, gat_and_conv_structure=[1, -1, 1, 1, 1, 1])

In [None]:
hidden_layer_structure

[16, 32, 16, 16, 32, 32, 16]

In [None]:
layers

[(GATConv(16, 32, heads=1), 'x, edge_index -> x'),
 ReLU(inplace=True),
 (GCNConv(32, 16), 'x, edge_index -> x'),
 ReLU(inplace=True),
 (GATConv(16, 16, heads=1), 'x, edge_index -> x'),
 ReLU(inplace=True),
 (GATConv(16, 32, heads=1), 'x, edge_index -> x'),
 ReLU(inplace=True),
 (GATConv(32, 32, heads=1), 'x, edge_index -> x'),
 ReLU(inplace=True),
 (GATConv(32, 16, heads=1), 'x, edge_index -> x'),
 ReLU(inplace=True)]

In [None]:
# Define parameters 
num_epochs = 1000
project_name = "try_overfitting_3"
path_to_save_dataloader = "../../data/data_created_during_training_needed_for_testing/"
indices_of_datasets_to_use = [0, 1, 3, 4]

loss_fct = torch.nn.MSELoss()
batch_size = 4
output_layer_parameter = 'gat'
hidden_size_parameter = 64
gat_layer_parameter = 5
gcn_layer_parameter = 0
lr = 0.001
in_channels = len(indices_of_datasets_to_use) + 2 # dimensions of the x vector + 2 (pos)
out_channels = 1 # we are predicting one value
early_stopping_patience = 10

data_dict_list = torch.load('../../data/train_data/dataset_1pm_0-3500_new.pt')

## 2. Load data

In [None]:
# Reconstruct the Data objects
datalist = [Data(x=d['x'], edge_index=d['edge_index'], pos=d['pos'], y=d['y']) for d in data_dict_list]

In [None]:
datalist_new=datalist[0:3000]

# for data in datalist[3200: len(datalist)]:
#     datalist_new.append(data)
    
dataset_length = len(datalist_new)

In [None]:
dataset_length

3000

In [None]:
dataset_only_relevant_dimensions = gio.cut_dimensions(dataset=datalist_new, indices_of_dimensions_to_keep=indices_of_datasets_to_use)
dataset_normalized = gio.normalize_dataset(dataset=dataset_only_relevant_dimensions, y_scalar=None, x_scalar_list=None, pos_scalar=None, directory_path=path_to_save_dataloader)

Input normalisation: standardScalar
Scaler created for x values: StandardScaler()
Scaler created for x values: StandardScaler()
Scaler created for x values: StandardScaler()
Scaler created for x values: StandardScaler()
Scaler created for pos features: StandardScaler()


In [None]:
baseline_error = gio.compute_baseline_of_no_policies(dataset=dataset_normalized, loss_fct=loss_fct)
print(f'Baseline error no policies: {baseline_error}')

baseline_error = gio.compute_baseline_of_mean_target(dataset=dataset_normalized, loss_fct=loss_fct)
print(f'Baseline error mean: {baseline_error}')

Baseline error no policies: 0.3216273784637451
Baseline error mean: 0.0032576550729572773


## 4. Train the model

We first find a good model for one batch. 

In [None]:
train_dl, valid_dl, test_dl = gio.create_dataloaders(batch_size = batch_size, dataset=dataset_normalized, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)
gio.save_dataloader(test_dl, path_to_save_dataloader + 'test_dl_' + unique_model_description + '.pt')
gio.save_dataloader_params(test_dl, path_to_save_dataloader + 'test_loader_params_' + unique_model_description+ '.json')

Total dataset length: 3000
Training subset length: 2100
Validation subset length: 450
Test subset length: 450


In [None]:
print(f"Running with {torch.cuda.device_count()} GPUS")
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Name is ", torch.cuda.get_device_name())

Running with 2 GPUS
Name is  NVIDIA RTX A5000


In [None]:
wandb.login()
wandb.init(
    project=project_name,
    config={
        "epochs": num_epochs,
        "batch_size": batch_size,
        "lr": lr,
        "early_stopping_patience": 10,
        "hidden_layer_size": hidden_size_parameter,
        "gat_layers": gat_layer_parameter,
        "gcn_layers": gcn_layer_parameter,
        "output_layer": output_layer_parameter,
        "indices_to_use": indices_of_datasets_to_use,
        "dataset_length": dataset_length
    }
)
config = wandb.config

print("output_layer: ", output_layer_parameter)
print("hidden_size: ", hidden_size_parameter)
print("gat_layers: ", gat_layer_parameter)
print("gcn_layers: ", gcn_layer_parameter)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
early_stopping = gio.EarlyStopping(patience=early_stopping_patience, verbose=True)
# torch.set_printoptions(precision=4, sci_mode=False)

# gnn_instance = garch.MyGnn(in_channels=in_channels, out_channels=out_channels, hidden_size=hidden_size_parameter, gat_layers=gat_layer_parameter, gcn_layers=gcn_layer_parameter, output_layer=output_layer_parameter)

gnn_instance = garch.MyGnnHardCoded(in_channels=in_channels, out_channels=out_channels, hidden_size=hidden_size_parameter, output_layer=output_layer_parameter)

model = gnn_instance.to(device)

best_val_loss, best_epoch = garch.train(model, config=config, 
                                loss_fct=loss_fct, 
                                optimizer=torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.0),
                                train_dl=train_dl, valid_dl=valid_dl,
                                device=device, early_stopping=early_stopping,
                                use_existing_checkpoint=True, path_existing_checkpoints = "../../data/checkpoints_batchsize_8/")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33menatterer[0m ([33mtum-traffic-engineering[0m). Use [1m`wandb login --relogin`[0m to force relogin


output_layer:  gat
hidden_size:  64
gat_layers:  5
gcn_layers:  0
Model initialized
MyGnnHardCoded(
  (pointLayer): PointNetConv(local_nn=Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  ), global_nn=Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
  ))
  (graph_layers): Sequential(
    (0) - GATConv(64, 128, heads=1): x, edge_index -> x
    (1) - ReLU(inplace=True): x -> x
    (2) - GATConv(128, 32, heads=1): x, edge_index -> x
    (3) - ReLU(inplace=True): x -> x
    (4) - GATConv(32, 128, heads=1): x, edge_index -> x
    (5) - ReLU(inplace=True): x -> x
    (6) - GATConv(128, 64, heads=1): x, edge_index -> x
    (7) - ReLU(inplace=True): x -> x
    (8) - GATConv(64, 64, heads=1): x, edge_index -> x
    (9) - Re

525it [00:51, 10.17it/s]


epoch: 0, validation loss: 0.0031139697319110936


525it [00:51, 10.26it/s]


epoch: 1, validation loss: 0.0030884561128914356


525it [00:52,  9.95it/s]


epoch: 2, validation loss: 0.0030703635420650244


525it [00:50, 10.34it/s]


epoch: 3, validation loss: 0.003056793685887108


525it [00:51, 10.15it/s]


epoch: 4, validation loss: 0.0030368021689355373
Model checkpoint saved at epoch 4


525it [00:51, 10.10it/s]


epoch: 5, validation loss: 0.003034990280866623


525it [00:51, 10.25it/s]


epoch: 6, validation loss: 0.002995197904353912


525it [00:52, 10.02it/s]


epoch: 7, validation loss: 0.003033470366545747
EarlyStopping counter: 1 out of 10


525it [00:50, 10.30it/s]


epoch: 8, validation loss: 0.00297945411875844


525it [00:53,  9.80it/s]


epoch: 9, validation loss: 0.002970233248480785
Model checkpoint saved at epoch 9


525it [00:52,  9.97it/s]


epoch: 10, validation loss: 0.0029909785371273756
EarlyStopping counter: 1 out of 10


525it [00:52, 10.08it/s]


epoch: 11, validation loss: 0.0029343392316713533


525it [00:52,  9.93it/s]


epoch: 12, validation loss: 0.0029296712080362887


525it [00:50, 10.30it/s]


epoch: 13, validation loss: 0.0029019871087893182


525it [00:53,  9.89it/s]


epoch: 14, validation loss: 0.0029067131752494426
Model checkpoint saved at epoch 14
EarlyStopping counter: 1 out of 10


525it [00:51, 10.10it/s]


epoch: 15, validation loss: 0.0029109576634601154
EarlyStopping counter: 2 out of 10


525it [00:51, 10.16it/s]


epoch: 16, validation loss: 0.0029723022660586686
EarlyStopping counter: 3 out of 10


525it [00:51, 10.13it/s]


epoch: 17, validation loss: 0.0028666958307633095


525it [00:51, 10.12it/s]


epoch: 18, validation loss: 0.002884331624954939
EarlyStopping counter: 1 out of 10


525it [00:52,  9.97it/s]


epoch: 19, validation loss: 0.002844018628820777
Model checkpoint saved at epoch 19


525it [00:51, 10.17it/s]


epoch: 20, validation loss: 0.0029125306034147474
EarlyStopping counter: 1 out of 10


525it [00:53,  9.79it/s]


epoch: 21, validation loss: 0.002852027990599781
EarlyStopping counter: 2 out of 10


525it [00:51, 10.21it/s]


epoch: 22, validation loss: 0.00285186804831028
EarlyStopping counter: 3 out of 10


525it [00:51, 10.28it/s]


epoch: 23, validation loss: 0.0028821455780416727
EarlyStopping counter: 4 out of 10


525it [00:52, 10.07it/s]


epoch: 24, validation loss: 0.002897255790246799
Model checkpoint saved at epoch 24
EarlyStopping counter: 5 out of 10


525it [00:52, 10.05it/s]

: 

: 

In [None]:
model_save_path = "../../data/trained_models/model_1.pth"
torch.save(model.state_dict(), model_save_path)

In [None]:
# model_path = '../../data/trained_models/model_' + unique_model_description + '.pth'

# # Save the model state dictionary and configuration
# torch.save({
#     'state_dict': model.state_dict(),
#     'config': {
#         'in_channels': model.in_channels,
#         'out_channels': model.out_channels,
#         'hidden_size': model.hidden_size,
#         'gat_layers': model.gat_layers,
#         'gcn_layers': model.gcn_layers,ls
#         'output_layer': model.output_layer
#     }
# }, model_path)