In [10]:
import math
import numpy as np
import wandb

import torch
import torch_geometric
from torch_geometric.data import Data

import sys
import os
from tqdm import tqdm
import random

# Put the parent directory on the module search path (once), so that the
# project-local imports further down in this cell can be resolved.
scripts_path = os.path.abspath(os.pardir)
if sys.path.count(scripts_path) == 0:
    sys.path.append(scripts_path)
    
import joblib

# Now you can import the gnn_io module
import gnn_io as gio

import gnn_architectures as garch
# torch.set_printoptions(precision=4, sci_mode=False)

## 1. Define model and parameters

In [11]:
# Define parameters
# Values that are used directly by later cells AND logged to Weights & Biases
# are defined exactly once here and referenced from `config`, so the logged
# hyperparameters cannot silently drift from the ones that are actually used.
num_epochs = 1000
project_name = "test_different_parameters"
path_to_save_dataloader = "../../data/data_created_during_training_needed_for_testing/"
# indices_of_datasets_to_use = [0, 1, 2, 3]

loss_fct = torch.nn.MSELoss()
out_channels = 1 # we are predicting one value
early_stopping_patience = 10

# Hyperparameters passed to wandb.init() and to the model/training calls below.
config={
        "epochs": num_epochs,
        "batch_size": 32,
        "lr": 0.001,
        "loss_fct": type(loss_fct).__name__,  # "MSELoss", derived from the loss object in use
        "early_stopping_patience": early_stopping_patience,
        "hidden_layer_size": 32,
        "gat_layers": 2,
        "gcn_layers": 0,
        "output_layer": 'gat',
        "in_channels": 6, # dimensions of the x vector + 2 (pos)
        "out_channels": out_channels,
        # "dropout": 0.15,
    }

# Tag embedded in the file names of everything saved for this configuration
# (test dataloader, dataloader params, exported model).
unique_model_description = (
    "mse_loss"
    f"_hidden_{config['hidden_layer_size']}"
    f"_gat_{config['gat_layers']}"
    f"_gcn_{config['gcn_layers']}"
    f"_lr_{config['lr']}"
    f"_batch_{config['batch_size']}"
    f"_epochs_{config['epochs']}"
    f"_early_{config['early_stopping_patience']}"
    f"_out_{config['output_layer']}_"
)

# Ensure deterministic behavior.
# The seed has to be a fixed literal: Python randomizes str hashes per process
# (see PYTHONHASHSEED), so a seed derived from hash("some text") is different
# after every kernel restart and the run is NOT repeatable. In addition,
# `hash(...) % 2**32 - 1` can evaluate to -1, which np.random.seed() rejects.
SEED = 42
torch.backends.cudnn.deterministic = True
random.seed(SEED)                 # Python's built-in RNG
np.random.seed(SEED)              # NumPy's legacy global RNG
torch.manual_seed(SEED)           # PyTorch default generator
torch.cuda.manual_seed_all(SEED)  # PyTorch generators of all CUDA devices

# Device configuration: use the first CUDA device when CUDA is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Serialized training samples; the next cell reads the 'x', 'edge_index',
# 'pos' and 'y' entries of each element.
train_data_path = '../../data/train_data/dataset_1pm_0-3100.pt'
data_dict_list = torch.load(train_data_path)

## 2. Load data

In [12]:
# Reconstruct the Data objects: one graph per loaded dictionary, built from
# its 'x', 'edge_index', 'pos' and 'y' entries.
datalist = []
for graph_dict in data_dict_list:
    datalist.append(
        Data(
            x=graph_dict['x'],
            edge_index=graph_dict['edge_index'],
            pos=graph_dict['pos'],
            y=graph_dict['y'],
        )
    )

# Normalize the graphs. No pre-fitted scalers are supplied (all None);
# path_to_save_dataloader is handed over as the helper's directory_path.
dataset = gio.normalize_dataset(
    datalist,
    y_scalar=None,
    x_scalar_list=None,
    pos_scalar=None,
    directory_path=path_to_save_dataloader,
)

In [13]:
# Report the loss of two reference baselines on the normalized dataset, using
# the same loss function as training. As before, `baseline_error` ends up
# holding the value of the last baseline computed (the mean-target one).
for baseline_label, compute_baseline in (
    ("no policies", gio.compute_baseline_of_no_policies),
    ("mean", gio.compute_baseline_of_mean_target),
):
    baseline_error = compute_baseline(dataset=dataset, loss_fct=loss_fct)
    print(f'Baseline error {baseline_label}: {baseline_error}')

Baseline error no policies: 0.3216274082660675
Baseline error mean: 0.0032576550729572773


## 3. Train the model

We first find a good model for one batch. 

In [14]:
# Split the normalized dataset 70% / 15% / 15% into train, validation and
# test dataloaders, batched with the configured batch size.
train_dl, valid_dl, test_dl = gio.create_dataloaders(
    dataset=dataset,
    batch_size=config['batch_size'],
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
)

# Store the test dataloader and its parameters next to the other artefacts of
# this configuration; the file names carry the unique model description.
gio.save_dataloader(
    test_dl,
    f"{path_to_save_dataloader}test_dl_{unique_model_description}.pt",
)
gio.save_dataloader_params(
    test_dl,
    f"{path_to_save_dataloader}test_loader_params_{unique_model_description}.json",
)

Total dataset length: 3079
Training subset length: 2155
Validation subset length: 461
Test subset length: 463


In [15]:
# Report how many CUDA devices PyTorch can see. When CUDA is available, also
# point `device` at the default CUDA device and print that device's name.
num_gpus = torch.cuda.device_count()
print(f"Running with {num_gpus} GPUS")
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Name is ", torch.cuda.get_device_name())

Running with 0 GPUS


In [16]:
# Train one model for the configuration defined above, log the run to
# Weights & Biases and export the trained model to ONNX.
wandb.login()

# The run is used as a context manager so that it is always finished when this
# cell ends -- on success, on an exception, or when training is interrupted
# with Ctrl-C -- instead of leaving an open run behind in the kernel.
# NOTE: the run is closed after this cell; wandb calls that need an active
# run (wandb.log, wandb.save, ...) must stay inside the `with` block.
with wandb.init(
    project=project_name,
    config=config,
):
    early_stopping = gio.EarlyStopping(patience=early_stopping_patience, verbose=True)
    gnn_instance = garch.MyGnn(
        in_channels=config['in_channels'],
        out_channels=out_channels,
        hidden_size=config['hidden_layer_size'],
        gat_layers=config['gat_layers'],
        gcn_layers=config['gcn_layers'],
        output_layer=config['output_layer'],
    )
    model = gnn_instance.to(device)
    wandb.watch(model)
    # From here on `config` is the run's wandb.config object rather than the
    # plain dict defined earlier; item access such as config['lr'] still works.
    config = wandb.config

    best_val_loss, best_epoch = garch.train(model, config=config,
                                    loss_fct=loss_fct,
                                    optimizer=torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=0.0),
                                    train_dl=train_dl, valid_dl=valid_dl,
                                    device=device, early_stopping=early_stopping)

    # Export the trained model and attach the exported file to the run.
    onnx_path = path_to_save_dataloader + 'model_' + unique_model_description + '.onnx'
    model.to_onnx(onnx_path, test_dl, device)
    wandb.save(onnx_path)



[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


Model initialized
MyGnn(
  (pointLayer): PointNetConv(local_nn=Sequential(
    (0): Linear(in_features=6, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=32, bias=True)
  ), global_nn=Sequential(
    (0): Linear(in_features=32, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
  ))
  (graph_layers): Sequential(
    (0) - GATConv(32, 32, heads=1): x, edge_index -> x
    (1) - ReLU(inplace=True): x -> x
    (2) - GATConv(32, 32, heads=1): x, edge_index -> x
    (3) - ReLU(inplace=True): x -> x
  )
  (output_layer): GATConv(32, 1, heads=1)
)


1it [00:07,  7.52s/it]

In [None]:
# model_path = '../../data/trained_models/model_' + unique_model_description + '.pth'

# # Save the model state dictionary and configuration
# torch.save({
#     'state_dict': model.state_dict(),
#     'config': {
#         'in_channels': model.in_channels,
#         'out_channels': model.out_channels,
#         'hidden_size': model.hidden_size,
#         'gat_layers': model.gat_layers,
#         'gcn_layers': model.gcn_layers,
#         'output_layer': model.output_layer
#     }
# }, model_path)