In [1]:
import math
import numpy as np
import wandb

import torch
import torch_geometric
from torch_geometric.data import Data

import sys
import os
from tqdm import tqdm

# Add the 'scripts' directory to the Python path
scripts_path = os.path.abspath(os.path.join('..'))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
    
import joblib

# Now you can import the gnn_io module
import gnn_io as gio

import gnn_architectures as garch

## 1. Define model and parameters

In [2]:
# Define parameters 
num_epochs = 1000
unique_model_description = "mse_loss"
project_name = "try_overfitting"
path_to_save_dataloader = "../../data/data_created_during_training_needed_for_testing/"
indices_of_datasets_to_use = [0, 1, 2, 3]

loss_fct = torch.nn.MSELoss()
batch_size = 16
output_layer_parameter = 'gat'
hidden_size_parameter = 128
gat_layer_parameter = 5
gcn_layer_parameter = 0
lr = 0.001
in_channels = len(indices_of_datasets_to_use) + 2 # dimensions of the x vector + 2 (pos)
out_channels = 1 # we are predicting one value
early_stopping_patience = 10

data_dict_list = torch.load('../../data/train_data/dataset_1pm_0-3100.pt')

## 2. Load data

In [3]:
# Reconstruct the Data objects
datalist = [Data(x=d['x'], edge_index=d['edge_index'], pos=d['pos'], y=d['y']) for d in data_dict_list]
# dataset_only_relevant_dimensions = gio.cut_dimensions(dataset=datalist, indices_of_dimensions_to_keep=indices_of_datasets_to_use)

In [4]:
datalist[0].pos

tensor([[ 2.3386, 48.8518],
        [ 2.3387, 48.8524],
        [ 2.3387, 48.8524],
        ...,
        [ 2.3143, 48.8912],
        [ 2.2712, 48.8380],
        [ 2.2750, 48.8370]])

In [5]:
dataset_normalized = gio.normalize_dataset(datalist, y_scalar=None, x_scalar_list=None, pos_scalar=None, directory_path=path_to_save_dataloader)

In [6]:
asdfasdf

NameError: name 'asdfasdf' is not defined

In [None]:
baseline_error = gio.compute_baseline_of_no_policies(dataset=dataset_normalized, loss_fct=loss_fct)
print(f'Baseline error no policies: {baseline_error}')

baseline_error = gio.compute_baseline_of_mean_target(dataset=dataset_normalized, loss_fct=loss_fct)
print(f'Baseline error mean: {baseline_error}')

Baseline error no policies: 0.3216273784637451
Baseline error mean: 0.0032576550729572773


## 4. Train the model

We first find a good model for one batch. 

In [None]:
train_dl, valid_dl, test_dl = gio.create_dataloaders(batch_size = batch_size, dataset=dataset_normalized, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)
gio.save_dataloader(test_dl, path_to_save_dataloader + 'test_dl_' + unique_model_description + '.pt')
gio.save_dataloader_params(test_dl, path_to_save_dataloader + 'test_loader_params_' + unique_model_description+ '.json')

Total dataset length: 3079
Training subset length: 2155
Validation subset length: 461
Test subset length: 463


In [None]:
print(f"Running with {torch.cuda.device_count()} GPUS")
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Name is ", torch.cuda.get_device_name())

Running with 2 GPUS
Name is  NVIDIA RTX A5000


In [None]:
wandb.login()
wandb.init(
    project=project_name,
    config={
        "epochs": num_epochs,
        "batch_size": batch_size,
        "lr": lr,
        "early_stopping_patience": 10,
        "hidden_layer_size": hidden_size_parameter,
        "gat_layers": gat_layer_parameter,
        "gcn_layers": gcn_layer_parameter,
        "output_layer": output_layer_parameter,
        # "dropout": 0.15,
    }
)
config = wandb.config

print("output_layer: ", output_layer_parameter)
print("hidden_size: ", hidden_size_parameter)
print("gat_layers: ", gat_layer_parameter)
print("gcn_layers: ", gcn_layer_parameter)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
early_stopping = gio.EarlyStopping(patience=early_stopping_patience, verbose=True)
# torch.set_printoptions(precision=4, sci_mode=False)

gnn_instance = garch.MyGnn(in_channels=in_channels, out_channels=out_channels, hidden_size=hidden_size_parameter, gat_layers=gat_layer_parameter, gcn_layers=gcn_layer_parameter, output_layer=output_layer_parameter)
model = gnn_instance.to(device)

best_val_loss, best_epoch = garch.train(model, config=config, 
                                loss_fct=loss_fct, 
                                optimizer=torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.0),
                                train_dl=train_dl, valid_dl=valid_dl,
                                device=device, early_stopping=early_stopping)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33menatterer[0m ([33mtum-traffic-engineering[0m). Use [1m`wandb login --relogin`[0m to force relogin


output_layer:  gat
hidden_size:  128
gat_layers:  5
gcn_layers:  0
Model initialized
MyGnn(
  (pointLayer): PointNetConv(local_nn=Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
  ), global_nn=Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=128, bias=True)
  ))
  (graph_layers): Sequential(
    (0) - GATConv(128, 128, heads=1): x, edge_index -> x
    (1) - ReLU(inplace=True): x -> x
    (2) - GATConv(128, 128, heads=1): x, edge_index -> x
    (3) - ReLU(inplace=True): x -> x
    (4) - GATConv(128, 128, heads=1): x, edge_index -> x
    (5) - ReLU(inplace=True): x -> x
    (6) - GATConv(128, 128, heads=1): x, edge_index -> x
    (7) - ReLU(inplace=True): x -> x
    (8) - GATConv(128, 128, heads=1): x, edge_index -> x
    (9) -

135it [00:55,  2.45it/s]


epoch: 0, validation loss: 0.0032588329551548794


135it [00:54,  2.46it/s]


epoch: 1, validation loss: 0.003256622003391385


135it [00:55,  2.43it/s]


epoch: 2, validation loss: 0.003254267852753401


135it [00:55,  2.41it/s]


epoch: 3, validation loss: 0.00324024329893291


135it [00:56,  2.41it/s]


epoch: 4, validation loss: 0.003225539345294237


135it [00:55,  2.41it/s]


epoch: 5, validation loss: 0.0032112430700839594


135it [00:56,  2.41it/s]


epoch: 6, validation loss: 0.0031994767406762674


135it [00:55,  2.42it/s]


epoch: 7, validation loss: 0.0032200186898739174
EarlyStopping counter: 1 out of 10


135it [00:54,  2.46it/s]


epoch: 8, validation loss: 0.0032016788689612314
EarlyStopping counter: 2 out of 10


135it [00:55,  2.45it/s]


epoch: 9, validation loss: 0.0031962605225371904


135it [00:55,  2.42it/s]


epoch: 10, validation loss: 0.0031851678203534462


135it [00:55,  2.42it/s]


epoch: 11, validation loss: 0.0031651213347654917


135it [00:55,  2.42it/s]


epoch: 12, validation loss: 0.0032013689633458853
EarlyStopping counter: 1 out of 10


135it [00:56,  2.41it/s]


epoch: 13, validation loss: 0.003147430019453168


135it [00:55,  2.42it/s]


epoch: 14, validation loss: 0.0031355712402226597


135it [00:55,  2.42it/s]


epoch: 15, validation loss: 0.0031265604133107537


135it [00:55,  2.41it/s]


KeyboardInterrupt: 

In [None]:
# model_path = '../../data/trained_models/model_' + unique_model_description + '.pth'

# # Save the model state dictionary and configuration
# torch.save({
#     'state_dict': model.state_dict(),
#     'config': {
#         'in_channels': model.in_channels,
#         'out_channels': model.out_channels,
#         'hidden_size': model.hidden_size,
#         'gat_layers': model.gat_layers,
#         'gcn_layers': model.gcn_layers,
#         'output_layer': model.output_layer
#     }
# }, model_path)