In [None]:
import math
import numpy as np
import wandb
import random
import torch
import torch_geometric
from torch_geometric.data import Data
import sys
import os
from tqdm import tqdm
import signal
import joblib
import argparse
import json
import os
import subprocess
from torch.utils.data import DataLoader, Dataset, Subset
import help_functions as hf

# Put the parent directory on the import path (once), so modules located there can be
# imported by the statements that follow — presumably gnn_io / gnn_architectures.
scripts_path = os.path.abspath('..')
already_importable = scripts_path in sys.path
if not already_importable:
    sys.path.append(scripts_path)
    
import gnn_io as gio
import gnn_architectures as garch

This notebook can be used for debugging; its code is analogous to `run_models.py`.

In [2]:
# Run configuration: every tunable value for this debugging run lives in this dict.
# The cell below reads these entries to build paths, the wandb config, the model and
# the training call.
params = {
    # Names the data sub-directory and the wandb project.
    "project_name": "test",
    # Forwarded to hf.prepare_data to choose which datasets are used.
    "indices_of_datasets_to_use": [0, 1, 3, 4],
    # Training schedule.
    "num_epochs": 1000,
    "batch_size": 8,
    # Layer structures handed to garch.MyGnn.
    "point_net_conv_layer_structure_local_mlp": [64],
    "point_net_conv_layer_structure_global_mlp": [64],
    "gat_conv_layer_structure": [64, 128],
    # Optimiser settings.
    "lr": 0.001,
    "gradient_accumulation_steps": 3,
    # Model input / output channel counts.
    "in_channels": 6,
    "out_channels": 1,
    # Passed as `patience` to gio.EarlyStopping.
    "early_stopping_patience": 20,
    # Names the per-run output directory.
    "unique_model_description": "my_test",
    # Dropout settings handed to garch.MyGnn.
    "dropout": 0.3,
    "use_dropout": False,
}

In [3]:
# End-to-end debugging run: pick a device, load the data, register the run with wandb,
# build the model, report baseline losses and train with early stopping.

# Choose a GPU through the help_functions helpers before any tensor is placed on a device.
gpus = hf.get_available_gpus()
best_gpu = hf.select_best_gpu(gpus)
hf.set_cuda_visible_device(best_gpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG library this notebook imports so that stochastic steps (e.g. weight
# initialisation, dropout) are repeatable across kernel restarts. "seed" is an optional
# key of `params`; 42 is used when it is absent. Some CUDA kernels can still be
# non-deterministic, so this narrows run-to-run variance rather than guaranteeing
# bit-identical results.
seed = params.get("seed", 42)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Create base directory for the run
base_dir = '../../data/' + params['project_name'] + '/'
unique_run_dir = os.path.join(base_dir, params['unique_model_description'])
os.makedirs(unique_run_dir, exist_ok=True)
dataset_path = '../../data/train_data/sim_output_1pm_capacity_reduction_10k_PRELIMINARY.pt'

# Resolve the save locations and build the train / validation dataloaders.
data_dict_list, model_save_path, path_to_save_dataloader = hf.get_paths(
    base_dir=base_dir,
    unique_model_description=params['unique_model_description'],
    model_save_path='trained_model/model.pth',
    dataset_path=dataset_path,
)
train_dl, valid_dl = hf.prepare_data(
    data_dict_list=data_dict_list,
    indices_of_datasets_to_use=params['indices_of_datasets_to_use'],
    batch_size=params['batch_size'],
    path_to_save_dataloader=path_to_save_dataloader,
    normalize_y=False,
    normalize_pos=True,
)

# Register the run; the returned config object is read attribute-style below.
config = hf.setup_wandb(params['project_name'], {
    "epochs": params['num_epochs'],
    "batch_size": params['batch_size'],
    "lr": params['lr'],
    "gradient_accumulation_steps": params['gradient_accumulation_steps'],
    "early_stopping_patience": params['early_stopping_patience'],
    "point_net_conv_local_mlp": params['point_net_conv_layer_structure_local_mlp'],
    "point_net_conv_global_mlp": params['point_net_conv_layer_structure_global_mlp'],
    "gat_conv_layer_structure": params['gat_conv_layer_structure'],
    "indices_to_use": params['indices_of_datasets_to_use'],
    "in_channels": params['in_channels'],
    "out_channels": params['out_channels'],
    "dropout": params['dropout'],
    "use_dropout": params['use_dropout']
})

# Everything after the wandb setup is guarded so the run is always closed: marked as
# failed when an error or a manual interrupt stops the cell, finished normally otherwise.
# NOTE(review): assumes hf.setup_wandb starts the run via wandb.init — confirm.
# wandb.finish() does nothing when no run is active.
try:
    gnn_instance = garch.MyGnn(
        in_channels=config.in_channels,
        out_channels=config.out_channels,
        point_net_conv_layer_structure_local_mlp=config.point_net_conv_local_mlp,
        point_net_conv_layer_structure_global_mlp=config.point_net_conv_global_mlp,
        gat_conv_layer_structure=config.gat_conv_layer_structure,
        dropout=config.dropout,
        use_dropout=config.use_dropout,
    )
    model = gnn_instance.to(device)
    loss_fct = torch.nn.MSELoss()

    # Reference losses computed on the training data, printed for comparison with the
    # model's loss.
    baseline_loss_mean_target = gio.compute_baseline_of_mean_target(dataset=train_dl, loss_fct=loss_fct)
    baseline_loss = gio.compute_baseline_of_no_policies(dataset=train_dl, loss_fct=loss_fct)
    print("baseline loss mean " + str(baseline_loss_mean_target))
    print("baseline loss no  " + str(baseline_loss))

    early_stopping = gio.EarlyStopping(patience=params['early_stopping_patience'], verbose=True)
    best_val_loss, best_epoch = garch.train(
        model=model,
        config=config,
        loss_fct=loss_fct,
        optimizer=torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=1e-4),
        train_dl=train_dl,
        valid_dl=valid_dl,
        device=device,
        early_stopping=early_stopping,
        accumulation_steps=config.gradient_accumulation_steps,
        model_save_path=model_save_path,
        use_gradient_clipping=True,
        lr_scheduler_warmup_steps=20000,
        lr_scheduler_cosine_decay_rate=0.2,
    )
    print(f'Best model saved to {model_save_path} with validation loss: {best_val_loss} at epoch {best_epoch}')
except BaseException:
    # BaseException so that a KeyboardInterrupt (kernel interrupt) also closes the run.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

Using GPU 0 with CUDA_VISIBLE_DEVICES=0
Total dataset length: 4887
Training subset length: 3909
Validation subset length: 733
Test subset length: 245
Scaler created for x values at index 0: StandardScaler()
Scaler created for x values at index 1: StandardScaler()
Scaler created for x values at index 2: StandardScaler()
Scaler created for x values at index 3: StandardScaler()
Scaler created for pos features: StandardScaler()


: 

: 