In [1]:
import math
import numpy as np
import wandb
import random
import torch
import torch_geometric
from torch_geometric.data import Data
import sys
import os
from tqdm import tqdm
import signal
import joblib
import argparse
import json
import os
import subprocess
from torch.utils.data import DataLoader, Dataset, Subset


# Make the parent directory importable so the project-local modules
# (imported right below) can be resolved from this notebook.
scripts_path = os.path.abspath('..')
if sys.path.count(scripts_path) == 0:
    sys.path.append(scripts_path)
    
import gnn_io as gio
import gnn_architectures as garch
    
def get_available_gpus():
    """Query ``nvidia-smi`` and return one record per reported GPU.

    Returns:
        list[dict]: one dict per GPU with integer values under the keys
        ``'index'``, ``'utilization'`` and ``'memory_free'`` (taken from the
        ``index``, ``utilization.gpu`` and ``memory.free`` query fields).
        The list is empty when ``nvidia-smi`` prints no rows.

    Raises:
        RuntimeError: if ``nvidia-smi`` exits with a non-zero return code.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=index,utilization.gpu,memory.free",
        "--format=csv,noheader,nounits",
    ]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        raise RuntimeError(f"Error executing nvidia-smi: {result.stderr.decode('utf-8')}")

    gpus = []
    for line in result.stdout.decode('utf-8').splitlines():
        # Skip blank lines so empty output yields [] instead of an unpacking error.
        if not line.strip():
            continue
        # Split on the bare comma and strip each field, so the parser does not
        # depend on the exact ", " spacing of the CSV output.
        index, utilization, memory_free = (field.strip() for field in line.split(','))
        gpus.append({
            'index': int(index),
            'utilization': int(utilization),
            'memory_free': int(memory_free)
        })
    return gpus
    
def select_best_gpu(gpus):
    """Return the ``'index'`` of the preferred GPU record.

    Records are ranked by most free memory first; ties are broken by the
    lowest utilization. ``gpus`` is an iterable of dicts carrying the keys
    ``'index'``, ``'utilization'`` and ``'memory_free'``.
    """
    def preference(gpu):
        # Negated memory puts larger values first in an ascending sort.
        return (-gpu['memory_free'], gpu['utilization'])

    ranked = list(gpus)
    ranked.sort(key=preference)
    best = ranked[0]
    return best['index']

def set_cuda_visible_device(gpu_index):
    """Export ``CUDA_VISIBLE_DEVICES`` for the given GPU index and report it.

    The value is stored as ``str(gpu_index)`` in the process environment.
    NOTE(review): presumably this has to run before CUDA is initialised to
    have any effect -- confirm against the caller.
    """
    visible_devices = str(gpu_index)
    os.environ['CUDA_VISIBLE_DEVICES'] = visible_devices
    print(f"Using GPU {gpu_index} with CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")

# Define the paths here
def get_paths(base_dir, unique_model_description, dataset_path='../../data/train_data/dataset_1pm_0-5000.pt'):
    """Create the output directories for a run and load the training dataset.

    Args:
        base_dir: directory under which the run directory is created.
        unique_model_description: name of the run directory inside ``base_dir``.
        dataset_path: file passed to ``torch.load``. Defaults to the dataset
            location that used to be hard-coded here, so existing callers
            behave as before.

    Returns:
        tuple: ``(data_dict_list, model_save_path, path_to_save_dataloader)``
        where ``data_dict_list`` is whatever ``torch.load`` returns,
        ``model_save_path`` is ``<run>/trained_model/model.pth`` and
        ``path_to_save_dataloader`` is ``<run>/data_created_during_training/``
        (with trailing slash).
    """
    data_path = os.path.join(base_dir, unique_model_description)
    model_save_path = os.path.join(data_path, 'trained_model/model.pth')
    path_to_save_dataloader = os.path.join(data_path, 'data_created_during_training/')

    # makedirs creates missing parents, so the run directory itself is
    # created implicitly by these two calls.
    os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
    os.makedirs(path_to_save_dataloader, exist_ok=True)

    data_dict_list = torch.load(dataset_path)
    return data_dict_list, model_save_path, path_to_save_dataloader

# Define parameters
def get_parameters(args):
    """Assemble the run configuration dictionary from parsed arguments.

    Args:
        args: object exposing the attributes ``lr``, ``batch_size``,
            ``hidden_layers_base_for_point_net_conv``,
            ``hidden_layer_structure`` (comma-separated integers),
            ``gradient_accumulation_steps`` and ``early_stopping_patience``.

    Returns:
        dict: the fixed settings (project name, feature indices, epoch count,
        channel counts), the converted argument values, and a
        ``unique_model_description`` string built from the hidden-layer
        settings.
    """
    # Fixed settings that are not exposed through ``args``.
    selected_feature_indices = [0, 1, 3, 4]
    input_channel_count = len(selected_feature_indices) + 2
    output_channel_count = 1

    # Values converted from ``args``.
    learning_rate = float(args.lr)
    samples_per_batch = int(args.batch_size)
    point_net_hidden_base = int(args.hidden_layers_base_for_point_net_conv)
    layer_sizes = [int(size) for size in args.hidden_layer_structure.split(',')]
    accumulation_steps = int(args.gradient_accumulation_steps)
    patience = int(args.early_stopping_patience)

    # Only the hidden-layer settings go into the run name; the other
    # hyperparameters are deliberately left out of it.
    description_parts = (
        f"hidden_{point_net_hidden_base}_",
        f"hidden_layer_str_{gio.int_list_to_string(lst = layer_sizes, delimiter='_')}_",
    )
    run_description = "".join(description_parts)

    parameters = {}
    parameters["project_name"] = "runs_NEW"
    parameters["indices_of_datasets_to_use"] = selected_feature_indices
    parameters["num_epochs"] = 1000
    parameters["batch_size"] = samples_per_batch
    parameters["hidden_layers_base_for_point_net_conv"] = point_net_hidden_base
    parameters["hidden_layer_structure"] = layer_sizes
    parameters["lr"] = learning_rate
    parameters["gradient_accumulation_steps"] = accumulation_steps
    parameters["in_channels"] = input_channel_count
    parameters["out_channels"] = output_channel_count
    parameters["early_stopping_patience"] = patience
    parameters["unique_model_description"] = run_description
    return parameters
        
def set_random_seeds():
    """Seed Python, NumPy and PyTorch RNGs with fixed, run-independent values.

    Each seed is derived from a fixed label with CRC32. The built-in
    ``hash()`` of a string is salted per interpreter process (unless
    ``PYTHONHASHSEED`` is pinned), so seeds derived from it change on every
    kernel restart and do not give reproducible runs. CRC32 is stable and
    already lies in ``[0, 2**32 - 1]``, the range ``np.random.seed`` accepts.
    """
    import zlib  # local import: keeps this block self-contained

    def _stable_seed(label):
        # Deterministic 32-bit unsigned value for a text label.
        return zlib.crc32(label.encode("utf-8"))

    torch.backends.cudnn.deterministic = True
    random.seed(_stable_seed("setting random seeds"))
    np.random.seed(_stable_seed("improves reproducibility"))
    torch.manual_seed(_stable_seed("by removing stochasticity"))
    torch.cuda.manual_seed_all(_stable_seed("so runs are repeatable"))

def setup_wandb(project_name, config):
    """Log in to Weights & Biases, start a run and return its config object.

    Args:
        project_name: forwarded to ``wandb.init`` as ``project``.
        config: forwarded to ``wandb.init`` as ``config``.

    Returns:
        ``wandb.config`` as it is after the run has been initialised.
    """
    wandb.login()
    wandb.init(config=config, project=project_name)
    run_config = wandb.config
    return run_config
        
def train_model(config, train_dl, valid_dl, device, early_stopping, checkpoint_dir, model_save_path):
    """Build a ``garch.MyGnn`` from ``config``, train it and print the result.

    The model is moved to ``device`` and trained through ``garch.train`` with
    an MSE loss and an Adam optimizer (``lr=config.lr``, no weight decay).
    Nothing is returned; the best validation loss and its epoch are printed.

    NOTE(review): the training cell further down this notebook constructs
    ``garch.MyGnn`` and calls ``garch.train`` with different keyword
    arguments -- confirm which signature is current before using this helper.
    """
    model = garch.MyGnn(
        in_channels=config.in_channels,
        out_channels=config.out_channels,
        hidden_layers_base_for_point_net_conv=config.hidden_layers_base_for_point_net_conv,
        hidden_layer_structure=config.hidden_layer_structure,
    ).to(device)
    mse_loss = torch.nn.MSELoss()
    adam = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=0.0)

    training_result = garch.train(
        model=model,
        config=config,
        loss_fct=mse_loss,
        optimizer=adam,
        train_dl=train_dl,
        valid_dl=valid_dl,
        device=device,
        early_stopping=early_stopping,
        accumulation_steps=config.gradient_accumulation_steps,
        use_existing_checkpoint=False,
        path_existing_checkpoints=checkpoint_dir,
        compute_r_squared=False,
        model_save_path=model_save_path,
    )
    best_val_loss, best_epoch = training_result
    print(f'Best model saved to {model_save_path} with validation loss: {best_val_loss} at epoch {best_epoch}')

In [2]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Call this function during training with a directory path: it fits the scalers and saves them there. During testing, use normalize_dataset_with_given_scaler with the saved scalers instead (no directory path is needed).
def normalize_dataset_create_scaler(dataset_input, directory_path=None):
    """Fit scalers on a dataset and return the normalized data plus scalers.

    Runs the x, positional and y normalization helpers in that order on
    ``dataset_input.copy()``; ``directory_path`` is forwarded to each helper.
    NOTE(review): whether the contained graph objects are shared with the
    input depends on what ``.copy()`` does for the dataset type -- confirm.

    Returns:
        tuple: ``(dataset, x_scaler, pos_scaler, y_scaler)``.
    """
    normalized = dataset_input.copy()
    normalized, x_scaler = normalize_x_values_create_scalers(normalized, directory_path)
    normalized, pos_scaler = normalize_positional_features_create_scaler(normalized, directory_path)
    normalized, y_scaler = normalize_y_values_create_scaler(normalized, directory_path)
    result = (normalized, x_scaler, pos_scaler, y_scaler)
    return result

def normalize_dataset_with_given_scaler(dataset_input, x_scalar_list = None, pos_scalar=None,  y_scalar=None):
    """Normalize a dataset with already fitted scalers.

    Applies the x, positional and y helpers in that order to
    ``dataset_input.copy()`` and returns the result.

    Args:
        dataset_input: dataset to normalize.
        x_scalar_list: fitted scalers for the columns of ``x``.
        pos_scalar: fitted scaler for ``pos``.
        y_scalar: fitted scaler for ``y``.
    """
    normalized = dataset_input.copy()
    pipeline = (
        (normalize_x_values_given_scaler, x_scalar_list),
        (normalize_positional_features_given_scaler, pos_scalar),
        (normalize_y_values_given_scaler, y_scalar),
    )
    for normalize_step, fitted_scaler in pipeline:
        normalized = normalize_step(normalized, fitted_scaler)
    return normalized

def normalize_x_values_given_scaler(dataset, x_scaler_list):
    """Scale every column of ``x`` with its matching, already fitted scaler.

    The number of columns is read from the first element of ``dataset``;
    column ``i`` is transformed with ``x_scaler_list[i]``. Invalid values are
    replaced via ``replace_invalid_values`` before scaling. Each element's
    ``x`` is overwritten and ``dataset`` is returned.
    """
    num_features = dataset[0].x.shape[1]

    for feature_idx in range(num_features):
        feature_scaler = x_scaler_list[feature_idx]
        for sample in dataset:
            column = replace_invalid_values(sample.x[:, feature_idx].reshape(-1, 1))
            scaled = torch.tensor(feature_scaler.transform(column.numpy()), dtype=torch.float)
            # Build the scaled matrix column by column in a scratch attribute.
            sample.normalized_x = (
                scaled if feature_idx == 0
                else torch.cat((sample.normalized_x, scaled), dim=1)
            )

    # Swap the scratch attribute in as the new x and drop it again.
    for sample in dataset:
        sample.x = sample.normalized_x
        del sample.normalized_x
    return dataset

def normalize_positional_features_given_scaler(dataset, pos_scalar=None):
    """Scale ``pos`` of every element with an already fitted scaler.

    Args:
        dataset: iterable of objects with a ``pos`` tensor attribute.
        pos_scalar: object with a ``transform`` method that takes and returns
            a NumPy array. ``None`` (the default) leaves ``pos`` untouched;
            previously the default failed with ``AttributeError``.

    Returns:
        The same ``dataset``, with ``pos`` replaced by float tensors when a
        scaler was given.
    """
    if pos_scalar is None:
        # No scaler supplied: nothing to normalize.
        return dataset
    for data in dataset:
        data.pos = torch.tensor(pos_scalar.transform(data.pos.numpy()), dtype=torch.float)
    return dataset

def normalize_y_values_given_scaler(dataset, y_scalar=None):
    """Scale ``y`` of every element with an already fitted scaler.

    Args:
        dataset: iterable of objects with a ``y`` tensor attribute.
        y_scalar: object with a ``transform`` method that takes and returns
            a NumPy array. ``None`` (the default) leaves ``y`` untouched;
            previously the default failed with ``AttributeError``.

    Returns:
        The same ``dataset``, with ``y`` replaced by float tensors when a
        scaler was given.
    """
    if y_scalar is None:
        # No scaler supplied: targets stay in their original units.
        return dataset
    for data in dataset:
        data.y = torch.tensor(y_scalar.transform(data.y.numpy()), dtype=torch.float)
    return dataset

def normalize_x_values_create_scalers(dataset, directory_path):
    """Fit one ``StandardScaler`` per column of ``x`` and normalize in place.

    Each scaler is fit on the values of its column concatenated over all
    elements of ``dataset`` (invalid values replaced via
    ``replace_invalid_values``), then applied to every element.

    Args:
        dataset: indexable collection of objects with a 2-D ``x`` tensor.
        directory_path: directory in which ``x_scaler.pkl`` is written.
            ``None`` skips saving; previously ``None`` failed inside
            ``os.path.join``.

    Returns:
        tuple: ``(dataset, list_of_scalers)`` with one scaler per column.
    """
    shape_of_x = dataset[0].x.shape[1]
    list_of_scalers_to_save = []
    x_values = torch.cat([data.x for data in dataset], dim=0)

    for i in range(shape_of_x):
        all_node_features = replace_invalid_values(x_values[:, i].reshape(-1, 1)).numpy()

        scaler = StandardScaler()
        print(f"Scaler created for x values at index {i}: {scaler}")
        scaler.fit(all_node_features)
        list_of_scalers_to_save.append(scaler)

        for data in dataset:
            data_x_dim = replace_invalid_values(data.x[:, i].reshape(-1, 1))
            normalized_x_dim = torch.tensor(scaler.transform(data_x_dim.numpy()), dtype=torch.float)
            # Build the normalized matrix column by column in a scratch attribute.
            if i == 0:
                data.normalized_x = normalized_x_dim
            else:
                data.normalized_x = torch.cat((data.normalized_x, normalized_x_dim), dim=1)

    # Persist the scalers only when a target directory was supplied.
    if directory_path is not None:
        joblib.dump(list_of_scalers_to_save, os.path.join(directory_path, 'x_scaler.pkl'))

    # Swap the scratch attribute in as the new x and drop it again.
    for data in dataset:
        data.x = data.normalized_x
        del data.normalized_x
    return dataset, list_of_scalers_to_save

def normalize_positional_features_create_scaler(dataset, directory_path):
    """Fit a ``StandardScaler`` on all ``pos`` values and normalize each element.

    Args:
        dataset: iterable of objects with a ``pos`` tensor attribute.
        directory_path: directory in which ``pos_scaler.pkl`` is written.
            ``None`` skips saving; previously ``None`` failed inside
            ``os.path.join``.

    Returns:
        tuple: ``(dataset, scaler)``.
    """
    all_pos_features = torch.cat([data.pos for data in dataset], dim=0)
    all_pos_features = replace_invalid_values(all_pos_features).numpy()
    scaler = StandardScaler()
    print(f"Scaler created for pos features: {scaler}")
    scaler.fit(all_pos_features)

    # Persist the scaler only when a target directory was supplied.
    if directory_path is not None:
        joblib.dump(scaler, os.path.join(directory_path, 'pos_scaler.pkl'))

    for data in dataset:
        # Apply the same cleaning that was used for fitting, so NaN/inf
        # entries do not reach the transform. Clean a clone because
        # replace_invalid_values writes into its argument.
        cleaned_pos = replace_invalid_values(data.pos.clone())
        data.pos = torch.tensor(scaler.transform(cleaned_pos.numpy()), dtype=torch.float)
    return dataset, scaler


def normalize_y_values_create_scaler(dataset, directory_path):
    """Fit a ``MinMaxScaler`` on all ``y`` values and normalize each element.

    All targets are flattened to a single column before fitting, and each
    element's ``y`` is reshaped to ``(-1, 1)`` before it is transformed.

    Args:
        dataset: iterable of objects with a ``y`` tensor attribute.
        directory_path: directory in which ``y_scaler.pkl`` is written.
            ``None`` skips saving; previously ``None`` failed inside
            ``os.path.join``.

    Returns:
        tuple: ``(dataset, scaler)``.
    """
    all_y_values = torch.cat([data.y for data in dataset], dim=0).reshape(-1, 1)
    all_y_values = replace_invalid_values(all_y_values).numpy()

    scaler = MinMaxScaler()
    print(f"Scaler created for y values: {scaler}")
    scaler.fit(all_y_values)

    # Persist the scaler only when a target directory was supplied.
    if directory_path is not None:
        joblib.dump(scaler, os.path.join(directory_path, 'y_scaler.pkl'))

    for data in dataset:
        # Apply the same cleaning that was used for fitting. Clean a clone
        # because replace_invalid_values writes into its argument and the
        # reshape may be a view of the original tensor.
        cleaned_y = replace_invalid_values(data.y.reshape(-1, 1).clone())
        data.y = torch.tensor(scaler.transform(cleaned_y.numpy()), dtype=torch.float)
    return dataset, scaler

def replace_invalid_values(tensor):
    """Overwrite NaN, +inf and -inf entries of ``tensor`` with 0, in place.

    The tensor passed in is modified and the same object is returned.
    """
    invalid_entries = ~torch.isfinite(tensor)
    tensor.masked_fill_(invalid_entries, 0)
    return tensor

In [3]:
set_random_seeds()

# Create base directory for the run
base_dir = '../../data/runs_NEW/'
run_name = "this_IS"
unique_run_dir = os.path.join(base_dir, run_name)
os.makedirs(unique_run_dir, exist_ok=True)

# get_paths loads the dataset itself, so it is not loaded separately here
# (the earlier explicit torch.load read the same file a second time).
data_dict_list, model_save_path, path_to_save_dataloader = get_paths(base_dir, run_name)

# Columns of x to keep and the batch size used for the data loaders.
indices_of_datasets_to_use = [0, 1, 3, 4]
batch_size = 8

In [4]:
# Scratch check that the flag evaluates as truthy.
compute_r_2 = True

status_message = "OK"
if compute_r_2:
    print(status_message)

OK


In [5]:
# Settings consumed by the prepare_data call at the end of this cell.
# path_to_save_dataloader comes from the earlier get_paths call; the former
# self-assignment of that name was a no-op and has been dropped.
indices_of_datasets_to_use = [0, 1, 3, 4]
batch_size = 8

def prepare_data(data_dict_list, indices_of_datasets_to_use, batch_size, path_to_save_dataloader, normalize_y, normalize_pos):
    """Build normalized train/validation loaders and save the test loader.

    The raw dicts are wrapped in ``Data`` objects, reduced to the selected
    ``x`` columns, split 70/15/15 into train/validation/test, and normalized
    with scalers fitted on the training split only.

    Args:
        data_dict_list: iterable of dicts with the keys ``'x'``,
            ``'edge_index'``, ``'pos'`` and ``'y'``.
        indices_of_datasets_to_use: column indices of ``x`` to keep.
        batch_size: batch size for all three loaders.
        path_to_save_dataloader: directory for the scalers and the test loader.
        normalize_y: whether ``y`` is normalized.
        normalize_pos: whether ``pos`` is normalized.

    Returns:
        tuple: ``(train_loader, val_loader)``. The test loader is only saved
        to disk, together with its parameters.

    Raises:
        ValueError: if ``gio.normalize_dataset_create_scaler`` returns a
            number of scalers that does not match the two flags.
    """
    normalize_y = bool(normalize_y)
    normalize_pos = bool(normalize_pos)

    datalist = [Data(x=d['x'], edge_index=d['edge_index'], pos=d['pos'], y=d['y']) for d in data_dict_list]
    dataset_only_relevant_dimensions = gio.cut_dimensions(dataset=datalist, indices_of_dimensions_to_keep=indices_of_datasets_to_use)
    train_set, valid_set, test_set = gio.split_into_subsets(dataset=dataset_only_relevant_dimensions, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)

    # The helper returns (dataset, x_scaler), followed by pos_scaler and/or
    # y_scaler (in that order) only for the enabled flags.
    train_set_normalized, x_scaler, *optional_scalers = gio.normalize_dataset_create_scaler(
        dataset_input=train_set,
        directory_path=path_to_save_dataloader,
        normalize_y=normalize_y,
        normalize_pos=normalize_pos,
    )
    expected_optional = int(normalize_pos) + int(normalize_y)
    if len(optional_scalers) != expected_optional:
        raise ValueError(
            f"Expected {expected_optional} optional scaler(s) from "
            f"normalize_dataset_create_scaler, got {len(optional_scalers)}"
        )
    pos_scaler = optional_scalers.pop(0) if normalize_pos else None
    y_scaler = optional_scalers.pop(0) if normalize_y else None

    def _apply_fitted_scalers(subset):
        # Normalize a held-out split with the scalers fitted on the training split.
        return gio.normalize_dataset_with_given_scaler(
            dataset_input=subset,
            x_scalar_list=x_scaler,
            pos_scalar=pos_scaler,
            y_scalar=y_scaler,
            normalize_y=normalize_y,
            normalize_pos=normalize_pos,
        )

    valid_set_normalized = _apply_fitted_scalers(valid_set)
    test_set_normalized = _apply_fitted_scalers(test_set)

    train_loader = DataLoader(dataset=train_set_normalized, batch_size=batch_size, shuffle=True, num_workers=4, prefetch_factor=2, pin_memory=True, collate_fn=gio.collate_fn, worker_init_fn=seed_worker)
    val_loader = DataLoader(dataset=valid_set_normalized, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True, collate_fn=gio.collate_fn, worker_init_fn=seed_worker)
    test_loader = DataLoader(dataset=test_set_normalized, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=gio.collate_fn, worker_init_fn=seed_worker)

    # os.path.join keeps the result correct whether or not the directory
    # string ends with a path separator.
    gio.save_dataloader(test_loader, os.path.join(path_to_save_dataloader, 'test_dl.pt'))
    gio.save_dataloader_params(test_loader, os.path.join(path_to_save_dataloader, 'test_loader_params.json'))
    return train_loader, val_loader

def seed_worker(worker_id):
    """Seed NumPy and ``random`` from the current PyTorch initial seed.

    Passed to the data loaders as ``worker_init_fn``. ``worker_id`` is
    accepted to match that callback signature but is not used.
    """
    seed32 = torch.initial_seed() % (2 ** 32)  # reduce to 32 bits
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed32)

# Build the loaders: positions are normalized, targets stay in original units.
train_dl, valid_dl = prepare_data(
    data_dict_list=data_dict_list,
    indices_of_datasets_to_use=indices_of_datasets_to_use,
    batch_size=batch_size,
    path_to_save_dataloader=path_to_save_dataloader,
    normalize_y=False,
    normalize_pos=True,
)

Total dataset length: 4887
Training subset length: 3420
Validation subset length: 733
Test subset length: 734
Scaler created for x values at index 0: StandardScaler()
Scaler created for x values at index 1: StandardScaler()
Scaler created for x values at index 2: StandardScaler()
Scaler created for x values at index 3: StandardScaler()
Scaler created for pos features: StandardScaler()


In [6]:
# Start a W&B run; the values below are both logged and read back as `config`.
config = setup_wandb("test", {
    "epochs": 1000,
    "batch_size": 8,
    "lr": 0.001,
    "gradient_accumulation_steps": 5,
    "early_stopping_patience": 20,
    "point_net_conv_local_mlp": [64],
    "point_net_conv_global_mlp": [64,32,128],
    "gat_conv_layer_structure": [64],
    "indices_to_use": [0,1,3,4],
    "in_channels": 6,
    "out_channels": 1,
    "dropout": 0.3
})

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): a dropout rate is passed while use_dropout=False -- confirm
# that dropout is meant to be disabled for this run.
gnn_instance = garch.MyGnn(in_channels=config.in_channels, out_channels=config.out_channels, point_net_conv_layer_structure_local_mlp=config.point_net_conv_local_mlp,
                                   point_net_conv_layer_structure_global_mlp=config.point_net_conv_global_mlp,
                                   gat_conv_layer_structure=config.gat_conv_layer_structure,
                                   dropout=config.dropout, use_dropout=False)
model = gnn_instance.to(device)
loss_fct = torch.nn.MSELoss()

# Reference losses on the training loader to compare the model against.
baseline_loss_mean_target = gio.compute_baseline_of_mean_target(dataset=train_dl, loss_fct=loss_fct)
baseline_loss = gio.compute_baseline_of_no_policies(dataset=train_dl, loss_fct=loss_fct)
print("baseline loss mean " + str(baseline_loss_mean_target))
print("baseline loss no  " +str(baseline_loss) )

# Take the patience from the logged config so the run metadata and the actual
# training behaviour cannot drift apart (it used to be a separate literal 20).
early_stopping = gio.EarlyStopping(patience=config.early_stopping_patience, verbose=True)
best_val_loss, best_epoch = garch.train(model=model, 
            config=config, 
            loss_fct=loss_fct,
            optimizer=torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=1e-4),
            train_dl=train_dl, 
            valid_dl=valid_dl,
            device=device, 
            early_stopping=early_stopping,
            accumulation_steps=config.gradient_accumulation_steps,
            compute_r_squared=False,
            model_save_path=model_save_path)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33menatterer[0m ([33mtum-traffic-engineering[0m). Use [1m`wandb login --relogin`[0m to force relogin


Initializing PointNetConv(local_nn=Sequential(
  (0): Linear(in_features=6, out_features=64, bias=True)
  (1): ReLU()
), global_nn=Sequential(
  (0): Linear(in_features=64, out_features=64, bias=True)
  (1): Linear(in_features=64, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=128, bias=True)
  (4): ReLU()
  (5): Linear(in_features=128, out_features=64, bias=True)
  (6): ReLU()
))
Initializing 0.weight with kaiming_normal
Initializing 0.bias with zeros
Initializing 0.weight with kaiming_normal
Initializing 0.bias with zeros
Initializing 1.weight with kaiming_normal
Initializing 1.bias with zeros
Initializing 3.weight with kaiming_normal
Initializing 3.bias with zeros
Initializing 5.weight with kaiming_normal
Initializing 5.bias with zeros
Initializing Linear(in_features=6, out_features=64, bias=True)
Initializing Linear(in_features=64, out_features=64, bias=True)
Initializing Linear(in_features=64, out_features=32, bias=True)
Initializing Linear(in

Epoch 1/1000: 100%|██████████| 428/428 [00:07<00:00, 54.37it/s]


KeyboardInterrupt: 

: 

In [None]:
# First ten rows of x, y and pos for the first training graph, before normalization.
# NOTE(review): train_set is not defined at notebook level in the cells above
# (it is a local of prepare_data) -- this cell relies on earlier kernel state.
tuple(getattr(train_set[0], field)[:10] for field in ("x", "y", "pos"))

(tensor([[   7.0741,  480.0000,    0.0000,    4.0000],
         [   9.1481,  480.0000, -240.0000,    3.0000],
         [   2.0000,  960.0000, -480.0000,    3.0000],
         [   7.1852,  960.0000,    0.0000,    4.0000],
         [   8.1111,  480.0000,    0.0000,    4.0000],
         [   5.4444,  480.0000,    0.0000,    4.0000],
         [   5.4444,  480.0000,    0.0000,    4.0000],
         [   0.0000,  240.0000,    0.0000,    5.0000],
         [  17.1111,  480.0000, -240.0000,    3.0000],
         [   6.7037,  480.0000,    0.0000,    4.0000]]),
 tensor([[ 0.9259],
         [-0.1481],
         [ 2.0000],
         [ 3.8148],
         [ 2.8889],
         [-1.4444],
         [-1.4444],
         [ 0.0000],
         [-0.1111],
         [-2.7037]]),
 tensor([[ 2.3386, 48.8518],
         [ 2.3387, 48.8524],
         [ 2.3387, 48.8524],
         [ 2.3399, 48.8519],
         [ 2.3395, 48.8517],
         [ 2.3426, 48.8503],
         [ 2.3413, 48.8501],
         [ 2.3421, 48.8509],
         [ 2.3

In [None]:
# First ten rows of x, y and pos for the first graph held by the training loader.
tuple(getattr(train_dl.dataset[0], field)[:10] for field in ("x", "y", "pos"))

(tensor([[ 0.0795, -0.5715,  0.3782,  0.5976],
         [ 0.2217, -0.5715, -0.1847,  0.1290],
         [-0.2682, -0.3785, -0.7476,  0.1290],
         [ 0.0871, -0.3785,  0.3782,  0.5976],
         [ 0.1506, -0.5715,  0.3782,  0.5976],
         [-0.0321, -0.5715,  0.3782,  0.5976],
         [-0.0321, -0.5715,  0.3782,  0.5976],
         [-0.4052, -0.6681,  0.3782,  1.0662],
         [ 0.7673, -0.5715, -0.1847,  0.1290],
         [ 0.0541, -0.5715,  0.3782,  0.5976]]),
 tensor([[ 0.9259],
         [-0.1481],
         [ 2.0000],
         [ 3.8148],
         [ 2.8889],
         [-1.4444],
         [-1.4444],
         [ 0.0000],
         [-0.1111],
         [-2.7037]]),
 tensor([[-0.0564, -0.2530],
         [-0.0531, -0.2299],
         [-0.0531, -0.2299],
         [-0.0282, -0.2489],
         [-0.0372, -0.2592],
         [ 0.0297, -0.3170],
         [ 0.0014, -0.3249],
         [ 0.0193, -0.2924],
         [-0.0594, -0.2548],
         [-0.0636, -0.3185]]))

In [None]:
# Recompute the two baseline losses on the training loader (same calls as in
# the training cell), so they can be inspected without starting a training run.
loss_fct = torch.nn.MSELoss()

baseline_loss_mean_target = gio.compute_baseline_of_mean_target(dataset=train_dl, loss_fct=loss_fct)
baseline_loss = gio.compute_baseline_of_no_policies(dataset=train_dl, loss_fct=loss_fct)
print(f"baseline loss mean {baseline_loss_mean_target!s}")
print(f"baseline loss no  {baseline_loss!s}")

baseline loss mean 3.9451329708099365
baseline loss no  3.9662485122680664


In [None]:
# Report, per split, the smallest and largest target value and the index of
# the graph in which each occurs.
# NOTE(review): train_set / valid_set / test_set are not defined at notebook
# level in the cells above -- this cell relies on earlier kernel state.
for split in [train_set, valid_set, test_set]:
    # Start from +/- infinity instead of the magic values 99 and 0, and reset
    # the indices for every split, so a split whose targets all lie outside
    # [0, 99] can neither leave the indices undefined nor reuse the previous
    # split's indices. Separate names also keep the builtins min/max usable.
    y_min, y_max = float('inf'), float('-inf')
    i_min = i_max = None
    for i in range(len(split)):
        sample_y = split[i].y
        sample_max = sample_y.max()
        sample_min = sample_y.min()
        if sample_max > y_max:
            y_max, i_max = sample_max, i
        if sample_min < y_min:
            y_min, i_min = sample_min, i

    print(split, y_min, y_max, i_max, i_min)

<torch.utils.data.dataset.Subset object at 0x7ff650ea6e30> tensor(-31.2963) tensor(50.9630) 3222 851
<torch.utils.data.dataset.Subset object at 0x7ff650ea6e90> tensor(-27.) tensor(31.5185) 692 505
<torch.utils.data.dataset.Subset object at 0x7ff650ea6ef0> tensor(-27.7037) tensor(29.5185) 536 77


In [None]:
# Check whether two different training graphs have identical targets.
torch.eq(train_set[1].y, train_set[2].y).all()

tensor(False)

: 