In [1]:
import math
import numpy as np
import wandb
import random
import torch
import torch_geometric
from torch_geometric.data import Data
import sys
import os
from tqdm import tqdm
import signal
import joblib
import argparse
import json
import os
import subprocess
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.preprocessing import StandardScaler

import help_functions as hf

# Add the 'scripts' directory to the Python path
scripts_path = os.path.abspath(os.path.join('..'))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
    
import gnn_io as gio
import gnn_architecture as garch
import psutil

def get_memory_info():
        memory_info = psutil.virtual_memory()
        total_memory = memory_info.total / (1024 ** 3)  # Convert bytes to GB
        available_memory = memory_info.available / (1024 ** 3)  # Convert bytes to GB
        used_memory = memory_info.used / (1024 ** 3)  # Convert bytes to GB
        return total_memory, available_memory, used_memory

total_memory, available_memory, used_memory = get_memory_info()
print(f"Total Memory: {total_memory:.2f} GB")
print(f"Available Memory: {available_memory:.2f} GB")
print(f"Used Memory: {used_memory:.2f} GB")


dataset_path = '../../data/train_data/sim_output_1pm_22_10_2024/'

PARAMETERS = [
    "project_name",
    "predict_mode_stats",
    "in_channels",
    "out_channels",
    "point_net_conv_layer_structure_local_mlp",
    "point_net_conv_layer_structure_global_mlp",
    "gat_conv_layer_structure",
    "num_epochs",
    "batch_size",
    "lr",
    "early_stopping_patience",
    "use_dropout",
    "dropout",
    "gradient_accumulation_steps",
    "use_gradient_clipping",
    "lr_scheduler_warmup_steps",
    "device_nr",
    "unique_model_description"
]

def get_parameters(args):
    params = {
        # KEEP IN MIND: IF WE CHANGE PARAMETERS, WE NEED TO CHANGE THE NAME OF THE RUN IN WANDB (for the config)
        "project_name": "runs_21_10_2024",
        "predict_mode_stats": args.predict_mode_stats,
        "in_channels": args.in_channels,
        "out_channels": args.out_channels,
        "point_net_conv_layer_structure_local_mlp": [int(x) for x in args.point_net_conv_layer_structure_local_mlp.split(',')],
        "point_net_conv_layer_structure_global_mlp": [int(x) for x in args.point_net_conv_layer_structure_global_mlp.split(',')],
        "gat_conv_layer_structure": [int(x) for x in args.gat_conv_layer_structure.split(',')],
        "num_epochs": args.num_epochs,
        "batch_size": int(args.batch_size),
        "lr": float(args.lr),
        "early_stopping_patience": args.early_stopping_patience,
        "use_dropout": args.use_dropout,
        "dropout": args.dropout,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "use_gradient_clipping": args.use_gradient_clipping,
        "lr_scheduler_warmup_steps": args.lr_scheduler_warmup_steps,
        "device_nr": args.device_nr
    }
    params["unique_model_description"] = (
        f"pnc_local_{gio.int_list_to_string(lst=params['point_net_conv_layer_structure_local_mlp'], delimiter='_')}_"
        f"pnc_global_{gio.int_list_to_string(lst=params['point_net_conv_layer_structure_global_mlp'], delimiter='_')}_"
        f"gat_conv_{gio.int_list_to_string(lst=params['gat_conv_layer_structure'], delimiter='_')}_"
        f"use_dropout_{params['use_dropout']}_"
        f"dropout_{params['dropout']}_"
        f"predict_mode_stats_{params['predict_mode_stats']}"
    )
    return params

Total Memory: 125.49 GB
Available Memory: 54.87 GB
Used Memory: 69.39 GB


In [2]:
datalist = []
batch_num = 1
while True and batch_num < 2:
    print(f"Processing batch number: {batch_num}")
    # total_memory, available_memory, used_memory = get_memory_info()
    # print(f"Total Memory: {total_memory:.2f} GB")
    # print(f"Available Memory: {available_memory:.2f} GB")
    # print(f"Used Memory: {used_memory:.2f} GB")
    batch_file = os.path.join(dataset_path, f'datalist_batch_{batch_num}.pt')
    if not os.path.exists(batch_file):
        break
    batch_data = torch.load(batch_file, map_location='cpu')
    if isinstance(batch_data, list):
        datalist.extend(batch_data)
    batch_num += 1
print(f"Loaded {len(datalist)} items into datalist")

Processing batch number: 1
Loaded 50 items into datalist


In [3]:
# Replace the argparse section with this:
args = {
    "in_channels": 13,
    "out_channels": 1,
    "predict_mode_stats": False,
    "point_net_conv_layer_structure_local_mlp": "64",
    "point_net_conv_layer_structure_global_mlp": "256",
    "gat_conv_layer_structure": "128",
    "num_epochs": 3000,
    "batch_size": 8,
    "lr": 0.001,
    "early_stopping_patience": 100,
    "use_dropout": False,
    "dropout": 0.3,
    "gradient_accumulation_steps": 3,
    "use_gradient_clipping": True,
    "lr_scheduler_warmup_steps": 10000,
    "device_nr": 0
}

# Convert the dictionary to an object with attributes
class Args:
    def __init__(self, **entries):
        self.__dict__.update(entries)

args = Args(**args)

hf.set_random_seeds()

In [4]:
gpus = hf.get_available_gpus()
best_gpu = hf.select_best_gpu(gpus)
hf.set_cuda_visible_device(best_gpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
params = get_parameters(args)

# Create base directory for the run
base_dir = '../../data/' + params['project_name'] + '/'
unique_run_dir = os.path.join(base_dir, params['unique_model_description'])
os.makedirs(unique_run_dir, exist_ok=True)

model_save_path, path_to_save_dataloader = hf.get_paths(base_dir=base_dir, unique_model_description= params['unique_model_description'], model_save_path= 'trained_model/model.pth')


Using GPU 1 with CUDA_VISIBLE_DEVICES=1


In [5]:
batch_size= params['batch_size']
path_to_save_dataloader= path_to_save_dataloader

print("Splitting into subsets...")
train_set, valid_set, test_set = gio.split_into_subsets(dataset=datalist, train_ratio=0.8, val_ratio=0.15, test_ratio=0.05)
print(f"Split complete. Train: {len(train_set)}, Valid: {len(valid_set)}, Test: {len(test_set)}")



print(len(test_set))
print("Normalizing test set...")
first_element = test_set
print(first_element)
test_set_normalized, scalers_test = hf.normalize_dataset(dataset_input=test_set, directory_path=path_to_save_dataloader + "test_")
first_element_normalized = test_set_normalized[0]
print(first_element_normalized)
# print("Test set normalized")

# print("Creating test loader...")
test_loader = DataLoader(dataset=test_set_normalized, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=gio.collate_fn, worker_init_fn=hf.seed_worker)
# print("Test loader created")

# joblib.dump(scalers_train['x_scaler'], os.path.join(path_to_save_dataloader, 'train_x_scaler.pkl'))
# joblib.dump(scalers_train['pos_scaler'], os.path.join(path_to_save_dataloader, 'train_pos_scaler.pkl'))
# joblib.dump(scalers_train['modestats_scaler'], os.path.join(path_to_save_dataloader, 'train_mode_stats_scaler.pkl'))

# joblib.dump(scalers_validation['x_scaler'], os.path.join(path_to_save_dataloader, 'validation_x_scaler.pkl'))
# joblib.dump(scalers_validation['pos_scaler'], os.path.join(path_to_save_dataloader, 'validation_pos_scaler.pkl'))
# joblib.dump(scalers_validation['modestats_scaler'], os.path.join(path_to_save_dataloader, 'validation_mode_stats_scaler.pkl'))

joblib.dump(scalers_test['x_scaler'], os.path.join(path_to_save_dataloader, 'test_x_scaler.pkl'))
# joblib.dump(scalers_test['pos_scaler'], os.path.join(path_to_save_dataloader, 'test_pos_scaler.pkl'))
# joblib.dump(scalers_test['modestats_scaler'], os.path.join(path_to_save_dataloader, 'test_mode_stats_scaler.pkl'))  

# gio.save_dataloader(test_loader, path_to_save_dataloader + 'test_dl.pt')
# gio.save_dataloader_params(test_loader, path_to_save_dataloader + 'test_loader_params.json')
# print("Dataloaders and scalers saved")
        
#         return train_loader, val_loader
    
#     except Exception as e:
#         print(f"Error in prepare_data_with_graph_features: {str(e)}")
#         import traceback
#         traceback.print_exc()
#         raise

# train_dl, valid_dl = prepare_data_with_graph_features(datalist=datalist, batch_size= params['batch_size'], path_to_save_dataloader= path_to_save_dataloader)

Splitting into subsets...
Total dataset length: 50
Training subset length: 40
Validation subset length: 7
Test subset length: 3
Split complete. Train: 40, Valid: 7, Test: 3
3
Normalizing test set...
<torch.utils.data.dataset.Subset object at 0x7efe4454c790>
Fitting and normalizing x features...


Fitting scaler: 100%|██████████| 1/1 [00:00<00:00, 40.90it/s]
Normalizing x features: 100%|██████████| 1/1 [00:00<00:00, 71.85it/s]


x features normalized
Fitting and normalizing pos features...


Fitting scaler: 100%|██████████| 1/1 [00:00<00:00, 98.96it/s]
Normalizing pos features: 100%|██████████| 1/1 [00:00<00:00, 81.84it/s]

Pos features normalized
Data(edge_index=[2, 59135], num_nodes=31140, x=[31140, 13], pos=[31140, 3, 2], y=[31140, 1], mode_stats=[6, 2])





['../../data/runs_21_10_2024/pnc_local_[64]_pnc_global_[256]_gat_conv_[128]_use_dropout_False_dropout_0.3_predict_mode_stats_False/data_created_during_training/test_x_scaler.pkl']

In [12]:
test_set[0].pos.dtype

torch.float32

In [7]:
test_set_normalized[0].x[20]

tensor([-0.3321, -0.4309, -0.3926,  0.2395,  0.0553,  1.0689,  0.7439,  0.3491,
         1.0819,  1.1068, -0.1794, -0.2104, -0.2053], dtype=torch.float64)

In [8]:
scaler_x = joblib.load(os.path.join(path_to_save_dataloader, 'test_x_scaler.pkl'))

In [9]:
scaler_x.inverse_transform(test_set_normalized[0].x)[0]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        4.        , 20.03709686,  0.        ,  0.        ,  1.        ,
        1.        ,  1.        ,  0.        ])

In [11]:

# config = hf.setup_wandb(params['project_name'], {param: params[param] for param in PARAMETERS})

# gnn_instance = garch.MyGnn(in_channels=config.in_channels, 
#                 out_channels=config.out_channels, 
#                 point_net_conv_layer_structure_local_mlp=config.point_net_conv_layer_structure_local_mlp,
#                 point_net_conv_layer_structure_global_mlp=config.point_net_conv_layer_structure_global_mlp,
#                 gat_conv_layer_structure=config.gat_conv_layer_structure,
#                 use_dropout=config.use_dropout, 
#                 dropout=config.dropout, 
#                 predict_mode_stats=config.predict_mode_stats)

# model = gnn_instance.to(device)
# loss_fct = torch.nn.MSELoss()

# baseline_loss_mean_target = gio.compute_baseline_of_mean_target(dataset=train_dl, loss_fct=loss_fct)
# baseline_loss = gio.compute_baseline_of_no_policies(dataset=train_dl, loss_fct=loss_fct)
# print("baseline loss mean " + str(baseline_loss_mean_target))
# print("baseline loss no  " + str(baseline_loss) )

# early_stopping = gio.EarlyStopping(patience=params['early_stopping_patience'], verbose=True)
# best_val_loss, best_epoch = garch.train(model=model, 
#             config=config, 
#             loss_fct=loss_fct,
#             optimizer=torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=1e-4),
#             train_dl=train_dl, 
#             valid_dl=valid_dl,
#             device=device, 
#             early_stopping=early_stopping,
#             model_save_path=model_save_path)
# print(f'Best model saved to {model_save_path} with validation loss: {best_val_loss} at epoch {best_epoch}')  