This notebook is used for debugging. It should provide the same functionality as run_models.py, but with more verbose output.

**[TODO]** Re-sync this notebook with run_models.py once that script has stabilized.

In [17]:
import os
import sys

import torch

# Add the 'scripts' directory to Python Path
scripts_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from training.help_functions import *

from gnn.help_functions import (
    GNN_Loss,
    compute_baseline_of_mean_target,
    compute_baseline_of_no_policies,
)
from gnn.models.point_net_transf_gat import PointNetTransfGAT

In [18]:
# Repository root: two directory levels above the notebook's working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Data locations -- adjust these if your data lives somewhere else.
base_dir = os.path.join(project_root, "data")
dataset_path = os.path.join(base_dir, "train_data", "node_features")

In [19]:
# Names of the hyperparameters that are forwarded to wandb: the setup cell builds
# the wandb config by looking up each of these names in the dict returned by
# get_parameters, so every entry here must also be a key of that dict.
PARAMETERS = [
    "project_name",
    "predict_mode_stats",
    "in_channels",
    "use_all_features",
    "out_channels",
    "loss_fct",
    "use_weighted_loss",
    "point_net_conv_layer_structure_local_mlp",
    "point_net_conv_layer_structure_global_mlp",
    "gat_conv_layer_structure",
    "use_bootstrapping",
    "num_epochs",
    "batch_size",
    "lr",
    "early_stopping_patience",
    "use_dropout",
    "dropout",
    "gradient_accumulation_steps",
    "use_gradient_clipping",
    "device_nr",
    "unique_model_description",
]


def _parse_layer_structure(value):
    """Convert a layer-size specification into a list of ints.

    Parameters:
    - value: a comma-separated string such as "128,256", an already-parsed
      list/tuple of sizes, or a single number.

    Returns:
    - list of int layer sizes, in the order given.

    Raises:
    - ValueError: if an entry cannot be converted to int, or if no sizes remain.
    """
    if isinstance(value, str):
        # Tolerate surrounding whitespace and empty entries (e.g. a trailing comma).
        entries = [piece.strip() for piece in value.split(",")]
        entries = [piece for piece in entries if piece]
    elif isinstance(value, (list, tuple)):
        entries = list(value)
    else:
        entries = [value]

    sizes = [int(entry) for entry in entries]
    if not sizes:
        raise ValueError(f"Layer structure must contain at least one size, got {value!r}")
    return sizes


def get_parameters(args):
    """
    Build the hyperparameter dictionary for a run from an args object.

    Parameters:
    - args: object exposing the run settings as attributes (an argparse
      namespace or the notebook's Args stand-in). The three layer-structure
      attributes may be comma-separated strings or lists/tuples of sizes.

    Returns:
    - dict with one entry per hyperparameter; layer structures are lists of
      int, batch_size is cast to int and lr to float. "project_name" and
      "unique_model_description" are fixed values set here.

    Raises:
    - ValueError: if a layer structure is empty or contains a non-integer entry.
    """
    params = {
        # KEEP IN MIND: IF WE CHANGE PARAMETERS, WE NEED TO CHANGE THE NAME OF THE RUN IN WANDB (for the config)
        "project_name": "IDP",
        "predict_mode_stats": args.predict_mode_stats,
        "in_channels": args.in_channels,
        "use_all_features": args.use_all_features,
        "out_channels": args.out_channels,
        "loss_fct": args.loss_fct,
        "use_weighted_loss": args.use_weighted_loss,
        "point_net_conv_layer_structure_local_mlp": _parse_layer_structure(
            args.point_net_conv_layer_structure_local_mlp
        ),
        "point_net_conv_layer_structure_global_mlp": _parse_layer_structure(
            args.point_net_conv_layer_structure_global_mlp
        ),
        "gat_conv_layer_structure": _parse_layer_structure(
            args.gat_conv_layer_structure
        ),
        "use_bootstrapping": args.use_bootstrapping,
        "num_epochs": args.num_epochs,
        "batch_size": int(args.batch_size),
        "lr": float(args.lr),
        "early_stopping_patience": args.early_stopping_patience,
        "use_dropout": args.use_dropout,
        "dropout": args.dropout,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "use_gradient_clipping": args.use_gradient_clipping,
        "device_nr": args.device_nr,
    }

    params["unique_model_description"] = "point_net_conv_graph"
    return params

In [20]:
# Read the consecutively numbered batch files (datalist_batch_1.pt, datalist_batch_2.pt, ...)
# from dataset_path and concatenate their contents into one list. Loading stops at
# the first missing file number.

# Optional cap on how many batch files to read; set to a small integer (e.g. 10)
# for a faster debugging run, or leave as None to load every available batch.
max_batches = None

datalist = []
batch_num = 1
while max_batches is None or batch_num <= max_batches:
    batch_file = os.path.join(dataset_path, f"datalist_batch_{batch_num}.pt")
    # Check existence before logging so that a non-existent batch is never reported
    # as being processed.
    if not os.path.exists(batch_file):
        break
    print(f"Processing batch number: {batch_num}")
    # To trace memory while loading, call get_memory_info() here and print the
    # (total, available, used) values it returns.
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
    # only load batch files from a trusted source.
    batch_data = torch.load(batch_file, map_location="cpu")
    if isinstance(batch_data, list):
        datalist.extend(batch_data)
    else:
        # Make skipped files visible instead of dropping them silently.
        print(
            f"Skipping {batch_file}: expected a list, got {type(batch_data).__name__}"
        )
    batch_num += 1
print(f"Loaded {len(datalist)} items into datalist")

if not datalist:
    # Later cells index datalist[0]; fail here with a message that names the cause.
    raise FileNotFoundError(
        f"No usable 'datalist_batch_<n>.pt' files found in {dataset_path}"
    )

Processing batch number: 1
Processing batch number: 2
Processing batch number: 3
Loaded 1000 items into datalist


In [21]:
class Args:
    """Plain attribute container used in place of an argparse namespace.

    Every keyword passed to the constructor becomes an attribute of the instance.
    """

    def __init__(self, **entries):
        vars(self).update(entries)


# Notebook stand-in for the command-line arguments: the same settings, given
# directly as keywords. Layer structures are comma-separated strings.
args = Args(
    in_channels=5,
    use_all_features=False,
    out_channels=1,
    loss_fct="mse",
    use_weighted_loss=False,
    predict_mode_stats=False,
    point_net_conv_layer_structure_local_mlp="256",
    point_net_conv_layer_structure_global_mlp="512",
    gat_conv_layer_structure="128,256,512,256",
    use_bootstrapping=False,
    num_epochs=3000,
    batch_size=8,
    lr=0.001,
    early_stopping_patience=100,
    use_dropout=True,
    dropout=0.3,
    gradient_accumulation_steps=3,
    use_gradient_clipping=True,
    lr_scheduler_warmup_steps=10000,
    device_nr=0,
)

# Seed the random number generators before any data splitting or model creation.
set_random_seeds()

In [22]:
# Resolve the hyperparameters first; this only reads attributes of `args`.
params = get_parameters(args)

# Pick a GPU and make it the visible CUDA device before the torch device is created.
gpus = get_available_gpus()
best_gpu = select_best_gpu(gpus)
set_cuda_visible_device(best_gpu)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Output directory for this run: <base_dir>/<project_name>/<unique_model_description>
project_dir = os.path.join(base_dir, params["project_name"])
unique_run_dir = os.path.join(project_dir, params["unique_model_description"])
os.makedirs(unique_run_dir, exist_ok=True)

model_save_path, path_to_save_dataloader = get_paths(
    base_dir=project_dir,
    unique_model_description=params["unique_model_description"],
    model_save_path="trained_model/model.pth",
)

# Split and normalise the data, and build the train/validation loaders.
train_dl, valid_dl, scalers_train, scalers_validation = prepare_data_with_graph_features(
    datalist=datalist,
    batch_size=params["batch_size"],
    path_to_save_dataloader=path_to_save_dataloader,
    use_all_features=params["use_all_features"],
    use_bootstrapping=params["use_bootstrapping"],
)

# Only the hyperparameters named in PARAMETERS are passed on to wandb.
tracked_params = {name: params[name] for name in PARAMETERS}
config = setup_wandb(tracked_params)

Using GPU 0 with CUDA_VISIBLE_DEVICES=0
Starting prepare_data_with_graph_features with 1000 items
Splitting into subsets...
Total dataset length: 1000
Training subset length: 800
Validation subset length: 150
Test subset length: 50
Split complete. Train: 800, Valid: 150, Test: 50
Normalizing train set...
Fitting and normalizing x features...


Fitting scaler: 100%|██████████| 8/8 [00:01<00:00,  4.82it/s]
Normalizing x features: 100%|██████████| 8/8 [00:00<00:00,  8.58it/s]


x features normalized
Fitting and normalizing pos features...


Fitting scaler: 100%|██████████| 1/1 [00:01<00:00,  1.79s/it]
Normalizing pos features: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]


Pos features normalized
Train set normalized
Normalizing validation set...
Fitting and normalizing x features...


Fitting scaler: 100%|██████████| 2/2 [00:00<00:00,  6.58it/s]
Normalizing x features: 100%|██████████| 2/2 [00:00<00:00, 12.48it/s]


x features normalized
Fitting and normalizing pos features...


Fitting scaler: 100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
Normalizing pos features: 100%|██████████| 1/1 [00:00<00:00,  5.35it/s]


Pos features normalized
Validation set normalized
Normalizing test set...
Fitting and normalizing x features...


Fitting scaler: 100%|██████████| 1/1 [00:00<00:00,  9.12it/s]
Normalizing x features: 100%|██████████| 1/1 [00:00<00:00, 16.37it/s]


x features normalized
Fitting and normalizing pos features...


Fitting scaler: 100%|██████████| 1/1 [00:00<00:00,  9.17it/s]
Normalizing pos features: 100%|██████████| 1/1 [00:00<00:00, 13.58it/s]


Pos features normalized
Test set normalized
Creating train loader...
Train loader created
Creating validation loader...
Validation loader created
Creating test loader...
Test loader created




Dataloaders and scalers saved


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111304037779822, max=1.0)…

In [23]:
def create_model(architecture: str, config: object, device: torch.device):
    """
    Factory function to create the specified model architecture.

    Parameters:
    - architecture: str, the name of the architecture to use
      ("point_net_transf_gat" is the only one currently available)
    - config: object containing model parameters as attributes
    - device: torch device to put the model on

    Returns:
    - Initialized model on the specified device

    Raises:
    - NotImplementedError: for "eign", which is planned but has no model class yet
    - ValueError: for any other unrecognised architecture name
    """
    if architecture == "point_net_transf_gat":
        return PointNetTransfGAT(
            in_channels=config.in_channels,
            out_channels=config.out_channels,
            point_net_conv_layer_structure_local_mlp=config.point_net_conv_layer_structure_local_mlp,
            point_net_conv_layer_structure_global_mlp=config.point_net_conv_layer_structure_global_mlp,
            gat_conv_layer_structure=config.gat_conv_layer_structure,
            use_dropout=config.use_dropout,
            dropout=config.dropout,
            predict_mode_stats=config.predict_mode_stats,
            dtype=torch.float32,
        ).to(device)

    if architecture == "eign":
        # No Eign class is defined or explicitly imported in this notebook, so
        # instantiating it would fail with a NameError. Report the missing
        # implementation explicitly until the model exists; it is expected to take
        # in_channels, out_channels and dtype=torch.float32.
        raise NotImplementedError("Architecture 'eign' is not implemented yet")

    raise ValueError(f"Unknown architecture: {architecture}")

In [24]:
# Build the network; `model` is the same network after the explicit .to(device).
gnn_instance = create_model("point_net_transf_gat", config, device)
model = gnn_instance.to(device)

# Second GNN_Loss argument: size of the first dimension of the first sample's x tensor.
x_rows = datalist[0].x.shape[0]
loss_fct = GNN_Loss(config.loss_fct, x_rows, device, config.use_weighted_loss)

# Both reference losses are computed on the training loader with the training scalers.
baseline_kwargs = dict(
    dataset=train_dl, loss_fct=loss_fct, device=device, scalers=scalers_train
)
baseline_loss_mean_target = compute_baseline_of_mean_target(**baseline_kwargs)
baseline_loss = compute_baseline_of_no_policies(**baseline_kwargs)
print(f"baseline loss mean {baseline_loss_mean_target!s}")
print(f"baseline loss no  {baseline_loss!s}")

early_stopping = EarlyStopping(
    patience=params["early_stopping_patience"],
    verbose=True,
)

baseline loss mean 114.76652526855469
baseline loss no  114.93736267089844


In [25]:
# AdamW over all model parameters, using the configured learning rate and a fixed weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=1e-4)

# Run training; train_model returns the best validation loss and the epoch it occurred in.
best_val_loss, best_epoch = gnn_instance.train_model(
    config=config,
    loss_fct=loss_fct,
    optimizer=optimizer,
    train_dl=train_dl,
    valid_dl=valid_dl,
    device=device,
    early_stopping=early_stopping,
    model_save_path=model_save_path,
    scalers_train=scalers_train,
    scalers_validation=scalers_validation,
)

summary = f"Best model saved to {model_save_path} with validation loss: {best_val_loss} at epoch {best_epoch}"
print(summary)

Epoch 1/3000:   0%|          | 0/100 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 248.00 MiB. GPU 0 has a total capacity of 11.64 GiB of which 31.19 MiB is free. Including non-PyTorch memory, this process has 11.54 GiB memory in use. Of the allocated memory 10.57 GiB is allocated by PyTorch, and 859.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)