# Debugging CUDA issues

In [1]:
from typing import Any

from gnn_tracking.models.mlp import MLP
from gnn_tracking.training.tcn_trainer import TCNTrainer
from gnn_tracking.utils.graph_masks import edge_subgraph
from gnn_tracking_hpo.util.paths import add_scripts_path

from gnn_tracking.utils.loading import TrackingDataset
from torch_geometric.loader import DataLoader
import torch
from tqdm import tqdm
add_scripts_path()
from gnn_tracking_hpo.util.paths import get_config, find_checkpoint
from gnn_tracking_hpo.trainable import legacy_config_compatibility

from gnn_tracking.utils.dictionaries import subdict_with_prefix_stripped
from gnn_tracking_hpo.trainable import TCNTrainable
from torch import nn

In [2]:
from tune_ec import ECTrainable

## Load model

In [3]:
# Identify the edge-classifier (EC) run whose stored config is loaded below.
project = "ec"
# NOTE(review): `hash` shadows the Python builtin of the same name. The name is
# kept because a later cell passes it on as `ec_hash=hash`.
hash = "a94b24d1"
# NOTE(review): presumably -1 selects the most recent checkpoint -- confirm
# against `find_checkpoint`.
epoch=-1
# Build the EC trainable from the (legacy-converted) config of that run.
config = legacy_config_compatibility(get_config(project, hash))
trainable = ECTrainable(config)
# The checkpoint load is disabled, so this cell never calls `load_checkpoint`.
#trainable.load_checkpoint(str(find_checkpoint(project, hash, epoch)), device="cuda")

[36m[19:11:52 gnnt_hpo] DEBUG: Loading config from /home/kl5675/ray_results/ec/ECTrainable_a94b24d1_1_val_batch_size=5,adam_amsgrad=False,adam_beta1=0.9000,adam_beta2=0.9990,adam_eps=0.0000,adam_weight_decay=0_2023-04-12_22-10-45/params.json[0m
[32m[19:11:52 gnnt_hpo] INFO: I'm running on a node with job ID=47034685[0m
[32m[19:11:52 gnnt_hpo] INFO: The ID of my dispatcher is 0[0m
[36m[19:11:52 SlurmControl] DEBUG: Refreshing control config from /home/kl5675/ray_slurm_control.yaml[0m
[36m[19:11:52 gnnt_hpo] DEBUG: Got config
┌──────────────────────────────────────┬──────────────────────────────────────────┐
│ _val_batch_size                      │ 5                                        │
│ adam_amsgrad                         │ False                                    │
│ adam_beta1                           │ 0.9                                      │
│ adam_beta2                           │ 0.999                                    │
│ adam_eps                             │ 

In [4]:
# Keep a notebook-global handle on the EC model; `PretrainedECTrainable.get_model`
# further down reads this global `ec` directly.
ec = trainable.trainer.model

In [5]:
from gnn_tracking.models.resin import ResIN
from torch_geometric.utils import index_to_mask
from torch import nn, Tensor
from torch_geometric.data import Data


class ModularGraphTCN(nn.Module):
    def __init__(
        self,
        *,
        ec: nn.Module,
        hc_in: nn.Module,
        node_indim: int,
        edge_indim: int,
        h_dim=5,
        e_dim=4,
        h_outdim=2,
        hidden_dim=40,
        feed_edge_weights=False,
        ec_threshold=0.5,
        mask_orphan_nodes=False,
        use_ec_embeddings_for_hc=False,
    ):
        """Track condensation network on preconstructed graphs whose first stage
        is an edge classifier that is handed in from outside.

        Debugging state of this notebook copy: most stages are commented out.
        Only the node encoder and the two output heads (``p_beta``,
        ``p_cluster``) are built and used. ``ec`` and ``hc_in`` are stored as
        submodules but never called in ``forward``. ``edge_indim``, ``e_dim``,
        ``feed_edge_weights``, ``ec_threshold``, ``mask_orphan_nodes`` and
        ``use_ec_embeddings_for_hc`` are accepted but currently ignored.

        Args:
            ec: Edge classifier
            hc_in: Interaction network of the track condenser
            node_indim: Dimension of the node features
            edge_indim: Dimension of the edge features
            h_dim: Node dimension inside the condensation interaction networks
            e_dim: Edge dimension inside the condensation interaction networks
            h_outdim: Dimension of the clustering space (output)
            hidden_dim: Width of the hidden layers of all perceptrons
            feed_edge_weights: Whether edge weights are fed to the track condenser
            ec_threshold: Threshold for the edge classification
            mask_orphan_nodes: Mask nodes left without connections after EC
            use_ec_embeddings_for_hc: Feed edge classifier embeddings to the
                track condenser. This currently assumes that h_dim and e_dim
                are also the dimensions used in the EC.
        """
        super().__init__()
        self.relu = nn.ReLU()

        # Submodules are registered in the same order as before:
        # edge classifier first, then the track condenser network
        # (usually made up of interaction networks).
        self.ec = ec
        self.hc_in = hc_in

        # [disabled] input widths of the encoders when EC embeddings / edge
        # weights are fed to the track condenser:
        # edge_enc_indim = edge_indim
        # if use_ec_embeddings_for_hc:
        #     node_enc_indim += h_dim
        #     edge_enc_indim += e_dim
        # edge_enc_indim += int(feed_edge_weights)

        # Node encoder of the track condenser; with the block above disabled
        # its input width is simply the raw node feature dimension.
        self.hc_node_encoder = MLP(
            node_indim,
            h_dim,
            hidden_dim=hidden_dim,
            L=2,
            bias=False,
        )

        # [disabled] edge encoder of the track condenser:
        # self.hc_edge_encoder = MLP(
        #     edge_enc_indim,
        #     e_dim,
        #     hidden_dim=hidden_dim,
        #     L=2,
        #     bias=False,
        # )

        # Output heads: condensation strength (beta) and cluster coordinates
        self.p_beta = MLP(h_dim, 1, hidden_dim, L=3)
        self.p_cluster = MLP(h_dim, h_outdim, hidden_dim, L=3)

        # [disabled] head for the track parameters and the remaining settings:
        # self.p_track_param = IN(
        #     node_indim=h_dim,
        #     edge_indim=e_dim + hc_in.length_concatenated_edge_attrs,
        #     node_outdim=1,
        #     edge_outdim=1,
        #     node_hidden_dim=hidden_dim,
        #     edge_hidden_dim=hidden_dim,
        # )
        # self._feed_edge_weights = feed_edge_weights
        # self.threshold = ec_threshold
        # self._mask_orphan_nodes = mask_orphan_nodes
        # self._use_ec_embeddings_for_hc = use_ec_embeddings_for_hc

    def forward(
        self,
        data: Data,
    ) -> dict[str, Tensor]:
        """Encode the node features and evaluate both output heads.

        Returns a dict with the cluster coordinates under ``"H"`` and the
        beta prediction under ``"B"``. The edge-classifier outputs (``"W"``,
        ``"ec_hit_mask"``, ``"ec_edge_mask"``) are not produced while the
        corresponding stages are commented out.
        """
        # [disabled] run the EC and cut the graph. All EC output is assigned to
        # the data object so that the cuts are applied automatically by
        # `data.subgraph(...)` etc.
        # ec_result = self.ec(data)
        # data.edge_weights = ec_result["W"]
        # data.ec_node_embedding = ec_result.get("node_embedding", None)
        # data.ec_edge_embedding = ec_result.get("edge_embedding", None)
        # edge_weights_unmasked = data.edge_weights.clone().detach()
        # edge_mask = (data.edge_weights > self.threshold).squeeze()
        # data = edge_subgraph(data, edge_mask)

        # [disabled] orphan node masking
        # if self._mask_orphan_nodes:
        #     connected_nodes = data.edge_index.flatten().unique()
        #     hit_mask = index_to_mask(connected_nodes, size=data.num_nodes)
        #     data = data.subgraph(connected_nodes)
        # else:
        #     hit_mask = torch.ones(
        #         data.num_nodes, dtype=torch.bool, device=data.x.device
        #     )

        # Inputs of the track condenser encoders. The edge list is still
        # collected although nothing consumes it while the edge encoder is off.
        _edge_attrs = [data.edge_attr]
        _xs = [data.x]
        # [disabled] optional extra inputs
        # if self._use_ec_embeddings_for_hc:
        #     assert data.ec_edge_embedding is not None
        #     assert data.ec_node_embedding is not None
        #     _edge_attrs.append(data.ec_edge_embedding)
        #     _xs.append(data.ec_node_embedding)
        # if self._feed_edge_weights:
        #     _edge_attrs.append(data.edge_weights)

        encoded_nodes = self.hc_node_encoder(_xs[0])
        h_hc = self.relu(encoded_nodes)
        # edge_attr_hc = self.relu(self.hc_edge_encoder(_edge_attrs[0]))

        # [disabled] message passing of the track condenser
        # h_hc, _, _ = self.hc_in(h_hc, data.edge_index, edge_attr_hc)

        beta = self.p_beta(h_hc)
        # [disabled] protection against nans
        # beta = beta + torch.ones_like(beta) * 10e-9

        h = self.p_cluster(h_hc)
        # [disabled] track parameter head
        # track_params, _ = self.p_track_param(
        #     h_hc, data.edge_index, torch.cat(edge_attrs_hc, dim=1)
        # )

        out = {
            # "W": edge_weights_unmasked,
            "H": h,
            "B": beta,
            # "ec_hit_mask": hit_mask,
            # "ec_edge_mask": edge_mask,
        }
        return out



class PreTrainedECGraphTCN(nn.Module):
    def __init__(
        self,
        ec,
        *,
        node_indim: int,
        edge_indim: int,
        h_dim=5,
        e_dim=4,
        h_outdim=2,
        hidden_dim=40,
        L_hc=3,
        alpha_hc: float = 0.5,
        **kwargs,
    ):
        """GraphTCN for the use with a pre-trained edge classifier.

        Builds a ``ResIN`` track condenser and wraps it, together with ``ec``,
        in a ``ModularGraphTCN``.

        Args:
            ec: Pre-trained edge classifier
            node_indim: Node feature dim. Determined by input data.
            edge_indim: Edge feature dim. Determined by input data.
            h_dim: node dimension after encoding
            e_dim: edge dimension after encoding
            h_outdim: output dimension in clustering space
            hidden_dim: dimension of hidden layers in all MLPs used in the interaction
                networks
            L_hc: message passing depth for track condenser
            alpha_hc: strength of residual connection for multi-layer interaction
                networks
            **kwargs: Forwarded unchanged to ``ModularGraphTCN``
        """
        super().__init__()
        hc_in = ResIN(
            node_dim=h_dim,
            edge_dim=e_dim,
            object_hidden_dim=hidden_dim,
            relational_hidden_dim=hidden_dim,
            alpha=alpha_hc,
            n_layers=L_hc,
        )
        self._gtcn = ModularGraphTCN(
            ec=ec,
            hc_in=hc_in,
            node_indim=node_indim,
            edge_indim=edge_indim,
            h_dim=h_dim,
            e_dim=e_dim,
            h_outdim=h_outdim,
            hidden_dim=hidden_dim,
            **kwargs,
        )

    def forward(
        self,
        data: Data,
    ) -> dict[str, Tensor]:
        """Delegate to the wrapped ``ModularGraphTCN`` and return its output dict."""
        # Call the module instance rather than `.forward(...)`: calling
        # `forward` directly bypasses `nn.Module.__call__`, so any forward
        # hooks registered on the wrapped module would be skipped silently.
        return self._gtcn(data=data)


In [6]:



# Quick-and-dirty variant: the edge classifier is not built from the config but
# taken from the notebook-global `ec`.
class PretrainedECTrainable(TCNTrainable):
    """Trainable whose model wraps the globally defined edge classifier `ec`."""

    def get_loss_functions(self) -> dict[str, Any]:
        """Return the active loss functions; only the potential loss is enabled."""
        loss_functions = {"potential": self.get_potential_loss_function()}
        # Switched off for this debugging session:
        # loss_functions["background"] = self.get_background_loss_function()
        # loss_functions["edge"] = self.get_edge_loss_function()
        return loss_functions

    def get_trainer(self) -> TCNTrainer:
        """Build the parent's trainer and copy the EC threshold from the config onto it."""
        tcn_trainer = super().get_trainer()
        tcn_trainer.ec_threshold = self.tc["m_ec_threshold"]
        return tcn_trainer

    def get_model(self) -> nn.Module:
        """Wrap the global `ec` in a `PreTrainedECGraphTCN`.

        All keyword arguments come from `subdict_with_prefix_stripped(self.tc, "m_")`.
        Node and edge input dimensions are hard-coded to 7 and 4.
        """
        model_kwargs = subdict_with_prefix_stripped(self.tc, "m_")
        return PreTrainedECGraphTCN(
            ec,
            node_indim=7,
            edge_indim=4,
            **model_kwargs,
        )
        # Alternative used while debugging: hand back the bare edge classifier.
        # return ec

In [7]:

import optuna
from gnn_tracking_hpo.config import get_metadata, auto_suggest_if_not_fixed
from gnn_tracking_hpo.trainable import suggest_default_values


def suggest_config(
    trial: optuna.Trial,
    *,
    sector=None,
    ec_project: str,
    ec_hash: str,
    ec_epoch: int = -1,
    test=False,
    fixed=None,
) -> dict[str, Any]:
    """Assemble the configuration dict for one trial.

    Starts from ``get_metadata(test=test)``, overlays ``fixed`` (if given) and
    then hands every key below to ``auto_suggest_if_not_fixed`` together with
    the config and the trial. Finally ``suggest_default_values`` is applied and
    both the config and ``trial.params`` are printed.

    Args:
        trial: Optuna trial passed on to the suggestion helpers
        sector: Value registered under the ``"sector"`` key
        ec_project: Value registered under ``"ec_project"``
        ec_hash: Value registered under ``"ec_hash"``
        ec_epoch: Value registered under ``"ec_epoch"``
        test: Forwarded to ``get_metadata``
        fixed: Optional mapping merged into the config before anything is
            suggested

    Returns:
        The populated config dict.
    """
    config = get_metadata(test=test)
    if fixed:
        config.update(fixed)

    def set_param(key, *args, **kwargs):
        # Thin wrapper so that `config` and `trial` need not be repeated.
        auto_suggest_if_not_fixed(key, config, trial, *args, **kwargs)

    # Definitely Fixed hyperparameters
    # --------------------------------

    set_param("n_graphs_train", 247776)
    # Written straight into the config (not routed through `set_param`).
    train_dirs = []
    for i in range(1, 9):
        train_dirs.append(
            f"/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_{i}"
        )
    config["train_data_dir"] = train_dirs
    set_param(
        "val_data_dir",
        "/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_9",
    )

    single_valued = (
        ("sector", sector),
        ("m_mask_orphan_nodes", True),
        # NOTE(review): unlike its neighbour this key has no "m_" prefix, so it
        # is not part of the "m_"-prefixed subset that get_model() extracts
        # for the model -- confirm whether "m_use_ec_embeddings_for_hc" was
        # intended.
        ("use_ec_embeddings_for_hc", True),
        ("ec_project", ec_project),
        ("ec_hash", ec_hash),
        ("ec_epoch", ec_epoch),
        ("batch_size", 5),
        # Keep one fixed because of normalization invariance
        ("lw_potential_attractive", 1.0),
        ("m_hidden_dim", 120),
        ("m_h_dim", 120),
        ("m_e_dim", 120),
        # Most of the following parameters are fixed based on af5b5461
        ("attr_pt_thld", 0.6),
        ("q_min", 0.34),
        ("sb", 0.09),
        ("m_alpha_hc", 0.63),
        ("lw_background", 0.0041),
        ("lw_potential_repulsive", 0.16),
        ("repulsive_radius_threshold", 3.7),
        ("m_h_outdim", 7),
    )
    for key, value in single_valued:
        set_param(key, value)

    # Tuned hyperparameters
    # ---------------------

    tuned = (
        ("m_ec_threshold", 0.1, 0.5),
        ("lr", 0.0001, 0.0010),
        ("m_L_hc", 3, 5),
    )
    for key, *search_args in tuned:
        set_param(key, *search_args)

    suggest_default_values(config, trial, ec="fixed")
    print(config)
    print(trial.params)
    return config

In [8]:
# Create a study only to obtain a trial object that `suggest_config` can use.
study = optuna.create_study(direction="maximize")
trial = study.ask()
_config = suggest_config(trial=trial, ec_project=project, ec_hash=hash, ec_epoch=epoch, test=True)
# Merge the trial's parameters into the config; on key clashes `trial.params` wins.
config = _config | trial.params
# Override the batch size that `suggest_config` registered (5) for this debugging run.
config["batch_size"] = 1
# NOTE: this rebinds `config` and `trainable`, which earlier held the EC config
# and the `ECTrainable`; the EC model itself stays reachable via the global `ec`.
trainable = PretrainedECTrainable(config)
trainer = trainable.trainer

[32m[I 2023-04-18 19:11:57,380][0m A new study created in memory with name: no-name-341f070d-4ed9-402d-a5a3-11e1255aadb2[0m
[32m[19:11:57 gnnt_hpo] INFO: I'm running on a node with job ID=47034685[0m
[32m[19:11:57 gnnt_hpo] INFO: The ID of my dispatcher is 0[0m
[36m[19:11:57 SlurmControl] DEBUG: Refreshing control config from /home/kl5675/ray_slurm_control.yaml[0m
[36m[19:11:57 gnnt_hpo] DEBUG: Got config
┌───────────────────────────────┬──────────────────────────────────────────┐
│ _val_batch_size               │ 1                                        │
│ adam_amsgrad                  │ False                                    │
│ adam_beta1                    │ 0.9                                      │
│ adam_beta2                    │ 0.999                                    │
│ adam_eps                      │ 1e-08                                    │
│ adam_weight_decay             │ 0.0                                      │
│ attr_pt_thld                  │ 0.6     

{'test': True, 'gnn_tracking_hash': 'd765620dde9582e8f229334f60f58bf182ff0e10', 'gnn_tracking_experiments_hash': '41dda2253ce999deb42d519115dd2e7bd0f3b83c', 'n_graphs_train': 1, 'train_data_dir': ['/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_1', '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_2', '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_3', '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_4', '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_5', '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_6', '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_7', '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_8'], 'val_data_dir': '/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_9', 'sector': None, 'm_mask_orphan_nodes': True, 'use_ec_embeddings_for_hc': True, 'ec_project': 'ec', 'ec_hash': '

[32m[19:12:00] INFO: DataLoader will load 1 graphs (out of 247776 available).[0m
[36m[19:12:00] DEBUG: First graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_1/data21000_s0.pt, last graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_1/data21000_s0.pt[0m
[32m[19:12:00] INFO: DataLoader will load 1 graphs (out of 32000 available).[0m
[36m[19:12:00] DEBUG: First graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_9/data29000_s0.pt, last graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/graphs_v1/part_9/data29000_s0.pt[0m
[36m[19:12:00] DEBUG: Parameters for data loader 'train': {'batch_size': 1, 'num_workers': 1, 'sampler': <torch.utils.data.sampler.RandomSampler object at 0x150e837218d0>, 'pin_memory': True}[0m
[36m[19:12:00] DEBUG: Parameters for data loader 'val': {'batch_size': 1, 'num_workers': 1, 'sampler': None, 'pin_memory': True}[0m
[36m[19:12:00] DEBUG: Parameters for 

In [9]:
import os
# NOTE(review): the variable is set to an empty string, and only after torch has
# already been imported in this kernel -- confirm that it is still picked up at
# this point and that an empty value has the intended effect.
os.environ["PYTORCH_NVFUSER_DISABLE"] = ""
# Run `train_step` on the stripped-down model to try to reproduce the issue
# (the recorded run below was stopped with a KeyboardInterrupt).
trainer.train_step()


  storage = elem.storage()._new_shared(numel)
[32m[19:12:02 TCNTrainer] INFO: Epoch  0 (    0/12000): potential_attractive_weighted=   0.00177, potential_repulsive_weighted=  34.21142[0m
[32m[19:12:02 TCNTrainer] INFO: Epoch  0 (   10/12000): potential_attractive_weighted=   0.02980, potential_repulsive_weighted=  29.64644[0m
[32m[19:12:03 TCNTrainer] INFO: Epoch  0 (   20/12000): potential_attractive_weighted=   0.35067, potential_repulsive_weighted=  20.73614[0m
[32m[19:12:03 TCNTrainer] INFO: Epoch  0 (   30/12000): potential_attractive_weighted=   1.69778, potential_repulsive_weighted=  11.59402[0m
[32m[19:12:03 TCNTrainer] INFO: Epoch  0 (   40/12000): potential_attractive_weighted=   3.89390, potential_repulsive_weighted=   8.58915[0m
[32m[19:12:03 TCNTrainer] INFO: Epoch  0 (   50/12000): potential_attractive_weighted=   4.05928, potential_repulsive_weighted=   8.62240[0m
[32m[19:12:03 TCNTrainer] INFO: Epoch  0 (   60/12000): potential_attractive_weighted=   2.9795

KeyboardInterrupt: 