In [1]:
# Shell escape: print the GPU, driver and CUDA version visible to this kernel so
# the hardware used for the run is recorded alongside the results.
!nvidia-smi

Tue Aug 15 13:28:41 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.10              Driver Version: 535.86.10    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000000:CA:00.0 Off |                    0 |
| N/A   38C    P0              57W / 300W |      2MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [2]:
import os
from pathlib import Path
from functools import partial

import numpy as np
from typing import Any
from sklearn.metrics import accuracy_score, recall_score

import torch.nn
from pytorch_lightning.core.mixins import HyperparametersMixin
from torch import Tensor as T
from torch import nn
from torch.nn.functional import normalize
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch_geometric.data import Data
from pytorch_lightning import Trainer

from pytorch_lightning.callbacks import Callback, TQDMProgressBar

from gnn_tracking.utils.loading import TrackingDataModule
from gnn_tracking.training.callbacks import PrintValidationMetrics
from gnn_tracking.training.base import TrackingModule
from gnn_tracking.models.graph_construction import GraphConstructionFCNN
from gnn_tracking.utils.lightning import obj_from_or_to_hparams
from gnn_tracking.models.mlp import MLP

In [3]:
class NoiseModel(nn.Module, HyperparametersMixin):
    """Fully connected ReLU network that maps each input row to a single value.

    The stack is ``Linear(in_dim, hidden_dim) -> ReLU``, followed by
    ``depth - 1`` further ``Linear(hidden_dim, hidden_dim) -> ReLU`` pairs and a
    final ``Linear(hidden_dim, 1)`` with no activation.

    Args:
        in_dim: Number of input features per row.
        depth: Number of hidden ``Linear``/``ReLU`` pairs. Values below 1 still
            produce one hidden pair.
        hidden_dim: Width of every hidden layer.
    """

    def __init__(self, in_dim, depth, hidden_dim):
        super().__init__()
        # Called before any other local is created so that only the
        # constructor arguments are captured as hyperparameters.
        self.save_hyperparameters()

        # Feature widths seen by consecutive linear layers: the input width,
        # then one entry per hidden layer.
        widths = [in_dim, hidden_dim] + [hidden_dim] * (depth - 1)

        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        # Output head: one un-activated value per row.
        stack.append(nn.Linear(hidden_dim, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its raw output."""
        return self.layers(x)

In [4]:
class NoiseModule(TrackingModule):
    """Training/validation logic for a per-hit binary classifier.

    A hit is labelled positive when ``particle_id != 0`` and ``pt >= 0.5``;
    all other hits form the negative class. The wrapped model is expected to
    return raw scores (logits), which are passed to ``loss_fct`` unchanged.
    """

    def __init__(
        self,
        *,
        loss_fct: BCEWithLogitsLoss,
        **kwargs,
    ):
        """
        Args:
            loss_fct: Loss called as ``loss_fct(model_output, labels)``.
            **kwargs: Forwarded unchanged to ``TrackingModule.__init__``.
        """
        super().__init__(**kwargs)
        self.loss_fct = loss_fct

    def get_losses(self, out: Any, true_hits: T) -> T:
        """Return ``loss_fct(out, true_hits)``."""
        return self.loss_fct(out, true_hits)

    def sigmoid_and_threshold(self, logits, threshold=0.5):
        """Turn logits into hard 0/1 predictions.

        Args:
            logits: Raw model output.
            threshold: Probability at or above which a prediction is 1.

        Returns:
            Float tensor shaped like ``logits`` holding 0.0 or 1.0.
        """
        probs = torch.sigmoid(logits)
        return (probs >= threshold).float()

    def get_true_hits(self, data: Data):
        """Build the binary target for every hit in ``data``.

        Returns:
            float64 tensor with a trailing singleton dimension; 1.0 where
            ``particle_id != 0`` and ``pt >= 0.5``, else 0.0.
        """
        true_hits = (data.particle_id != 0) & (data.pt >= 0.5)
        # Trailing dim added so the target lines up with the model's
        # single-column output.
        # NOTE(review): labels are float64 while `sigmoid_and_threshold`
        # returns float32 -- confirm double precision is intended here.
        true_hits = true_hits.unsqueeze(dim=-1).type(torch.float64)
        return true_hits

    def _forward_with_loss(self, batch: Data) -> tuple[T, T, T]:
        """Run the model on ``batch.x``; return ``(output, labels, loss)``."""
        out = self(batch.x)
        true_hits = self.get_true_hits(batch)
        return out, true_hits, self.get_losses(out, true_hits)

    def training_step(self, batch: Data, batch_idx: int) -> T | None:
        """Compute the loss for one training batch and log it per step.

        Returns:
            The loss tensor for this batch.
        """
        _, _, loss = self._forward_with_loss(batch)

        self.log_dict(
            {'BCELoss_train': float(loss)},
            prog_bar=True,
            on_step=True,
            batch_size=self.trainer.train_dataloader.batch_size,
        )

        return loss

    def validation_step(self, batch: Data, batch_idx: int) -> None:
        """Log loss, accuracy and recall for one validation batch."""
        out, true_hits, loss = self._forward_with_loss(batch)

        # `log_dict_with_errors` is not defined in this notebook (presumably
        # provided by TrackingModule).
        self.log_dict_with_errors(
            {'BCELoss': float(loss)}, batch_size=self.trainer.val_dataloaders.batch_size
        )

        preds = self.sigmoid_and_threshold(out)
        # scikit-learn expects 1-D label arrays; flatten the trailing
        # singleton dimension explicitly instead of relying on implicit
        # raveling.
        true_labels = true_hits.reshape(-1).cpu().numpy()
        predicted_labels = preds.reshape(-1).cpu().numpy()

        accuracy = accuracy_score(true_labels, predicted_labels)
        # zero_division=0 keeps the same 0.0 result for a batch without any
        # positive label, but without emitting UndefinedMetricWarning.
        recall = recall_score(true_labels, predicted_labels, zero_division=0)

        # NOTE(review): these are per-batch scores that get averaged over the
        # epoch, which is not identical to scores computed on the pooled hits.
        self.log('val_accuracy', accuracy, on_step=False, on_epoch=True)
        self.log('val_recall', recall, on_step=False, on_epoch=True)

In [5]:
# Partition `part_9` of the point-cloud dataset is used for validation.
val_dir = Path("/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/part_9")
# Fail fast, naming the missing path, before any dataset or model is built.
assert val_dir.is_dir(), f"Validation directory not found: {val_dir}"

In [6]:
# Root folder that holds the numbered `part_<n>` partitions.
train_dir = Path("/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/")
# Partitions 1-8 are used for training; a partition that is missing on disk is
# skipped. Entries are kept as plain strings, and the comprehension avoids
# leaving loop variables behind in the notebook namespace.
train_dirs = [
    str(part_dir)
    for part_dir in (train_dir / f"part_{part}" for part in range(1, 9))
    if part_dir.is_dir()
]

In [7]:
# Data module wiring: training reads every discovered partition with a batch
# size of 1, validation reads from `val_dir` with `stop=10`.
dm = TrackingDataModule(
    train={"dirs": train_dirs, "batch_size": 1},
    val={"dirs": [val_dir], "stop": 10},
    # could also configure a 'test' set here
)

In [8]:
# The Trainer calls `setup` automatically; it is invoked by hand here only so
# the datasets can be inspected before training starts.
dm.setup(stage="fit")  # 'fit' combines 'train' and 'val'
# Now the datasets are available; as the cell's last expression this is displayed.
dm.datasets

[32m[13:28:46] INFO: DataLoader will load 7743 graphs (out of 7743 available).[0m
[36m[13:28:46] DEBUG: First graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/part_1/data21000_s0.pt, last graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/part_8/data28999_s0.pt[0m
[32m[13:28:46] INFO: DataLoader will load 10 graphs (out of 1000 available).[0m
[36m[13:28:46] DEBUG: First graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/part_9/data29000_s0.pt, last graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/part_9/data29009_s0.pt[0m


{'train': TrackingDataset(7743), 'val': TrackingDataset(10)}

In [9]:
# Network: 14 input features, 6 hidden layers of width 256, one output value.
model = NoiseModel(in_dim=14, depth=6, hidden_dim=256)

# Lightning wrapper: BCE-with-logits loss, Adam with a learning rate of 1e-3.
# The optimizer is passed as a partial, not as an instantiated optimizer.
nmodel = NoiseModule(
    model=model,
    loss_fct=BCEWithLogitsLoss(),
    optimizer=partial(torch.optim.Adam, lr=1e-3),
)

In [10]:
max_epochs = 10
# Number of batches between progress-bar redraws. With several thousand
# training batches per epoch, redrawing every 5 batches most likely caused the
# "IOPub message rate exceeded" warning in this cell's output, after which the
# notebook server dropped part of the per-epoch validation tables.
progress_refresh_rate = 100

trainer = Trainer(
    max_epochs=max_epochs,
    accelerator="gpu",
    log_every_n_steps=1,
    callbacks=[
        PrintValidationMetrics(),
        TQDMProgressBar(refresh_rate=progress_refresh_rate),
    ],
)

# Train `nmodel` using the train/val loaders provided by the data module.
trainer.fit(model=nmodel, datamodule=dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[32m[13:28:46] INFO: DataLoader will load 7743 graphs (out of 7743 available).[0m
[36m[13:28:46] DEBUG: First graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/part_1/data21000_s0.pt, last graph is /scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v6/part_8/data28999_s0.pt[0m
[32m[13:28:46] INFO: DataLoader will load 10 graphs (out of 1000 available).[0m
[36m[13:28:46] DEBUG: First graph is /scratch/gpfs/IOJALVO/gnn-tracking/ob

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



[3m         Validation epoch=0          [0m
┏━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃[1m [0m[1mMetric       [0m[1m [0m┃[1m [0m[1m  Value[0m[1m [0m┃[1m [0m[1m  Error[0m[1m [0m┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ BCELoss       │ 0.66634 │ 0.00080 │
│ BCELoss_train │ 0.67405 │     nan │
│ val_accuracy  │ 0.58717 │     nan │
│ val_recall    │ 0.00610 │     nan │
└───────────────┴─────────┴─────────┘



Validation: 0it [00:00, ?it/s]

[3m         Validation epoch=1          [0m
┏━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃[1m [0m[1mMetric       [0m[1m [0m┃[1m [0m[1m  Value[0m[1m [0m┃[1m [0m[1m  Error[0m[1m [0m┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ BCELoss       │ 0.64683 │ 0.00093 │
│ BCELoss_train │ 0.65176 │     nan │
│ val_accuracy  │ 0.59152 │     nan │
│ val_recall    │ 0.10513 │     nan │
└───────────────┴─────────┴─────────┘



Validation: 0it [00:00, ?it/s]

[3m         Validation epoch=2          [0m
┏━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃[1m [0m[1mMetric       [0m[1m [0m┃[1m [0m[1m  Value[0m[1m [0m┃[1m [0m[1m  Error[0m[1m [0m┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ BCELoss       │ 0.64320 │ 0.00103 │
│ BCELoss_train │ 0.63469 │     nan │
│ val_accuracy  │ 0.58969 │     nan │
│ val_recall    │ 0.05274 │     nan │
└───────────────┴─────────┴─────────┘



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

[3m         Validation epoch=8          [0m
┏━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃[1m [0m[1mMetric       [0m[1m [0m┃[1m [0m[1m  Value[0m[1m [0m┃[1m [0m[1m  Error[0m[1m [0m┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ BCELoss       │ 0.63778 │ 0.00098 │
│ BCELoss_train │ 0.63003 │     nan │
│ val_accuracy  │ 0.59346 │     nan │
│ val_recall    │ 0.10559 │     nan │
└───────────────┴─────────┴─────────┘



Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


[3m         Validation epoch=9          [0m
┏━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃[1m [0m[1mMetric       [0m[1m [0m┃[1m [0m[1m  Value[0m[1m [0m┃[1m [0m[1m  Error[0m[1m [0m┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ BCELoss       │ 0.63744 │ 0.00099 │
│ BCELoss_train │ 0.63558 │     nan │
│ val_accuracy  │ 0.59334 │     nan │
│ val_recall    │ 0.10821 │     nan │
└───────────────┴─────────┴─────────┘

