In [1]:
import os
from datetime import datetime
import time
from IPython.display import clear_output

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch_geometric.loader.dataloader import DataLoader
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

from torch_geometric.data import Data
import torch_geometric
import networkx as nx


from sklearn.model_selection import train_test_split

from ClusterDataset import ClusterDataset as GNNDataset
from ClusterDatasetTransformer import ClusterDataset
from train_transformer import *
from data_statistics import *
from GNN_TrackLinkingNet import EarlyStopping

from IPython.display import display

from Transformer import Transformer
from lang import Lang
from LossFunction import Loss

2025-05-30 14:48:10.306183: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-30 14:48:10.327116: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748609290.351747    2033 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748609290.359268    2033 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-30 14:48:10.385905: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# CUDA Setup
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [3]:
input_length = 60
max_seq_length = 60
batch_size = 64
converter = Lang(0)

In [4]:
# Load the dataset
model_folder = "/eos/user/c/czeh/"
hist_folder = "/eos/user/c/czeh/histo_10pion0PU/"
data_folder_training = "/eos/user/c/czeh/graph_data/processed"
store_folder_training = "/eos/user/c/czeh/graph_data_trans"
data_folder_test = "/eos/user/c/czeh/graph_data_test/processed"
store_folder_test = "/eos/user/c/czeh/graph_data_trans_test"

dataset_training = ClusterDataset(store_folder_training, data_folder_training, input_length=input_length, output_group=False)
dataset_test = ClusterDataset(store_folder_test, data_folder_test, input_length=input_length, output_group=False)

Done
Done


In [5]:
train_dl = DataLoader(dataset_training, shuffle=True, batch_size=batch_size)
test_dl = DataLoader(dataset_test, shuffle=True, batch_size=batch_size)

In [6]:
epochs = 100
d_model = 128
num_heads = 4
num_layers = 6
d_ff = 256
dropout = 0.2
padding = converter.word2index["<PAD>"]
feature_num = len(dataset_test.model_feature_keys)
max_nodes = max(dataset_test.max_nodes, dataset_test.max_nodes)
vocab_size = max_nodes + 4

# Model, loss, and optimizer
model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, feature_num, max_nodes, max_seq_length, dropout).to(device)
criterion = Loss(converter)

In [None]:
# Optionally introduce weight decay
# optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)

# Drop Step Size over time
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
early_stopping = EarlyStopping(patience=5, delta=-0.1)

In [None]:
#Load Weights if needed
# weights = torch.load("/eos/user/c/czeh/tranformer_2.pt", weights_only=True)
# model.load_state_dict(weights["model_state_dict"])
# optimizer.load_state_dict(weights["optimizer_state_dict"])
# start_epoch = weights["epoch"]

In [None]:
train_loss_hist = []
val_loss_hist = []

In [None]:
# https://stats.stackexchange.com/questions/352036/what-should-i-do-when-my-neural-network-doesnt-learn
# Optionally introduce gradient clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.25)

fig_loss, ax_loss = plt.subplots(1, 1)
fig_loss.set_figwidth(6)
fig_loss.set_figheight(3)

display_loss = display(1, display_id=True)

optimizer.zero_grad()

# Training loop
for epoch in range(1, 101):
    print(f'Epoch: {epoch}')
    
    loss = train(model, optimizer, train_dl, epoch, criterion, vocab_size, device=device)
    print(f"Training loss: {loss}")
    train_loss_hist.append(loss)
    
    val_loss = test(model, test_dl, epoch, criterion, vocab_size, device=device)
    val_loss_hist.append(val_loss)
    print(f"Validation loss: {val_loss}")
    
    ax_loss.clear()
    plot_loss(train_loss_hist, val_loss_hist, ax=ax_loss, n=1)
    display_loss.update(fig_loss)
    time.sleep(1)
    
    scheduler.step()
    print(f"Epoch {epoch+1}, LR: {scheduler.get_last_lr()[0]}")
    
    early_stopping(model, val_loss)
    if early_stopping.early_stop:
        print(f"Early stopping after {epoch+1} epochs")
        early_stopping.load_best_model(model)
        break

In [None]:
fig, ax = plt.subplots(1, 1)
fig.set_figheight(6)
fig.set_figwidth(10)
epochs = len(train_loss_hist)
ax.plot(range(1, epochs+1), moving_average(train_loss_hist, 8), label='train', linewidth=2)
ax.plot(range(1, epochs+1), moving_average(val_loss_hist, 8), label='val', linewidth=2)
ax.set_ylabel("Loss", fontsize=14)
ax.set_xlabel("Epochs", fontsize=14)
ax.set_title("Training and Validation Loss", fontsize=14)
ax.legend()

In [None]:
date = f"{datetime.now():%Y-%m-%d}"
save_model(model, epoch, optimizer, train_loss_hist, val_loss_hist, model_folder, f"tranformer_date_{date}.pt")

## Test Full Event

In [7]:
from EventGrouping import EventGrouping

In [8]:
model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, feature_num, max_nodes, max_seq_length, dropout).to(device)
weights = torch.load("/eos/user/c/czeh/tranformer_date_2025-05-30.pt", weights_only=True)
model.load_state_dict(weights["model_state_dict"])

<All keys matched successfully>

In [9]:
components = dataset_test.get(0)
components[0]["lang"]

array([114, 118, 125, 126, 127, 132, 134, 144, 149, 151, 152, 121, 145,
       160, 120, 117, 191, 139, 142, 146])

In [10]:
runner = EventGrouping(converter, model, neighborhood=1, seq_length=input_length)
runner(components[0])

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  1, 10], device='cuda:0')
127
127
127
125
114
114
114
160
118
160
118
117
114
160
118
152
152
152
152
152
134


[tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  1, 10,  8], device='cuda:0'),
 tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  1, 10,  8,  8], device='cuda:0'),
 tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  1, 10,  8,  8,  8], device='cuda:0'),
 tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  

In [None]:
model_feature_keys = np.array([0,  2,  3,  4,  6,  7, 10, 14, 15, 16, 17, 18, 22, 24, 25, 26, 28, 29])
dataset_training.__getitem__(0)[0][:, model_feature_keys][:, -1]

In [None]:
dataset_test.get(0).cluster

In [None]:
dataset_training.node_feature_keys[16]

In [None]:
G = torch_geometric.utils.to_networkx(dataset_test.get(0), to_undirected=True)

In [None]:
dataset_training.get(0).x[:, 16]

In [None]:
fig, ax = plt.subplots(1, 1)
nx.draw(G, with_labels=True, ax=ax)

## Random Tests

In [7]:
model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, feature_num, max_nodes, max_seq_length, dropout).to(device)
weights = torch.load("/eos/user/c/czeh/tranformer_date_2025-05-30.pt", weights_only=True)
model.load_state_dict(weights["model_state_dict"])

<All keys matched successfully>

In [8]:
components = dataset_test.get(0)
components[0]["lang"]

array([114, 118, 125, 126, 127, 132, 134, 144, 149, 151, 152, 121, 145,
       160, 120, 117, 191, 139, 142, 146])

In [9]:
vocab_size

100

In [10]:
num_nodes = components[0]["nTrackster"]
converter = Lang(trackster_list=components[0]["lang"])
sample_seq = converter.starting_seq(components[0]["root"], input_length).to(device)
print(sample_seq)

X = components[0]["x"].float()
X = F.pad(X, pad=(0, 0, max_nodes - num_nodes, 0), value=converter.word2index["<PAD>"])
X = X[:, list(map(dataset_test.node_feature_dict.get, dataset_test.model_feature_keys))]

predictions = model(torch.unsqueeze(X, dim=0), torch.unsqueeze(sample_seq, dim=0))
torch.argsort(predictions[0, -1, :num_nodes], dim=0)[0].item()

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  1, 10], device='cuda:0')


18

In [None]:
targets[targets[:, -1] != -4, :]

In [None]:
targets[mask].shape[0]/3

In [None]:
opts = dataset_training.__getitem__(0)[1]
opts = torch.roll(opts, -1, dims=0)
opts[-1] = 5
opts

In [None]:
out_mask = opts != -4
opts[out_mask].shape[0]

In [None]:
targets = torch.reshape(targets[mask], (int(targets[mask].shape[0]/3), 3))

In [None]:
targets[0, :]