In [1]:
# TODO: explore EM shower profile
import torch
import numpy as np
from utils.data_utils import HGCALTracksters
from torch_geometric.loader import DataLoader

In [2]:
import torch_geometric.transforms as T
transform = T.Compose([T.NormalizeFeatures()])

ds = HGCALTracksters("data", kind="photon", transform=transform)

In [3]:
ds

HGCALTracksters(23673)

In [24]:
# balance the dataset
pos = ds[ds.data.y == 1]
neg = ds[ds.data.y == 0]
print(len(pos), len(neg))
len_neg = len(neg)
len_pos = len(pos)
shorter = min(len_neg, len_pos)

test_set_class_size = 1000
train_set = pos[:shorter - test_set_class_size] + neg[:shorter - test_set_class_size]
test_set = pos[shorter - test_set_class_size:shorter] + neg[shorter - test_set_class_size:shorter]
print(len(train_set), len(test_set))
train_dl = DataLoader(train_set, batch_size=8, shuffle=True)
test_dl = DataLoader(test_set, batch_size=8, shuffle=True)

16573 7100
12200 2000


In [25]:
print(f"PyTorch version: {torch.__version__}")
# Check PyTorch has access to MPS (Metal Performance Shader, Apple's GPU architecture)
# print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
# print(f"Is MPS available? {torch.backends.mps.is_available()}")
# device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else "cpu")

device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

PyTorch version: 1.13.0.dev20220622
Using device: cpu


In [26]:
from torch_geometric.nn import GCNConv, global_mean_pool

class TracksterClassifier(torch.nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(TracksterClassifier, self).__init__(**kwargs)

        self.conv1 = GCNConv(in_channels, out_channels)
        self.conv2 = GCNConv(out_channels, out_channels)
        self.dense = torch.nn.Linear(out_channels, 1)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index)
        x = self.conv2(x, edge_index)
        x = global_mean_pool(x, batch)
        x = self.dense(x)
        return torch.sigmoid(x)

In [27]:
loss_obj = torch.nn.BCELoss()

def train(model, loader):
    epoch_loss = 0
    for batch in loader:
        model.train()
        batch = batch.to(device)
        optimizer.zero_grad()
        z = model(batch).reshape(-1)
        loss = loss_obj(z, batch.y.type(torch.float))
        epoch_loss += loss
        loss.backward()
        optimizer.step()
    return float(epoch_loss)

@torch.no_grad()
def test(model, data):
    total = 0
    correct = 0
    for batch in data:
        model.eval()
        prediction = (model(batch).reshape(-1) > 0.5).type(torch.int)
        total += len(prediction) 
        correct += sum(prediction == batch.y)
    return correct / total

In [29]:
model = TracksterClassifier(ds.num_node_features, 128)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(101):
    loss = train(model, train_dl)
    train_acc = test(model, train_dl)
    test_acc = test(model, test_dl)
    if epoch % 5 == 0:
        print(f'Epoch: {epoch}, loss: {loss:.4f}, train acc: {train_acc:.4f}, test acc: {test_acc:.4f}')

Epoch: 0, loss: 939.2356, train acc: 0.7534, test acc: 0.7530
Epoch: 5, loss: 564.5643, train acc: 0.8278, test acc: 0.8205
Epoch: 10, loss: 547.2329, train acc: 0.8268, test acc: 0.8240
Epoch: 15, loss: 535.6576, train acc: 0.8284, test acc: 0.8210
Epoch: 20, loss: 539.4481, train acc: 0.8252, test acc: 0.8205
Epoch: 25, loss: 530.6376, train acc: 0.8534, test acc: 0.8415
Epoch: 30, loss: 525.0104, train acc: 0.8234, test acc: 0.8215
Epoch: 35, loss: 529.8818, train acc: 0.8562, test acc: 0.8485
Epoch: 40, loss: 530.4089, train acc: 0.8425, test acc: 0.8275
Epoch: 45, loss: 524.0376, train acc: 0.8520, test acc: 0.8430
Epoch: 50, loss: 521.8866, train acc: 0.8443, test acc: 0.8420
Epoch: 55, loss: 524.4149, train acc: 0.8599, test acc: 0.8410
Epoch: 60, loss: 525.6693, train acc: 0.8476, test acc: 0.8335
Epoch: 65, loss: 522.4274, train acc: 0.8527, test acc: 0.8355
Epoch: 70, loss: 521.7656, train acc: 0.8536, test acc: 0.8445
Epoch: 75, loss: 515.0178, train acc: 0.8593, test acc: 0

In [None]:
import matplotlib.pyplot as plt

In [None]:
def check_accuracy(predictions, loader):
    energy = []
    for b in loader:
        energy.extend([en for en in b.energy])
    e_nvertices = np.array([len(x) for i, x in enumerate(energy)])
    e_sum = np.array([sum(x) for i, x in enumerate(energy)])

    plt.figure(figsize=(10, 8))
    mask = [bool(pred) for pred in predictions]
    plt.scatter(e_nvertices[mask], e_sum[mask], c = 'dodgerblue', label = 'correct') 
    acc = len(e_nvertices[mask])
    mask = [not bool(pred) for pred in predictions]
    plt.scatter(e_nvertices[mask], e_sum[mask], c = 'red', label = 'incorrect') 
    print(f'accuracy: {(acc / (acc + len(e_nvertices[mask]))):.4f}')
    plt.title("Number of vertices vs total energy")
    plt.xlabel("Number of vertices")
    plt.ylabel("Total energy")
    plt.legend()
    return