In [1]:
from torch.utils.data import Dataset as TorchDataset
import h5py
import torch
from torch_geometric.data import Dataset as PygDataset, Data
from torch_geometric.loader import DataLoader
import numpy as np
import os.path as osp
import os
import multiprocessing as mp
import torch.nn.functional as F
from multiprocessing import Pool
from torch_geometric.nn import GATConv, Linear, TopKPooling, global_max_pool as gmp, global_mean_pool as gap
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from tqdm import tqdm
# Number of CPUs as reported by multiprocessing. It is not referenced again in
# the cells visible here (neither is the imported Pool).
cpu_count = mp.cpu_count()


In [2]:
# Source HDF5 file; subset_dataset() below reads the train/val/test slices from it.
quark_gluon_path = "../Data/hdf5/processed/quark-gluon-dataset.hdf5"

In [3]:
def subset_dataset(raw_path, processed_path, subset_len = 6000, starter = 0):
    """Copy a contiguous run of events from one HDF5 file into a new one.

    Every top-level dataset found in ``raw_path`` is recreated in
    ``processed_path`` (which is overwritten) with ``subset_len`` rows taken
    from ``[starter, starter + subset_len)`` of the source.

    No class balancing is performed. The previous implementation kept
    "quark" and "gluon" counters but never looked at the labels, so its
    effective behaviour was already a plain contiguous copy. That behaviour is
    kept; the dead counters are gone, the per-event shape is taken from the
    source instead of being hard-coded, and an odd ``subset_len`` no longer
    leaves the final row unwritten.

    Args:
        raw_path: Path of the HDF5 file to read from.
        processed_path: Path of the HDF5 file to create (truncated if present).
        subset_len: Number of events to copy.
        starter: Index of the first event to copy.

    Raises:
        ValueError: If ``subset_len`` or ``starter`` is negative, if the source
            has no datasets, or if the requested range runs past the end of any
            source dataset.
    """
    # Rows copied per read, so that a whole split of images is never held in
    # memory at once.
    chunk_rows = 256

    if subset_len < 0 or starter < 0:
        raise ValueError("subset_len and starter must be non-negative")
    stop = starter + subset_len

    with h5py.File(raw_path, 'r') as f:
        keys = list(f.keys())
        if not keys:
            raise ValueError("no datasets found in {}".format(raw_path))

        # Validate before the output file is opened so a bad request does not
        # truncate an existing split.
        for key in keys:
            available = f[key].shape[0]
            if stop > available:
                raise ValueError(
                    "requested events [{}, {}) but '{}' only has {}".format(
                        starter, stop, key, available))

        with h5py.File(processed_path, 'w') as p:
            for key in keys:
                src = f[key]
                # dtype is deliberately not passed: the original relied on
                # h5py's default dtype and the on-disk format is kept unchanged.
                dst = p.create_dataset(key, shape=(subset_len,) + tuple(src.shape[1:]))
                for lo in range(starter, stop, chunk_rows):
                    hi = min(lo + chunk_rows, stop)
                    dst[lo - starter:hi - starter] = src[lo:hi]


In [4]:
# Output files for the three splits. grapher() looks for files named
# train.hdf5 / val.hdf5 / test.hdf5 under this same directory by default.
train_path = "../Data/hdf5/processed/train.hdf5"
val_path = "../Data/hdf5/processed/val.hdf5"
test_path = "../Data/hdf5/processed/test.hdf5"

In [5]:
# Three back-to-back, non-overlapping slices of the source file:
# 6000 events from index 0 (train), 1200 from 6000 (val), 1200 from 7200 (test).
# Each call overwrites its output file.
subset_dataset(quark_gluon_path, train_path, 6000)
subset_dataset(quark_gluon_path, val_path, 1200, 6000)
subset_dataset(quark_gluon_path, test_path, 1200, 7200)

In [6]:
def get_pillow(x):
    """Return ``x`` with its three axes in reversed order.

    Axis 0 and axis 2 are swapped while axis 1 stays in place, using the
    array's own ``transpose`` method.
    """
    reversed_axes = (2, 1, 0)
    swapped = x.transpose(reversed_axes)
    return swapped
def get_k_nearest(indices, k = 10):
    """Build directed k-nearest-neighbour edges between points.

    For every row ``i`` of ``indices`` the ``k`` closest other rows (squared
    Euclidean distance, ties broken by row order) are found and one edge
    ``[i, j]`` is emitted per neighbour ``j``.

    Changes from the previous version:
      * it sliced ``[1:k]`` and therefore returned only ``k - 1`` neighbours;
      * the query point is now excluded explicitly instead of assuming it
        sorts first, so duplicate coordinates cannot produce a self-loop;
      * empty or single-point input returns an empty ``(0, 2)`` integer array
        instead of ``None`` or a float array of shape ``(0,)``;
      * edges are collected in a list and concatenated once rather than
        re-stacked on every iteration.

    Args:
        indices: Array of shape ``(num_points, num_dims)`` with point coordinates.
        k: Maximum number of neighbours per point. Points get fewer neighbours
            when fewer than ``k`` other points exist.

    Returns:
        ``np.ndarray`` of dtype int64 and shape ``(num_edges, 2)``; each row is
        ``[source, target]``. Rows are grouped by source in ascending order.
    """
    indices = np.asarray(indices)
    num_points = indices.shape[0]
    if num_points < 2 or k < 1:
        return np.empty((0, 2), dtype=np.int64)

    edge_chunks = []
    for i in range(num_points):
        sq_dist = np.sum((indices - indices[i]) ** 2, axis=1)
        # A stable sort keeps the neighbour order deterministic under ties.
        order = sq_dist.argsort(kind="stable")
        neighbours = order[order != i][:k].astype(np.int64)
        sources = np.full(neighbours.shape[0], i, dtype=np.int64)
        edge_chunks.append(np.stack((sources, neighbours), axis=1))
    return np.concatenate(edge_chunks, axis=0)
def create_graph(idx, quark_gluon_path, outpath):
    """Turn one event of an HDF5 split into a graph and save it as ``{idx}.pt``.

    Nodes are the pixels of ``X_jets[idx]`` whose channel sum is non-zero.
    Node features are the channel values of those pixels, ``pos`` holds their
    (row, column) indices, edges come from ``get_k_nearest`` on those indices,
    and ``y`` is the event label wrapped in a length-1 tensor.

    Events with zero or one active pixel have no neighbours; they are saved
    with an empty ``(2, 0)`` integer ``edge_index`` instead of crashing or
    storing a malformed edge tensor.

    Args:
        idx: Event index inside the HDF5 file; also used as the output file stem.
        quark_gluon_path: Path of the HDF5 file providing ``X_jets`` and ``y``.
        outpath: Existing directory where ``{idx}.pt`` is written.
    """
    # Read what is needed and close the file before the graph is built.
    with h5py.File(quark_gluon_path, 'r') as f:
        y = f['y'][idx]
        x = f['X_jets'][idx]

    non_zero_indices = np.argwhere(np.sum(x, axis=2))
    non_zero_features = x[non_zero_indices[:, 0], non_zero_indices[:, 1]]

    edges = get_k_nearest(non_zero_indices)
    if edges is None or edges.size == 0:
        edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        edge_array = np.ascontiguousarray(edges, dtype=np.int64)
        # Stored as (2, num_edges), hence the transpose.
        edge_index = torch.from_numpy(edge_array).t().contiguous()

    data = Data()
    data.x = torch.from_numpy(non_zero_features)
    data.edge_index = edge_index
    data.y = torch.from_numpy(np.asarray([y]))
    data.pos = torch.from_numpy(non_zero_indices)
    torch.save(data, osp.join(outpath, f"{idx}.pt"))

In [13]:
def grapher(root_dir = "../Data/hdf5/processed"):
    """Generate the per-event graph files for the train, val and test splits.

    For each of ``train.hdf5``, ``val.hdf5`` and ``test.hdf5`` under
    ``root_dir``, one ``{i}.pt`` file per event is written to
    ``../Data/Graphs/<split>/raw`` via ``create_graph``.

    Changes from the previous version:
      * the output directory is created when missing instead of failing in
        ``os.listdir``;
      * only events whose ``{i}.pt`` file does not exist yet are generated, so
        an interrupted run can be resumed (previously any non-empty directory
        caused the whole split to be skipped);
      * progress is shown with one tqdm bar per split rather than printing the
        split name once per event.

    Args:
        root_dir: Directory containing the three split HDF5 files.
    """
    files = ["train.hdf5", "val.hdf5", "test.hdf5"]
    for file in files:
        path = osp.join(root_dir, file)
        split = file.split(".")[0]
        raw_dir = "../Data/Graphs/{}/raw".format(split)
        os.makedirs(raw_dir, exist_ok=True)

        with h5py.File(path, 'r') as f:
            event_count = len(f["X_jets"])

        missing = [
            i for i in range(event_count)
            if not osp.exists(osp.join(raw_dir, f"{i}.pt"))
        ]
        if not missing:
            continue
        for i in tqdm(missing, desc=split):
            create_graph(i, path, raw_dir)

In [14]:
# Build the raw graph files with the default root_dir ("../Data/hdf5/processed").
grapher()

In [15]:
class QuarkGluonGraphs(PygDataset):
    """On-disk graph dataset with one ``{idx}.pt`` file per event.

    Raw graphs are read from ``<root>/raw``; ``process`` one-hot encodes the
    label into two classes and writes a file of the same name to the
    processed directory. Samples are addressed by integer index, so the files
    are expected to be named ``0.pt``, ``1.pt``, ...

    ``pre_filter`` is accepted for signature compatibility but not applied:
    dropping samples would leave gaps in the index-based file names that
    ``get`` relies on.
    """

    def __init__(self, root = None, transform = None, pre_transform = None, pre_filter = None, log = True):
        super().__init__(root, transform, pre_transform, pre_filter, log)

    @property
    def raw_file_names(self):
        """Names of all entries currently in ``<root>/raw``."""
        return os.listdir(osp.join(self.root, "raw"))

    @property
    def processed_file_names(self):
        """Expected processed files: one per raw file, with the same name."""
        return os.listdir(osp.join(self.root, "raw"))

    def download(self):
        """Nothing to download; raw files are produced locally."""
        pass

    def process(self):
        """One-hot encode each raw graph's label and store it as processed."""
        for raw_path in self.raw_file_names:
            data = torch.load(osp.join(self.raw_dir, raw_path))
            data.y = F.one_hot(data.y.to(torch.int64), 2)
            # Honour the standard one-off preprocessing hook (previously ignored).
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            torch.save(data, osp.join(self.processed_dir, raw_path))

    def len(self):
        """Number of samples, i.e. the number of expected processed files."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load processed sample ``idx`` from disk, without transforms.

        ``self.transform`` is intentionally not applied here: the base class
        already applies it to the value returned by ``get`` when the dataset
        is indexed, so applying it here as well ran every transform twice.
        """
        return torch.load(osp.join(self.processed_dir, f"{idx}.pt"))


In [16]:
# Everything is pinned to the CPU; the commented-out line below is the
# automatic CUDA/CPU selection that was disabled.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "cpu"
# Per-sample transform pipeline handed to each dataset: normalise the node
# features, then move the sample to `device`.
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
])
# One dataset per split, rooted at the directories grapher() writes into.
train_data = QuarkGluonGraphs("../Data/Graphs/train/", transform=transform)
val_data = QuarkGluonGraphs("../Data/Graphs/val/", transform=transform)
test_data = QuarkGluonGraphs("../Data/Graphs/test/", transform=transform)

In [17]:
class GCN(torch.nn.Module):
    """Graph classifier made of three attention/pooling blocks and an MLP head.

    Each block applies a multi-head ``GATConv``, a linear layer that maps the
    concatenated heads back to ``embedding_size``, and a ``TopKPooling``
    layer. After every block a graph-level readout is taken by concatenating
    global max and global mean pooling. The three readouts are summed
    element-wise (not concatenated) and passed through two linear layers.

    No activation is applied between the graph layers; only the first linear
    layer of the head is followed by a ReLU and dropout.

    Args:
        feature_size: Number of input features per node.
        embedding_size: Hidden width used by every graph layer.
        num_classes: Number of output logits.
        heads: Attention heads per ``GATConv``.
        dropout: Attention dropout passed to every ``GATConv``.
        pool_ratios: Fraction of nodes kept by the three pooling layers.

    All keyword arguments default to the values that were previously
    hard-coded, so ``GCN(feature_size=n)`` builds the same network as before.
    """

    def __init__(self, feature_size, embedding_size=1024, num_classes=2,
                 heads=3, dropout=0.3, pool_ratios=(0.8, 0.5, 0.2)):
        super().__init__()

        # GNN layers. Attribute names and registration order are kept so that
        # the module repr and state_dict keys do not change.
        self.conv1 = GATConv(feature_size, embedding_size, heads=heads, dropout=dropout)
        self.head_transform1 = Linear(embedding_size * heads, embedding_size)
        self.pool1 = TopKPooling(embedding_size, ratio=pool_ratios[0])
        self.conv2 = GATConv(embedding_size, embedding_size, heads=heads, dropout=dropout)
        self.head_transform2 = Linear(embedding_size * heads, embedding_size)
        self.pool2 = TopKPooling(embedding_size, ratio=pool_ratios[1])
        self.conv3 = GATConv(embedding_size, embedding_size, heads=heads, dropout=dropout)
        self.head_transform3 = Linear(embedding_size * heads, embedding_size)
        self.pool3 = TopKPooling(embedding_size, ratio=pool_ratios[2])

        # Linear layers. The readout is [max || mean], hence twice the width.
        self.linear1 = Linear(embedding_size * 2, embedding_size)
        self.linear2 = Linear(embedding_size, num_classes)

    def _block(self, conv, head_transform, pool, x, edge_index, batch_index):
        """Run one conv -> head merge -> pooling block and take its readout."""
        x = conv(x, edge_index)
        x = head_transform(x)
        x, edge_index, _, batch_index, _, _ = pool(x, edge_index, None, batch_index)
        readout = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
        return x, edge_index, batch_index, readout

    def forward(self, x, edge_index, batch_index):
        """Compute one row of class logits per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Edge list in ``(2, num_edges)`` layout.
            batch_index: Graph assignment of every node.

        Returns:
            Tensor with ``num_classes`` raw logits per graph.
        """
        x, edge_index, batch_index, x1 = self._block(
            self.conv1, self.head_transform1, self.pool1, x, edge_index, batch_index)
        x, edge_index, batch_index, x2 = self._block(
            self.conv2, self.head_transform2, self.pool2, x, edge_index, batch_index)
        x, edge_index, batch_index, x3 = self._block(
            self.conv3, self.head_transform3, self.pool3, x, edge_index, batch_index)

        # Sum (not concatenate) the three graph-level readouts.
        x = x1 + x2 + x3

        # Output block.
        x = self.linear1(x).relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.linear2(x)
        return x

In [18]:
# The input width is read from the node-feature matrix of the first training graph.
model = GCN(feature_size=train_data[0].x.shape[1])

In [19]:
# Move the parameters to the same device that the dataset transform targets.
model = model.to(device)

In [20]:
def count_parameters(model):
    """Count the scalar entries of all parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [21]:
# Display the number of trainable parameters of the model built above.
count_parameters(model)

17871874

In [22]:
# Display the module tree (layer types and sizes).
model

GCN(
  (conv1): GATConv(3, 1024, heads=3)
  (head_transform1): Linear(3072, 1024, bias=True)
  (pool1): TopKPooling(1024, ratio=0.8, multiplier=1.0)
  (conv2): GATConv(1024, 1024, heads=3)
  (head_transform2): Linear(3072, 1024, bias=True)
  (pool2): TopKPooling(1024, ratio=0.5, multiplier=1.0)
  (conv3): GATConv(1024, 1024, heads=3)
  (head_transform3): Linear(3072, 1024, bias=True)
  (pool3): TopKPooling(1024, ratio=0.2, multiplier=1.0)
  (linear1): Linear(2048, 1024, bias=True)
  (linear2): Linear(1024, 2, bias=True)
)

In [23]:
# Unweighted cross-entropy; the commented-out `weights` tensor and the trailing
# `#(weight=weights)` are a disabled class-weighting experiment.
# weights = torch.tensor([0, 1], dtype=torch.float32).to(device)
loss_fn = torch.nn.CrossEntropyLoss()#(weight=weights)
# SGD with momentum. train() picks this optimizer up as a module-level name.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
# Multiplies the learning rate by 0.95 on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

In [24]:
# Graphs per mini-batch for both loaders.
NUM_GRAPHS_PER_BATCH = 16
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_data,
                          batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True
                          )
# Evaluation batches keep dataset order. Note that no loader is built for
# val_data in the cells shown here.
test_loader = DataLoader(test_data,
                         batch_size=NUM_GRAPHS_PER_BATCH,
                         )

In [25]:
def train(epochs, model, train_loader, loss_fn):
    """Run one optimisation pass over ``train_loader``.

    The per-batch objective is ``sqrt(loss_fn(logits, one_hot_targets))``,
    unchanged from the previous version. The weights are updated through the
    module-level ``optimizer``, which must exist before this is called.

    Changes from the previous version:
      * the mean loss over all batches is returned instead of the loss of
        whichever batch happened to come last (and an empty loader no longer
        raises ``UnboundLocalError``);
      * the prediction/label arrays that were built every step but never used
        are gone, removing a host copy per batch;
      * each batch is moved to the model's device and the model is put in
        training mode, so the function does not depend on the caller or on a
        dataset transform having done so.

    Args:
        epochs: Unused; kept so existing calls keep working.
        model: Network called as ``model(x, edge_index, batch)``.
        train_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``batch``, ``y`` and ``to(device)``.
        loss_fn: Callable taking ``(logits, float_targets)``.

    Returns:
        0-dim ``torch.Tensor`` holding the mean batch loss (NaN if the loader
        yielded no batches). It carries no gradient.
    """
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    model.train()
    loss_total = 0.0
    num_batches = 0
    for batch in tqdm(train_loader):
        if target_device is not None:
            batch = batch.to(target_device)
        # Reset gradients
        optimizer.zero_grad()
        # Pass the node features and the connection info
        pred = model(batch.x, batch.edge_index, batch.batch)
        # Calculate the loss and the gradient
        loss = torch.sqrt(loss_fn(pred, batch.y.float()))
        loss.backward()
        # Update using the gradients
        optimizer.step()

        loss_total += loss.item()
        num_batches += 1

    if num_batches == 0:
        return torch.tensor(float("nan"))
    return torch.tensor(loss_total / num_batches)

def test(epoch, model, test_loader, loss_fn):
    all_preds = []
    all_preds_raw = []
    all_labels = []
    running_loss = 0.0
    step = 0
    for batch in test_loader:
        batch.to(device)  
        pred = model(batch.x, 
                        batch.edge_index, 
                        batch.batch) 
        loss = torch.sqrt(loss_fn)

         # Update tracking
        running_loss += loss.item()
        step += 1
        all_preds.append(np.rint(torch.sigmoid(pred).cpu().detach().numpy()))
        all_preds_raw.append(torch.sigmoid(pred).cpu().detach().numpy())
        all_labels.append(batch.y.cpu().detach().numpy())
    
    all_preds = np.concatenate(all_preds).ravel()
    all_labels = np.concatenate(all_labels).ravel()
    print(all_preds_raw[0][:10])
    print(all_preds[:10])
    print(all_labels[:10])
    return running_loss

In [26]:
NUM_EPOCHS = 10  # full passes over train_loader

for epoch in range(NUM_EPOCHS):
    # Training phase: dropout enabled.
    model.train()
    loss = train(epoch, model, train_loader, loss_fn)
    loss = loss.detach().cpu().numpy()
    print(" Epoch {} | training loss {}".format(epoch, loss))
    # Evaluation phase: the model was previously left in training mode here,
    # so dropout stayed active while the held-out loss was measured.
    model.eval()
    with torch.no_grad():
        running_loss = test(epoch, model, test_loader, loss_fn)
    print(" Epoch {} | testing loss {}".format(epoch, running_loss))
    # Decay the learning rate once per epoch.
    scheduler.step()

 35%|███▍      | 131/375 [13:37<25:22,  6.24s/it]


KeyboardInterrupt: 