In [1]:
import os
import sys
os.getcwd()

'c:\\Users\\ebaca\\Desktop\\Phys 417\\Final Project - HEP Tagging'

In [2]:
# importing libraries & making torch.device object for GPU

# neural network packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

from torch import Tensor
from torch.nn import Transformer
from torch.utils.data import Dataset, DataLoader, TensorDataset, Subset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pack_sequence, pad_sequence
from torch.utils.data.sampler import SubsetRandomSampler

# data packages
import numpy as np
import math
import pandas as pd
import sklearn.preprocessing as prep
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import fndict as fd

# visual packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tqdm
import warnings
import pprint as pp

# Create a torch.device object to tell pytorch where to store your tensors: cpu or gpu
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [156]:
onehots = {
    (1, 0, 0, 0, 0): ['Gluon', 0],
    (0, 1, 0, 0, 0): ['Light Quark', 0],
    (0, 0, 1, 0, 0): ['W Boson', 0],
    (0, 0, 0, 1, 0): ['Z Boson', 0],
    (0, 0, 0, 0, 1): ['Top Quark', 0]
}
# print(list(onehots.values()))

files = shuffle([f for f in os.listdir('../../PHYS417_Project/data')])

for i, f in enumerate(files):
    with np.load('../../PHYS417_Project/data/' + f) as data_load:
        y = data_load['y']

        if tuple(y) in onehots:
            onehots[tuple(y)][1] += 1

pp.pprint(onehots)

{(0, 0, 0, 0, 1): ['Top Quark', 4020],
 (0, 0, 0, 1, 0): ['Z Boson', 4053],
 (0, 0, 1, 0, 0): ['W Boson', 4150],
 (0, 1, 0, 0, 0): ['Light Quark', 3835],
 (1, 0, 0, 0, 0): ['Gluon', 3942]}


In [157]:
# Define your model architecture
class JetClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(JetClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# custom PyTorch dataset (for DataLoader)
class JetDataset(Dataset):
    def __init__(self, files):
        self.files = files

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        with np.load(file_path) as data_load:
            x_load = torch.tensor(data_load['x'], dtype=torch.float32)
            y_load = torch.tensor(data_load['y'], dtype=torch.float32)
        return x_load, y_load

# the collate function lets the model handle variable-length sequences
def collate_fn(batch):
    # sorting batch in descending order of # of constituents
    # batch.sort(key=lambda x: x[0].size(0), reverse=True)   # option to sort by sequence length
    inputs, labels = zip(*batch)
    # adding padding to the sequences
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    # labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)
    labels = torch.stack(labels)  # Ensure labels are stacked without padding
    return inputs_padded, labels

# function to extract numpy arrays from tensors
extractor = lambda x: x.cpu().detach().numpy()



'''Extracting Files ----------------------------------------------------------'''

files = shuffle([os.path.join('../../PHYS417_Project/data/'+ f) for f in os.listdir('../../PHYS417_Project/data')])

print(f'First 70% of files: {len(files[:int(0.7*len(files))])} \nLast 30% of files: {len(files[int(0.7*len(files)):])}')



'''Splitting Files ----------------------------------------------------------'''

# splitting into training/testing sets
train_files = files[:int(0.7*len(files))]
test_files = files[int(0.7*len(files)):]

print(f"{len(train_files)} Training Files  + {len(test_files)} Testing Files  =  {len(files)} Total Files\n")
print(f'First 5 training files: {train_files[:5]} \nFirst 5 testing files: {test_files[:5]}\n')



'''Converting to Dataset Objects --------------------------------------------------------'''

train_dataset = JetDataset(train_files)
test_dataset = JetDataset(test_files)

# train_dataset[event n][input/label][constituent n, property n]
print(f'TrSet (event 0): {train_dataset[0][0].shape} \n{train_dataset[0][0][:1]} \nOnehot vector: {train_dataset[0][1]}\n')
# print(f'TeSet First const. (event 0): \n{test_dataset[0][0][:1]} \nOnehot vector: {test_dataset[0][1]}\n')



'''Creating DataLoaders/Applying Collate Fn (Padding) --------------------------------------------------------'''
# make datasets into DataLoader objects and apply collate_fn
train_loader = DataLoader(train_dataset, batch_size=13, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=13, shuffle=False, collate_fn=collate_fn)

# inspecting how padding went in collate_fn
# loader[input or label][batch n][event e][constituent n, property n]
n=0
for nbatch_x, nbatch_y in train_loader: # loops through n batches or loader[input/label][batch n]
    # inputs = array containing each events' data['x'] in the batch
    # labels = array containing each events' data['y'] in the batch
    print(f'\n --------------------------------------- [Batch {n}] --------------------------------------- \n')
    print(f'Input shape: {nbatch_x.shape}, Label shape: {nbatch_y.shape}\n') # general shape of the batch

    e=0
    # printing first 5 events in the batch
    for (properties, tag) in zip(nbatch_x[:2], nbatch_y[:2]):
        # printing the first 5 constituents/properties
        print(f'--Event {n}.{e}--',
              f'\n{extractor(properties[:1])} \n{extractor(properties[-1:])}, \nTag: {extractor(tag)}\n')
        e+=1
    
    # to stop after some number of events
    n+=1
    if n == 1: 
        break



'''
Initializing Model & Hyperparameters ----------------------------------------------------------
'''

# Initialize the model
input_size = 5  # Number of features for each constituent
hidden_size = 1280
num_layers = 1
num_classes = 5  # Number of categories for jet classification

model = JetClassifier(input_size, hidden_size, num_layers, num_classes).to(DEVICE)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss().to(DEVICE)  # Binary Cross Entropy Loss for multi-label classification
# criterion = nn.CrossEntropyLoss().to(DEVICE)  # includes softmax layer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



''' 
Training -------------------------------------------------------------------
'''
# [batch size][max sequence length][input size/number of features]

num_epochs = 10
for epoch in range(num_epochs):
    for events, tags in train_loader:
        events, onehots = events.to(DEVICE), tags.to(DEVICE)
        # tags = torch.argmax(onehots, dim=1) # Convert one-hot encoded labels to single integer labels

        optimizer.zero_grad()
        outputs = model(events)

        # loss = criterion(outputs, tags)
        loss = criterion(outputs, onehots)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    print(f'Inputs: {events.shape}  \nOnehot: {extractor(onehots[:1][0])}  ArgMax Index: {extractor(onehots[:1].argmax(dim=1))}')
    softmax = nn.Softmax(dim=1)
    probabilities = softmax(outputs)
    print(f'Outputs: {extractor(outputs[:1][0])}')
    print(f'Probabilities: {extractor(probabilities[:1][0])}')
    print('\n')


First 70% of files: 14000 
Last 30% of files: 6000
14000 Training Files  + 6000 Testing Files  =  20000 Total Files

First 5 training files: ['../../PHYS417_Project/data/event_0435010.npz', '../../PHYS417_Project/data/event_0198405.npz', '../../PHYS417_Project/data/event_0873594.npz', '../../PHYS417_Project/data/event_0740501.npz', '../../PHYS417_Project/data/event_0359356.npz'] 
First 5 testing files: ['../../PHYS417_Project/data/event_0229432.npz', '../../PHYS417_Project/data/event_0108115.npz', '../../PHYS417_Project/data/event_0687919.npz', '../../PHYS417_Project/data/event_0956043.npz', '../../PHYS417_Project/data/event_0662343.npz']

TrSet (event 0): torch.Size([40, 5]) 
tensor([[0.2562, 0.0000, 0.0000, 0.2555, 0.0050]]) 
Onehot vector: tensor([0., 1., 0., 0., 0.])


 --------------------------------------- [Batch 0] --------------------------------------- 

Input shape: torch.Size([13, 56, 5]), Label shape: torch.Size([13, 5])

--Event 0.0-- 
[[0.25622955 0.         0.         0

In [158]:
''' 
Evaluation -------------------------------------------------------------------

[batch size][max sequence length][input size/number of features]
'''
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sequences, labels in test_loader:
            labels = torch.argmax(labels, dim=1).to(DEVICE)
            outputs = model(sequences.to(DEVICE))
            predicted = torch.argmax(outputs, dim=1).to(DEVICE)

            print(predicted)
            print(labels, '\n')

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")

evaluate_model(model, test_loader)

tensor([1, 1, 3, 0, 0, 2, 4, 0, 1, 4, 4, 1, 3], device='cuda:0')
tensor([1, 1, 3, 0, 1, 2, 4, 1, 1, 0, 3, 1, 3], device='cuda:0') 

tensor([2, 3, 4, 3, 1, 3, 0, 4, 1, 2, 4, 4, 0], device='cuda:0')
tensor([2, 4, 4, 3, 1, 3, 2, 4, 0, 2, 4, 4, 0], device='cuda:0') 

tensor([1, 0, 1, 2, 3, 4, 1, 3, 2, 1, 2, 0, 3], device='cuda:0')
tensor([1, 0, 1, 2, 4, 4, 1, 1, 2, 1, 2, 1, 2], device='cuda:0') 

tensor([4, 1, 2, 4, 3, 4, 1, 0, 4, 3, 1, 2, 1], device='cuda:0')
tensor([3, 0, 1, 4, 3, 4, 1, 0, 0, 3, 1, 2, 1], device='cuda:0') 

tensor([2, 1, 0, 3, 1, 4, 3, 3, 4, 0, 3, 2, 2], device='cuda:0')
tensor([2, 1, 1, 3, 1, 1, 3, 2, 4, 0, 3, 2, 2], device='cuda:0') 

tensor([2, 4, 4, 3, 0, 1, 2, 1, 1, 4, 4, 1, 1], device='cuda:0')
tensor([2, 0, 4, 3, 0, 0, 3, 3, 0, 4, 4, 1, 0], device='cuda:0') 

tensor([1, 0, 4, 0, 4, 2, 1, 2, 4, 1, 3, 2, 1], device='cuda:0')
tensor([4, 0, 0, 0, 4, 4, 2, 3, 4, 3, 3, 2, 1], device='cuda:0') 

tensor([4, 3, 1, 4, 2, 3, 1, 1, 1, 4, 4, 1, 0], device='cuda:0')
tensor([4, 