In [1]:
import os
import sys
os.getcwd()

'c:\\Users\\ebaca\\Desktop\\Phys 417\\Final Project - HEP Tagging'

In [2]:
# importing libraries & making torch.device object for GPU

# neural network packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

from torch import Tensor
from torch.nn import Transformer
from torch.utils.data import Dataset, DataLoader, TensorDataset, Subset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pack_sequence, pad_sequence
from torch.utils.data.sampler import SubsetRandomSampler

# data packages
import numpy as np
import math
import pandas as pd
import sklearn.preprocessing as prep
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import fndict as fd

# visual packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tqdm
import warnings
import pprint as pp

# Create a torch.device object to tell pytorch where to store your tensors: cpu or gpu
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# files = shuffle([f for f in os.listdir('../../PHYS417_Project/data')])

# test_index = 0
# data = []

# for i, f in enumerate(files[:100]):
#     with np.load('../../PHYS417_Project/data/' + f) as data_load:
#         x_load = data_load['x']
#         y_load = data_load['y']
#         data.append([x_load, y_load])

#         if i == test_index:
#             print(x_load.shape, y_load, '\n')

# data = np.array(data, dtype=object)

# '''
# the dataset then contains these dimensions:
# data[event index][event information][event constituents]

# event index: 
# data[i] = [event1, event2, ..., eventn]
# - index for a specific event in a set number of sampled events
# - number of samples stays the same throughout a training session

# event information:
# data[i][0] = [[constituent1], [constituent2], ..., [constituentn]]
# data[i][1] = [one-hot encoded jet tag for 5 categories]
# - 0 for a variable number of constituents and their 5 properties (data['x'] of one event file)
# - 1 for the jet tag (represented as a one-hot encoded vector) (data['y'] of one event file)
# - always either 0 or 1, doesn't change ever

# event constituents: 
# data[i][0][constituentn] = [momentum, eta, phi, energy, distance]
# - varying number n for each event[i][0][n]
# - each represents a row of the nx5 matrix in event[i][0]

# '''

# class ParticleDataset(Dataset):
#     def __init__(self, data):
#         self.data = data
        
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         x, y = self.data[idx]
        
#         # Convert x to a list of tensors
#         x_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in x]
        
#         # Pack the sequence of tensors
#         packed_x = pack_sequence(x_tensors, enforce_sorted=False)
        
#         return packed_x, y

In [4]:
# print(f"Final shape of data_array: {data.shape}", f"\nShape X (features) {data[:, 0].shape}", f"\nShape Y (targets) {data[:, 1].shape}\n")

# print(data[test_index, 0].shape, data[test_index, 1]) # [data element index][0 for data_load['x'], 1 for data_load['y']][constituent index]

# # class ParticleDataset(Dataset):
# #     def __init__(self, data):
# #         self.data = data
        
# #     def __len__(self):
# #         return len(self.data)
    
# #     def __getitem__(self, idx):
# #         # Return a tuple containing features (x) and targets (y)
# #         return self.data[idx][0], self.data[idx][1]

In [121]:
# Define your model architecture
class JetClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(JetClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# custom PyTorch dataset (for DataLoader)
class JetDataset(Dataset):
    def __init__(self, files):
        self.files = files

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        with np.load(file_path) as data_load:
            x_load = torch.tensor(data_load['x'], dtype=torch.float32)
            y_load = torch.tensor(data_load['y'], dtype=torch.float32)
        return x_load, y_load

# the collate function lets the model handle variable-length sequences
def collate_fn(batch):
    # sorting batch in descending order of # of constituents
    # batch.sort(key=lambda x: x[0].size(0), reverse=True)   # option to sort by sequence length
    inputs, labels = zip(*batch)
    # adding padding to the sequences
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    # labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)
    labels = torch.stack(labels)  # Ensure labels are stacked without padding
    return inputs_padded, labels

# function to extract numpy arrays from tensors
extractor = lambda x: x.cpu().detach().numpy()



'''Extracting Files ----------------------------------------------------------'''

files = shuffle([os.path.join('../../PHYS417_Project/data/'+ f) for f in os.listdir('../../PHYS417_Project/data')])

print(f'First 70% of files: {len(files[:int(0.7*len(files))])} \nLast 30% of files: {len(files[int(0.7*len(files)):])}')



'''Splitting Files ----------------------------------------------------------'''

# splitting into training/testing sets
train_files = files[:int(0.7*len(files))]
test_files = files[int(0.7*len(files)):]

print(f"{len(train_files)} Training Files  + {len(test_files)} Testing Files  =  {len(files)} Total Files\n")
print(f'First 5 training files: {train_files[:5]} \nFirst 5 testing files: {test_files[:5]}\n')



'''Converting to Dataset Objects --------------------------------------------------------'''

train_dataset = JetDataset(train_files)
test_dataset = JetDataset(test_files)

# train_dataset[event n][input/label][constituent n, property n]
print(f'TrSet (event 0): {train_dataset[0][0].shape} \n{train_dataset[0][0][:1]} \nOnehot vector: {train_dataset[0][1]}\n')
# print(f'TeSet First const. (event 0): \n{test_dataset[0][0][:1]} \nOnehot vector: {test_dataset[0][1]}\n')



'''Creating DataLoaders/Applying Collate Fn (Padding) --------------------------------------------------------'''
# make datasets into DataLoader objects and apply collate_fn
train_loader = DataLoader(train_dataset, batch_size=13, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=13, shuffle=False, collate_fn=collate_fn)

# inspecting how padding went in collate_fn
# loader[input or label][batch n][event e][constituent n, property n]
n=0
for nbatch_x, nbatch_y in train_loader: # loops through n batches or loader[input/label][batch n]
    # inputs = array containing each events' data['x'] in the batch
    # labels = array containing each events' data['y'] in the batch
    print(f'\n --------------------------------------- [Batch {n}] --------------------------------------- \n')
    print(f'Input shape: {nbatch_x.shape}, Label shape: {nbatch_y.shape}\n') # general shape of the batch

    e=0
    # printing first 5 events in the batch
    for (properties, tag) in zip(nbatch_x[:2], nbatch_y[:2]):
        # printing the first 5 constituents/properties
        print(f'--Event {n}.{e}--',
              f'\n{extractor(properties[:1])} \n{extractor(properties[-1:])}, \nTag: {extractor(tag)}\n')
        e+=1
    
    # to stop after some number of events
    n+=1
    if n == 1: 
        break



'''
Initializing Model & Hyperparameters ----------------------------------------------------------
'''

# Initialize the model
input_size = 5  # Number of features for each constituent
hidden_size = 1280
num_layers = 1
num_classes = 5  # Number of categories for jet classification

model = JetClassifier(input_size, hidden_size, num_layers, num_classes).to(DEVICE)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss().to(DEVICE)  # Binary Cross Entropy Loss for multi-label classification
# criterion = nn.CrossEntropyLoss().to(DEVICE)  # includes softmax layer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



''' 
Training -------------------------------------------------------------------
'''
# [batch size][max sequence length][input size/number of features]

num_epochs = 10
for epoch in range(num_epochs):
    for events, tags in train_loader:
        events, onehots = events.to(DEVICE), tags.to(DEVICE)
        # tags = torch.argmax(onehots, dim=1) # Convert one-hot encoded labels to single integer labels

        optimizer.zero_grad()
        outputs = model(events)

        # loss = criterion(outputs, tags)
        loss = criterion(outputs, onehots)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    print(f'Inputs: {events.shape}  \nOnehot: {extractor(onehots[:1][0])}  ArgMax Index: {extractor(onehots[:1].argmax(dim=1))}')
    softmax = nn.Softmax(dim=1)
    probabilities = softmax(outputs)
    print(f'Outputs: {extractor(outputs[:1][0])}')
    print(f'Probabilities: {extractor(probabilities[:1][0])}')
    print('\n')


First 70% of files: 3512 
Last 30% of files: 1506
3512 Training Files  + 1506 Testing Files  =  5018 Total Files

First 5 training files: ['../../PHYS417_Project/data/event_0193504.npz', '../../PHYS417_Project/data/event_0676092.npz', '../../PHYS417_Project/data/event_0014163.npz', '../../PHYS417_Project/data/event_0552175.npz', '../../PHYS417_Project/data/event_0117166.npz'] 
First 5 testing files: ['../../PHYS417_Project/data/event_0859286.npz', '../../PHYS417_Project/data/event_0569447.npz', '../../PHYS417_Project/data/event_0130170.npz', '../../PHYS417_Project/data/event_0372130.npz', '../../PHYS417_Project/data/event_0232445.npz']

TrSet (event 0): torch.Size([42, 5]) 
tensor([[0.2411, 0.0000, 0.0000, 0.2419, 0.0480]]) 
Onehot vector: tensor([0., 0., 1., 0., 0.])


 --------------------------------------- [Batch 0] --------------------------------------- 

Input shape: torch.Size([13, 95, 5]), Label shape: torch.Size([13, 5])

--Event 0.0-- 
[[0.2411437  0.         0.         0.24

In [132]:
''' 
Evaluation -------------------------------------------------------------------

[batch size][max sequence length][input size/number of features]
'''
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sequences, labels in test_loader:
            labels = torch.argmax(labels, dim=1).to(DEVICE)
            outputs = model(sequences.to(DEVICE))
            predicted = torch.argmax(outputs, dim=1).to(DEVICE)

            print(predicted)
            print(labels, '\n')

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")

evaluate_model(model, test_loader)

tensor([2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 2], device='cuda:0')
tensor([3, 2, 4, 0, 4, 3, 0, 0, 3, 1, 2, 1, 3], device='cuda:0') 

tensor([2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2], device='cuda:0')
tensor([1, 0, 1, 3, 2, 0, 0, 3, 0, 4, 0, 3, 0], device='cuda:0') 

tensor([4, 2, 2, 2, 2, 2, 0, 2, 4, 0, 2, 2, 4], device='cuda:0')
tensor([4, 3, 1, 3, 1, 2, 0, 4, 4, 4, 3, 1, 4], device='cuda:0') 

tensor([2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 4, 2], device='cuda:0')
tensor([1, 4, 2, 1, 4, 0, 0, 3, 2, 1, 1, 4, 1], device='cuda:0') 

tensor([4, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')
tensor([4, 1, 0, 2, 4, 0, 4, 1, 3, 2, 3, 2, 2], device='cuda:0') 

tensor([2, 0, 0, 2, 2, 0, 4, 2, 2, 2, 0, 0, 0], device='cuda:0')
tensor([2, 0, 3, 2, 3, 4, 4, 2, 3, 0, 4, 2, 0], device='cuda:0') 

tensor([2, 2, 2, 0, 2, 4, 2, 2, 2, 2, 2, 0, 4], device='cuda:0')
tensor([2, 4, 0, 0, 3, 4, 1, 2, 3, 1, 0, 0, 4], device='cuda:0') 

tensor([4, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2], device='cuda:0')
tensor([0, 