In [1]:
import os
import sys
os.getcwd()

'c:\\Users\\ebaca\\Desktop\\Phys 417\\Final Project - HEP Tagging'

In [2]:
# importing libraries & making torch.device object for GPU

# neural network packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

from torch import Tensor
# from torch.nn import Transformer
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.nn.utils.rnn import pad_sequence

# data packages
import numpy as np
import math
import pandas as pd
# import sklearn.preprocessing as prep
# from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# visual packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tqdm
import warnings

# Create a torch.device object to tell pytorch where to store your tensors: cpu or gpu
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
onehots = {
    (1, 0, 0, 0, 0): ['Gluon', 0],
    (0, 1, 0, 0, 0): ['Light Quark', 0],
    (0, 0, 1, 0, 0): ['W Boson', 0],
    (0, 0, 0, 1, 0): ['Z Boson', 0],
    (0, 0, 0, 0, 1): ['Top Quark', 0]
}
# print(list(onehots.values()))

files = shuffle([os.path.join('../../PHYS417_Project/data/'+ f) for f in os.listdir('../../PHYS417_Project/data')])

for i, f in enumerate(files):
    # with np.load('../../PHYS417_Project/data/' + f) as data_load:
    with np.load(f) as data_load:
        y = data_load['y']

        if tuple(y) in onehots:
            onehots[tuple(y)][1] += 1

display(onehots)

{(1, 0, 0, 0, 0): ['Gluon', 3942],
 (0, 1, 0, 0, 0): ['Light Quark', 3835],
 (0, 0, 1, 0, 0): ['W Boson', 4150],
 (0, 0, 0, 1, 0): ['Z Boson', 4053],
 (0, 0, 0, 0, 1): ['Top Quark', 4020]}

In [4]:
# Define your model architecture
class JetClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(JetClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# custom PyTorch dataset (for DataLoader)
class JetDataset(Dataset):
    def __init__(self, files):
        self.files = files

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        with np.load(file_path) as data_load:
            x_load = torch.tensor(data_load['x'], dtype=torch.float32)
            y_load = torch.tensor(data_load['y'], dtype=torch.float32)
        return x_load, y_load

# the collate function lets the model handle variable-length sequences
def collate_fn(batch):
    # sorting batch in descending order of # of constituents
    # batch.sort(key=lambda x: x[0].size(0), reverse=True)   # option to sort by sequence length
    inputs, labels = zip(*batch)
    # adding padding to the sequences
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    # labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)
    labels = torch.stack(labels)  # Ensure labels are stacked without padding
    return inputs_padded, labels

# function to extract numpy arrays from tensors
extractor = lambda x: x.cpu().detach().numpy()



'''Extracting Files ----------------------------------------------------------'''

print(f'First 70% of files: {len(files[:int(0.7*len(files))])} \nLast 30% of files: {len(files[int(0.7*len(files)):])}')



'''Splitting Files ----------------------------------------------------------'''

# splitting into training/testing sets
train_files = files[:int(0.7*len(files))]
test_files = files[int(0.7*len(files)):]

print(f"{len(train_files)} Training Files  + {len(test_files)} Testing Files  =  {len(files)} Total Files\n")
print(f'First 5 training files: {train_files[:5]} \nFirst 5 testing files: {test_files[:5]}\n')



'''Converting to Dataset Objects --------------------------------------------------------'''

train_dataset = JetDataset(train_files)
test_dataset = JetDataset(test_files)

# train_dataset[event n][input/label][constituent n, property n]
print(f'TrSet (event 0): {train_dataset[0][0].shape} \n{train_dataset[0][0][:1]} \nOnehot vector: {train_dataset[0][1]}\n')
# print(f'TeSet First const. (event 0): \n{test_dataset[0][0][:1]} \nOnehot vector: {test_dataset[0][1]}\n')



'''Creating DataLoaders/Applying Collate Fn (Padding) --------------------------------------------------------'''

class_weights = [1 / count for count in [value[1] for value in onehots.values()]]
weights = [
    class_weights[torch.argmax(tag).item()] 
        for tag in [train_dataset[i][1] 
                for i in range(len(train_dataset))]
]
sampler = WeightedRandomSampler(weights, len(train_dataset), replacement=True)

# make datasets into DataLoader objects and apply collate_fn
train_loader = DataLoader(train_dataset, batch_size=13, shuffle=False, collate_fn=collate_fn, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=13, shuffle=False, collate_fn=collate_fn, sampler=sampler)

# inspecting how padding went in collate_fn
# loader[input or label][batch n][event e][constituent n, property n]
n=0
for nbatch_x, nbatch_y in train_loader: # loops through n batches or loader[input/label][batch n]
    # inputs = array containing each events' data['x'] in the batch
    # labels = array containing each events' data['y'] in the batch
    print(f'\n --------------------------------------- [Batch {n}] --------------------------------------- \n')
    print(f'Input shape: {nbatch_x.shape}, Label shape: {nbatch_y.shape}\n') # general shape of the batch

    e=0
    # printing first 5 events in the batch
    for (properties, tag) in zip(nbatch_x[:2], nbatch_y[:2]):
        # printing the first 5 constituents/properties
        print(f'--Event {n}.{e}--',
              f'\n{extractor(properties[:1])} \n{extractor(properties[-1:])}, \nTag: {extractor(tag)}\n')
        e+=1
    
    # to stop after some number of events
    n+=1
    if n == 1: 
        break



'''
Initializing Model & Hyperparameters ----------------------------------------------------------
'''

# Initialize the model
input_size = 5  # Number of features for each constituent
hidden_size = 1280
num_layers = 1
num_classes = 5  # Number of categories for jet classification

model = JetClassifier(input_size, hidden_size, num_layers, num_classes).to(DEVICE)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss().to(DEVICE)  # Binary Cross Entropy Loss for multi-label classification
# criterion = nn.CrossEntropyLoss().to(DEVICE)  # includes softmax layer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



''' 
Training -------------------------------------------------------------------
'''
# [batch size][max sequence length][input size/number of features]

num_epochs = 10
for epoch in tqdm.trange(num_epochs):
    for events, tags in train_loader:
        events, onehots = events.to(DEVICE), tags.to(DEVICE)
        # tags = torch.argmax(onehots, dim=1) # Convert one-hot encoded labels to single integer labels

        optimizer.zero_grad()
        outputs = model(events)

        # loss = criterion(outputs, tags)
        loss = criterion(outputs, onehots)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    print(f'Inputs: {events.shape}  \nOnehot: {extractor(onehots[:1][0])}  ArgMax Index: {extractor(onehots[:1].argmax(dim=1))}')
    softmax = nn.Softmax(dim=1)
    probabilities = softmax(outputs)
    print(f'Outputs: {extractor(outputs[:1][0])}')
    print(f'Probabilities: {extractor(probabilities[:1][0])}')
    print('\n')


First 70% of files: 14000 
Last 30% of files: 6000
14000 Training Files  + 6000 Testing Files  =  20000 Total Files

First 5 training files: ['../../PHYS417_Project/data/event_0210843.npz', '../../PHYS417_Project/data/event_0853950.npz', '../../PHYS417_Project/data/event_0150345.npz', '../../PHYS417_Project/data/event_0063113.npz', '../../PHYS417_Project/data/event_0629457.npz'] 
First 5 testing files: ['../../PHYS417_Project/data/event_0572190.npz', '../../PHYS417_Project/data/event_0383752.npz', '../../PHYS417_Project/data/event_0119255.npz', '../../PHYS417_Project/data/event_0786239.npz', '../../PHYS417_Project/data/event_0612971.npz']

TrSet (event 0): torch.Size([31, 5]) 
tensor([[0.7103, -0.0000, 0.0000, 0.7085, 0.0018]]) 
Onehot vector: tensor([0., 0., 0., 1., 0.])


 --------------------------------------- [Batch 0] --------------------------------------- 

Input shape: torch.Size([13, 97, 5]), Label shape: torch.Size([13, 5])

--Event 0.0-- 
[[ 0.1969187  -0.          0.      

 10%|█         | 1/10 [00:26<03:58, 26.52s/it]

Epoch [1/10], Loss: 0.4991
Inputs: torch.Size([12, 127, 5])  
Onehot: [0. 0. 0. 1. 0.]  ArgMax Index: [3]
Outputs: [-1.641184  -1.2442324 -1.4228164 -1.3074638 -1.4104962]
Probabilities: [0.15656933 0.23286304 0.19477917 0.21859467 0.19719374]




 20%|██        | 2/10 [00:52<03:31, 26.40s/it]

Epoch [2/10], Loss: 0.4954
Inputs: torch.Size([12, 102, 5])  
Onehot: [0. 1. 0. 0. 0.]  ArgMax Index: [1]
Outputs: [-1.5564519 -1.1370878 -1.246305  -1.3278059 -1.7185264]
Probabilities: [0.16689213 0.25384194 0.22757836 0.20976622 0.14192137]




 30%|███       | 3/10 [01:18<03:02, 26.09s/it]

Epoch [3/10], Loss: 0.4876
Inputs: torch.Size([12, 96, 5])  
Onehot: [0. 0. 1. 0. 0.]  ArgMax Index: [2]
Outputs: [-2.1131055  -1.0156972  -0.84513044 -1.2548186  -1.8590342 ]
Probabilities: [0.08929832 0.2675726  0.31733492 0.21066509 0.11512908]




 40%|████      | 4/10 [01:44<02:35, 25.95s/it]

Epoch [4/10], Loss: 0.4529
Inputs: torch.Size([12, 95, 5])  
Onehot: [1. 0. 0. 0. 0.]  ArgMax Index: [0]
Outputs: [-0.28520775 -2.9130874  -3.5405474  -3.3613315  -0.50127983]
Probabilities: [0.5095245  0.0368037  0.0196512  0.02350832 0.4105123 ]




 50%|█████     | 5/10 [02:09<02:08, 25.69s/it]

Epoch [5/10], Loss: 0.4512
Inputs: torch.Size([12, 81, 5])  
Onehot: [0. 1. 0. 0. 0.]  ArgMax Index: [1]
Outputs: [-1.3457868 -1.38286   -1.0999743 -1.1412121 -2.3141077]
Probabilities: [0.20622857 0.19872302 0.2636962  0.2530431  0.07830913]




 60%|██████    | 6/10 [02:34<01:42, 25.53s/it]

Epoch [6/10], Loss: 0.3255
Inputs: torch.Size([12, 88, 5])  
Onehot: [0. 1. 0. 0. 0.]  ArgMax Index: [1]
Outputs: [-0.23390654 -1.8668593  -2.4021192  -2.1355963  -1.6707519 ]
Probabilities: [0.58937025 0.11513459 0.06741328 0.08800247 0.14007936]




 70%|███████   | 7/10 [03:00<01:16, 25.44s/it]

Epoch [7/10], Loss: 0.2908
Inputs: torch.Size([12, 97, 5])  
Onehot: [0. 0. 0. 0. 1.]  ArgMax Index: [4]
Outputs: [-2.3323786 -2.1303637 -5.2832727 -3.8702602  1.2144938]
Probabilities: [0.0268849  0.03290352 0.00140589 0.00577584 0.93302983]




 80%|████████  | 8/10 [03:25<00:50, 25.43s/it]

Epoch [8/10], Loss: 0.3519
Inputs: torch.Size([12, 57, 5])  
Onehot: [0. 1. 0. 0. 0.]  ArgMax Index: [1]
Outputs: [-3.1431987   0.09539345 -1.1318198  -1.5311161  -4.1771116 ]
Probabilities: [0.02541925 0.6481349  0.18997368 0.12743281 0.00903941]




 90%|█████████ | 9/10 [03:50<00:25, 25.43s/it]

Epoch [9/10], Loss: 0.3667
Inputs: torch.Size([12, 84, 5])  
Onehot: [0. 0. 1. 0. 0.]  ArgMax Index: [2]
Outputs: [-0.6090559  -0.50906336 -1.752686   -2.2136772  -3.8611157 ]
Probabilities: [0.37544826 0.4149314  0.11964039 0.0754522  0.01452775]




100%|██████████| 10/10 [04:16<00:00, 25.62s/it]

Epoch [10/10], Loss: 0.3045
Inputs: torch.Size([12, 98, 5])  
Onehot: [0. 1. 0. 0. 0.]  ArgMax Index: [1]
Outputs: [-3.1691008  1.4804101 -2.38379   -2.7353785 -4.8923864]
Probabilities: [0.00913673 0.95509696 0.02003765 0.01409787 0.00163071]







In [6]:
''' 
Evaluation -------------------------------------------------------------------

[batch size][max sequence length][input size/number of features]
'''
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sequences, labels in test_loader:
            labels = torch.argmax(labels, dim=1).to(DEVICE)
            outputs = model(sequences.to(DEVICE))
            predicted = torch.argmax(outputs, dim=1).to(DEVICE)

            print(predicted)
            print(labels, '\n')

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")

evaluate_model(model, test_loader)

IndexError: list index out of range