In [1]:
import pyarrow.parquet as pq
import pyarrow as pa

# Peek at the head of the feature table without loading the whole parquet file.
pf = pq.ParquetFile('processed_events/normalized_features_all.parquet')

# iter_batches yields record batches one at a time; requesting batches of
# 5 rows and taking only the first gives the leading rows (at most 5).
batch_stream = pf.iter_batches(batch_size=5)
first_batch = next(batch_stream)

# Wrap the single batch in a Table, then convert it to pandas for rich display.
preview_table = pa.Table.from_batches([first_batch])
df = preview_table.to_pandas()

df


Unnamed: 0,kinetic_energy,primary_kinetic_energy,X_transformed,Y_transformed,Z_transformed,distance,time_transformed,sin_azimuth,cos_azimuth,sin_zenith,cos_zenith,event_id,pdg,plane
0,-0.2058,-1.490094,0.601975,-0.010381,0.077629,0.079031,0.554574,0.105454,0.994424,0.046285,0.998928,33619162.0,13.0,18.0
1,-0.351933,-1.490094,0.49733,-0.436993,-0.122028,-0.1207,0.504512,0.105454,0.994424,0.046285,0.998928,33619162.0,13.0,17.0
2,0.937462,-1.490094,-0.177233,0.073647,-1.519858,-1.518812,0.295731,0.105454,0.994424,0.046285,0.998928,33619162.0,11.0,10.0
3,-0.823414,-1.490094,1.294355,0.044384,1.275315,1.277413,0.75867,0.105454,0.994424,0.046285,0.998928,33619162.0,22.0,0.0
4,0.047583,-1.490094,-0.394172,0.037591,-1.919142,-1.918272,0.207371,0.105454,0.994424,0.046285,0.998928,33619162.0,22.0,8.0


In [None]:
from nn import ShowerDataset

# PDG particle codes used as classification targets:
# 11 = electron, 13 = muon, 22 = photon.
# All three must be listed (not [113, 22]): the parquet preview contains pdg
# values 11, 13 and 22, and the per-row labels of the first event use three
# class indices (0, 1, 2).
pdg_classes = [11, 13, 22]
hit_file = "../ml/processed_events/normalized_features_z_3.parquet"
n_planes = 24

# normalize_events=True is forwarded to ShowerDataset as-is; its exact effect is
# defined in the project's `nn` module (not visible here).
full_dataset = ShowerDataset(hit_file, pdg_classes, n_planes=n_planes, normalize_events=True)

In [3]:
# Fetch the first dataset item and split it into its four components:
# features, per-row pdg labels, event-level targets and the event id.
first_sample = full_dataset[0]
X_event, y_pdg, y_event, eid = first_sample

In [4]:
# Feature tensor of the sample: report its dimensions first, then show the values.
print(X_event.size())
X_event

torch.Size([9637, 10])


tensor([[-0.2167, -1.4910,  0.7241,  ...,  0.9944,  0.0463,  0.9989],
        [-0.3636, -1.4910,  0.5984,  ...,  0.9944,  0.0463,  0.9989],
        [ 0.9325, -1.4910, -0.2120,  ...,  0.9944,  0.0463,  0.9989],
        ...,
        [-0.4883, -1.4910,  1.4600,  ...,  0.9944,  0.0463,  0.9989],
        [ 0.2629, -1.4910, -0.3449,  ...,  0.9944,  0.0463,  0.9989],
        [-0.9302, -1.4910,  0.9090,  ...,  0.9944,  0.0463,  0.9989]])

In [5]:
# Class-index labels (one per row of X_event): dimensions first, then the values.
print(y_pdg.size())
y_pdg

torch.Size([9637])


tensor([1, 1, 0,  ..., 2, 2, 2])

In [6]:
# Event-level targets (not the pdg labels): dimensions first, then the values.
# NOTE(review): the length equals n_planes * number of classes (24 * 3 = 72),
# so this is presumably a flattened per-plane, per-class target; confirm
# against ShowerDataset.
print(y_event.size())
y_event

torch.Size([72])


tensor([-0.4638,  0.0000,  0.0000, -0.6967, -0.8520, -0.4403, -0.4071,  0.0401,
         0.7275,  0.8144,  0.3814,  0.7015,  1.0080,  0.7554,  1.4223,  0.6572,
         0.5817,  0.6211,  0.3052, -0.1175, -0.0334, -0.4407, -0.4738, -0.8192,
        -0.1374,  0.0000,  0.0000, -0.6126, -0.7564, -0.2625, -0.0102,  0.2183,
         0.8354,  1.2027,  1.1399,  1.8008,  1.4033,  1.9956,  1.2488,  1.8098,
         1.5405,  0.8575,  1.1532,  0.7627,  0.3966,  0.4776,  0.1276, -0.1104,
        -0.6776,  0.0000,  0.0000, -0.7903, -0.9128, -0.2298, -0.3656,  0.1490,
         0.2732,  0.3530,  0.4148,  0.6826,  0.7298,  0.8696,  0.8157,  0.4673,
        -0.0743, -0.1007, -0.0502, -0.3481, -0.6152, -0.9015, -0.8071, -0.6008])

In [16]:
# Event ID of the sample unpacked from full_dataset[0].
# It is returned as a float, like the event_id column in the parquet preview.
eid

33619162.0

In [2]:
from nn import ShowerNetMultiTask
from nn import ShowerDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchviz import make_dot
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split

# --- Configuration -----------------------------------------------------------
hidden_dim = 128
n_planes = 24
pdg_classes = [11, 13, 22]
hit_file = "../ml/processed_events/normalized_features_z_3.parquet"

# --- Dataset and event-level split ------------------------------------------
full_dataset = ShowerDataset(hit_file, pdg_classes, n_planes=n_planes, normalize_events=True)
event_ids = full_dataset.event_ids

# The first event is held out as the single test sample; the remaining events
# are split 90/10 into train/validation with a fixed seed.
test_eid = event_ids[0]
remaining_ids = event_ids[1:]
train_ids, val_ids = train_test_split(remaining_ids, test_size=0.1, random_state=42)

# Subset works on positional indices, so map each event id back to its position.
id_to_idx = {event_id: idx for idx, event_id in enumerate(event_ids)}
train_idx = [id_to_idx[event_id] for event_id in train_ids]
val_idx = [id_to_idx[event_id] for event_id in val_ids]
test_idx = [id_to_idx[test_eid]]

train_dataset = Subset(full_dataset, train_idx)
val_dataset = Subset(full_dataset, val_idx)
test_dataset = Subset(full_dataset, test_idx)

# --- Model -------------------------------------------------------------------
input_dim = len(full_dataset.feature_cols)
n_classes = len(pdg_classes)
n_event_outputs = len(pdg_classes)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# NOTE(review): `device` is only reported; the model is never moved to it, so the
# forward pass below uses the default (CPU) placement even when CUDA is available.
model = ShowerNetMultiTask(input_dim, hidden_dim, n_classes, n_event_outputs, n_planes=n_planes)


def collate_events(samples):
    """Transpose a list of dataset samples into one tuple per field.

    Unlike the default collate function, nothing is stacked: each field of
    the batch is a tuple holding one entry per sample.
    """
    return list(zip(*samples))


loader = DataLoader(test_dataset, batch_size=1, shuffle=False,
                    collate_fn=collate_events)

# A single batch is enough for inspection. Gradients are deliberately left
# enabled: the autograd graph of `output` is rendered with make_dot later on.
X_batch, y_pdg_batch, y_event_batch, eids = next(iter(loader))
output = model(X_batch)


Using device: cpu


In [3]:
# Re-run the forward pass, then report the type of the returned container
# followed by the type of each of its elements.
output = model(X_batch)
element_types = list(map(type, output))
print(type(output), element_types)


<class 'tuple'> [<class 'list'>, <class 'torch.Tensor'>]


In [4]:
# Trace the autograd graph behind the second model output. make_dot is given a
# scalar (the mean) together with the model's parameters keyed by name.
graph_root = output[1].mean()
named_params = dict(model.named_parameters())
dot = make_dot(graph_root, params=named_params)

# Write the graph to disk as a PNG; render() returns the name of the file written.
dot.format = 'png'
dot.render('simple_net')

'simple_net.png'

In [6]:
# Dimensions of the second model output (a single tensor).
output[1].size()

torch.Size([1, 24, 3])

In [10]:
# Dimensions of the first entry in the first model output (a list).
output[0][0].size()

torch.Size([9637, 3])