# CaloChallenge dataset reformatting

In [1]:
# imports
import os

import h5py
import matplotlib.pyplot as plt
import numpy as np

from HighLevelFeatures import HighLevelFeatures as HLF

# Dataset 1:

In [2]:
# One HighLevelFeatures helper per particle type; each is built from the
# binning file that describes the geometry of the corresponding dataset.
HLF_1_photons = HLF(
    'photon',
    filename='binning_dataset_1_photons.xml',
)
HLF_1_pions = HLF(
    'pion',
    filename='binning_dataset_1_pions.xml',
)

In [3]:
# Open the input .hdf5 files read-only. The handles are kept open on purpose:
# the cells below read from them.
# Alternative inputs that were used before:
#   photons: '../dataset_1_photons_1.hdf5'
#   pions:   '../dataset_1_pions_1.hdf5'
input_paths = (
    '../CaloFlow_gamma_student_test.hdf5',
    '../dataset_1_photons_2.hdf5',
    '../CaloFlow_piplus_teacher.hdf5',
)
photon1_file, photon2_file, pion_file = (h5py.File(path, 'r') for path in input_paths)

In [4]:
# each file contains one dataset for the incident energy and one for the showers.
# List the name and shape of every dataset in each input file. The shape is
# taken from the dataset handle, so no shower data is read into memory here.
labelled_files = (
    ("photon_1", photon1_file),
    ("photon_2", photon2_file),
    ("pion", pion_file),
)
for position, (label, h5_file) in enumerate(labelled_files):
    if position > 0:
        # blank separator between two files (none after the last one)
        print('\n')
    print(label + ":")
    for dataset in h5_file:
        # name of the datasets:
        print("dataset name: ", dataset)
        print("dataset shape:", h5_file[dataset].shape)

photon_1:
dataset name:  incident_energies
dataset shape: (121000, 1)
dataset name:  showers
dataset shape: (121000, 368)


photon_2:
dataset name:  incident_energies
dataset shape: (121000, 1)
dataset name:  showers
dataset shape: (121000, 368)


pion:
dataset name:  incident_energies
dataset shape: (120230, 1)
dataset name:  showers
dataset shape: (120230, 533)


<font size="5">__Photon_1:__</font>

In [5]:
# Collect the basic quantities for the photon sample first, then report them.

# total number of events in the photon file opened as photon1_file
n_events = photon1_file["incident_energies"].shape[0]
# number of alpha bins in each layer, from the photon binning helper
num_alpha = HLF_1_photons.num_alpha
# r edges in each layer, from the photon binning helper
r_edges = HLF_1_photons.r_edges
# total number of calorimeter layers (one num_alpha entry per layer)
n_layers = len(num_alpha)

print('Total number of events: ',n_events)
print('Number of alpha bins per layer: ', num_alpha)
print('r_edges: ', r_edges)
print('Number of calorimeter layers: ',n_layers)

Total number of events:  121000
Number of alpha bins per layer:  [1, 10, 10, 1, 1]
r_edges:  [[0.0, 5.0, 10.0, 30.0, 50.0, 100.0, 200.0, 400.0, 600.0], [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 15.0, 20.0, 30.0, 40.0, 50.0, 70.0, 90.0, 120.0, 150.0, 200.0], [0.0, 2.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 40.0, 50.0, 60.0, 80.0, 100.0, 130.0, 160.0, 200.0, 250.0, 300.0, 350.0, 400.0], [0.0, 50.0, 100.0, 200.0, 400.0, 600.0], [0.0, 100.0, 200.0, 400.0, 1000.0, 2000.0]]
Number of calorimeter layers:  5


In [6]:
# number of r bins per layer: one fewer than the number of r edges of that layer
r_bins = [len(r_edges[layer]) - 1 for layer in range(n_layers)]
print('Number of radius bins per layer: ', r_bins)

Number of radius bins per layer:  [8, 16, 19, 5, 5]


In [7]:
# 'layer_edges' stores, for every layer, the number of entries that layer
# occupies in a shower row: (alpha bins) * (r bins). Despite the name these are
# per-layer sizes, not cumulative positions.
layer_edges = [num_alpha[layer] * r_bins[layer] for layer in range(n_layers)]
print('Layer edges: ',layer_edges)

Layer edges:  [8, 160, 190, 5, 5]


<font color='blue' size=3>Next, we reformat the dataset into the structure: 'energy', 'layer_0', 'layer_1', ..., 'layer_4'. Note that 'energy' has shape (n_events, 1), while 'layer_i' has shape (n_events, num_alpha_i * r_bins_i).</font>

In [8]:
# Split the flat photon 'showers' rows into one array per calorimeter layer.

# incident energies: kept as the h5py dataset handle of the input file
energies = photon1_file["incident_energies"]
print('energies shape: ',energies.shape)
print(energies)

showers = photon1_file["showers"]

# 'layer_edges' holds the number of entries per layer. Check that they add up
# to the width of a shower row, so that a mismatch between the binning file and
# the data file is reported instead of producing misaligned layers.
if sum(layer_edges) != showers.shape[1]:
    raise ValueError(
        "binning describes %d entries per shower, but the file has %d"
        % (sum(layer_edges), showers.shape[1])
    )

# Walk through the row with a running offset instead of re-summing the sizes
# by hand for every layer.
layers = []
start = 0
for index, size in enumerate(layer_edges):
    stop = start + size
    layers.append(showers[:, start:stop])
    print(f'layer_{index} shape: ', layers[-1].shape)
    start = stop

# The saving cell expects these five names; unpacking also fails loudly if the
# binning does not describe exactly five layers.
layer_0, layer_1, layer_2, layer_3, layer_4 = layers

energies shape:  (121000, 1)
<HDF5 dataset "incident_energies": shape (121000, 1), type "<f8">
layer_0 shape:  (121000, 8)
layer_1 shape:  (121000, 160)
layer_2 shape:  (121000, 190)
layer_3 shape:  (121000, 5)
layer_4 shape:  (121000, 5)


In [9]:
# Optional step, currently disabled (every line below is commented out):
# sort the events by incident energy, reorder the layers accordingly, and keep
# only the events from sorted position 110000 onwards, i.e. the highest
# incident energies. The resulting '*_high' arrays match the commented-out
# create_dataset lines in the saving cell.
#energies_idx = np.argsort(energies[:],axis=0)
#energies_idx = np.squeeze(energies_idx)
#print(energies_idx)
#energies_sorted=np.sort(energies[:],axis=0)
#print(energies_sorted)
#layer_0_sorted = layer_0[energies_idx,:]
#layer_1_sorted = layer_1[energies_idx,:]
#layer_2_sorted = layer_2[energies_idx,:]
#layer_3_sorted = layer_3[energies_idx,:]
#layer_4_sorted = layer_4[energies_idx,:]

#energies_high = energies_sorted[110000:]
#layer_0_high = layer_0_sorted[110000:]
#layer_1_high = layer_1_sorted[110000:]
#layer_2_high = layer_2_sorted[110000:]
#layer_3_high = layer_3_sorted[110000:]
#layer_4_high = layer_4_sorted[110000:]

#print(energies_high.shape)
#print(layer_0_high.shape)
#print(layer_1_high.shape)
#print(layer_2_high.shape)
#print(layer_3_high.shape)
#print(layer_4_high.shape)



In [10]:
# Create reformatted datafile
filename = './reformatted_data/gamma_student.hdf5' #rename accordingly

# Make sure the output directory exists before opening the file for writing.
output_dir = os.path.dirname(filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Datasets written to the output file, as (name, data) pairs.
# To write the high-energy subset instead, swap in the '*_high' arrays from the
# disabled sorting cell (layer_0_high ... layer_4_high, energies_high).
photon_datasets = (
    ('layer_0', layer_0),
    ('layer_1', layer_1),
    ('layer_2', layer_2),
    ('layer_3', layer_3),
    ('layer_4', layer_4),
    ('energy', energies),
)

# The context manager closes the file even if writing one dataset fails.
with h5py.File(filename, 'w') as save_file:
    for dataset_name, values in photon_datasets:
        save_file.create_dataset(dataset_name, data=values)

# Pions:

In [5]:
# Collect the basic quantities for the pion sample first, then report them.

# total number of events in the pion file opened as pion_file
n_events = pion_file["incident_energies"].shape[0]
# number of alpha bins in each layer, from the pion binning helper
num_alpha = HLF_1_pions.num_alpha
# r edges in each layer, from the pion binning helper
r_edges = HLF_1_pions.r_edges
# total number of calorimeter layers (one num_alpha entry per layer)
n_layers = len(num_alpha)

print('Total number of events: ',n_events)
print('Number of alpha bins per layer: ', num_alpha)
print('r_edges: ', r_edges)
print('Number of calorimeter layers: ',n_layers)

Total number of events:  120230
Number of alpha bins per layer:  [1, 10, 10, 1, 10, 10, 1]
r_edges:  [[0.0, 5.0, 10.0, 30.0, 50.0, 100.0, 200.0, 400.0, 600.0], [0.0, 1.0, 4.0, 7.0, 10.0, 15.0, 30.0, 50.0, 90.0, 150.0, 200.0], [0.0, 5.0, 10.0, 20.0, 30.0, 50.0, 80.0, 130.0, 200.0, 300.0, 400.0], [0.0, 50.0, 100.0, 200.0, 400.0, 600.0], [0.0, 10.0, 20.0, 30.0, 50.0, 80.0, 100.0, 130.0, 160.0, 200.0, 250.0, 300.0, 350.0, 400.0, 1000.0, 2000.0], [0.0, 10.0, 20.0, 30.0, 50.0, 80.0, 100.0, 130.0, 160.0, 200.0, 250.0, 300.0, 350.0, 400.0, 600.0, 1000.0, 2000.0], [0.0, 50.0, 100.0, 150.0, 200.0, 250.0, 300.0, 400.0, 600.0, 1000.0, 2000.0]]
Number of calorimeter layers:  7


In [6]:
# number of r bins per layer: one fewer than the number of r edges of that layer
r_bins = [len(r_edges[layer]) - 1 for layer in range(n_layers)]
print('Number of radius bins per layer: ', r_bins)

Number of radius bins per layer:  [8, 10, 10, 5, 15, 16, 10]


In [7]:
# 'layer_edges' stores, for every layer, the number of entries that layer
# occupies in a shower row: (alpha bins) * (r bins). Despite the name these are
# per-layer sizes, not cumulative positions.
layer_edges = [num_alpha[layer] * r_bins[layer] for layer in range(n_layers)]
print('Layer edges: ',layer_edges)

Layer edges:  [8, 100, 100, 5, 150, 160, 10]


<font color='blue' size=3>Next, we reformat the dataset into the structure: 'energy', 'layer_0', 'layer_1', ..., 'layer_6'. Note that 'energy' has shape (n_events, 1), while 'layer_i' has shape (n_events, num_alpha_i * r_bins_i).</font>

In [8]:
# Split the flat pion 'showers' rows into one array per calorimeter layer.

# incident energies: kept as the h5py dataset handle of the input file
energies = pion_file["incident_energies"]
print('energies shape: ',energies.shape)
print(energies)

showers = pion_file["showers"]

# 'layer_edges' holds the number of entries per layer. Check that they add up
# to the width of a shower row, so that a mismatch between the binning file and
# the data file is reported instead of producing misaligned layers.
if sum(layer_edges) != showers.shape[1]:
    raise ValueError(
        "binning describes %d entries per shower, but the file has %d"
        % (sum(layer_edges), showers.shape[1])
    )

# Walk through the row with a running offset instead of re-summing the sizes
# by hand for every layer.
layers = []
start = 0
for index, size in enumerate(layer_edges):
    stop = start + size
    layers.append(showers[:, start:stop])
    print(f'layer_{index} shape: ', layers[-1].shape)
    start = stop

# The saving cell expects these seven names; unpacking also fails loudly if the
# binning does not describe exactly seven layers.
layer_0, layer_1, layer_2, layer_3, layer_4, layer_5, layer_6 = layers

energies shape:  (120230, 1)
<HDF5 dataset "incident_energies": shape (120230, 1), type "<f8">
layer_0 shape:  (120230, 8)
layer_1 shape:  (120230, 100)
layer_2 shape:  (120230, 100)
layer_3 shape:  (120230, 5)
layer_4 shape:  (120230, 150)
layer_5 shape:  (120230, 160)
layer_6 shape:  (120230, 10)


In [9]:
# Create reformatted datafile
filename = './reformatted_data/piplus_teacher.hdf5' #rename accordingly

# Make sure the output directory exists before opening the file for writing.
output_dir = os.path.dirname(filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Datasets written to the output file, as (name, data) pairs.
pion_datasets = (
    ('layer_0', layer_0),
    ('layer_1', layer_1),
    ('layer_2', layer_2),
    ('layer_3', layer_3),
    ('layer_4', layer_4),
    ('layer_5', layer_5),
    ('layer_6', layer_6),
    ('energy', energies),
)

# The context manager closes the file even if writing one dataset fails.
with h5py.File(filename, 'w') as save_file:
    for dataset_name, values in pion_datasets:
        save_file.create_dataset(dataset_name, data=values)