# Building an HDF5 (`.h5`) dataset from a ROOT file

In [79]:
import uproot
import h5py
import numpy as np
from tqdm import tqdm #This module gives you a live progress bar for a loop


Load the data using uproot as we normally do, then read the jet branches into NumPy arrays.

In [3]:
# Input configuration: the TTree to read and the ROOT file that contains it
input_ttree_name = "nominal"
input_filename = "Samples/ttbar_200k_dilep.root"

In [73]:
# Open the TTree by passing uproot a single "<file path>:<tree name>" string
tree_source = ":".join((input_filename, input_ttree_name))
tree = uproot.open(tree_source)

In [74]:
# Total number of events stored in the tree
Nevents = tree.num_entries

# Read the four jet kinematic branches as NumPy arrays; each array is indexed
# by event and holds that event's per-jet values.
jet_pt, jet_eta, jet_phi, jet_e = (
    tree[branch].array(library='np')
    for branch in ("jet_pt", "jet_eta", "jet_phi", "jet_e")
)


In [75]:
# Largest jet multiplicity found in any event; this sets how many columns are
# needed to hold every event's jets. The built-in `max` replaces the manual
# running-maximum loop, and `default=0` keeps the result at 0 when there are
# no events at all.
max_jets = max((len(jet_pt[i]) for i in range(Nevents)), default=0)

We will store the jets in a NumPy array with a custom (structured) data type.
First, create a data type for four-momenta with the fields `pt`, `eta`, `phi` and `e`.

In [76]:
# Structured dtype for one four-momentum: four single-precision fields, in
# the order pt, eta, phi, e.
four_momenta_fields = ('pt', 'eta', 'phi', 'e')
four_momenta_dt = np.dtype([(field, np.float32) for field in four_momenta_fields])

We need to create a zero-initialised NumPy array to fill, with `Nevents` rows
and `max_jets` columns. Events with fewer than `max_jets` jets keep zeros in their unused slots.

In [77]:
# One row per event and one column per jet slot; every field starts at zero
jet_data_shape = (Nevents, max_jets)
jet_data = np.zeros(jet_data_shape, dtype=four_momenta_dt)

Fill the `jet_data` numpy array from the loaded data

In [78]:
# Copy each event's jets into the first `num_jets` slots of its row; the
# remaining slots keep the zeros that `jet_data` was created with.
# Each field is written with a single slice assignment per event instead of a
# Python loop over individual jets, which gives the same result with far fewer
# interpreter-level operations.
for i in tqdm(range(Nevents)):
    num_jets = len(jet_pt[i])
    # As before, only the first `num_jets` values of every branch are used
    jet_data['pt'][i, :num_jets] = jet_pt[i]
    jet_data['eta'][i, :num_jets] = jet_eta[i][:num_jets]
    jet_data['phi'][i, :num_jets] = jet_phi[i][:num_jets]
    jet_data['e'][i, :num_jets] = jet_e[i][:num_jets]

        

100%|██████████| 100000/100000 [00:00<00:00, 157581.02it/s]


`jet_data` has now been populated

In [71]:
# Print the filled array. The trailing all-zero entries in a row are the unused
# slots of events that have fewer than `max_jets` jets.
print(jet_data)

[[( 98.194275,  0.80488354,  0.01690417, 132.37424 )
  ( 97.47331 , -1.1130863 , -2.9167042 , 164.60107 )
  ( 25.289927, -1.5169365 ,  0.08937796,  60.50155 ) ...
  (  0.      ,  0.        ,  0.        ,   0.      )
  (  0.      ,  0.        ,  0.        ,   0.      )
  (  0.      ,  0.        ,  0.        ,   0.      )]
 [(165.24458 ,  1.6151805 ,  1.5789794 , 432.06485 )
  ( 55.44954 , -0.6764963 ,  2.6380312 ,  68.86536 )
  ( 52.215954,  2.2815974 , -2.1463335 , 258.3971  ) ...
  (  0.      ,  0.        ,  0.        ,   0.      )
  (  0.      ,  0.        ,  0.        ,   0.      )
  (  0.      ,  0.        ,  0.        ,   0.      )]
 [(161.18646 , -1.3344725 ,  2.4141598 , 327.78928 )
  ( 80.61672 ,  0.92472154,  2.3028657 , 118.165184)
  (  0.      ,  0.        ,  0.        ,   0.      ) ...
  (  0.      ,  0.        ,  0.        ,   0.      )
  (  0.      ,  0.        ,  0.        ,   0.      )
  (  0.      ,  0.        ,  0.        ,   0.      )]
 ...
 [( 48.538033, -0.44061795

In [81]:
# Write the jet array to "example_output.h5" as the dataset "jet" inside the
# group "Objects". Opening the file in a `with` block guarantees it is closed
# even if creating the group or the dataset raises.
with h5py.File("example_output.h5", 'w') as h5_file:
    group = h5_file.create_group('Objects')
    group.create_dataset("jet", data=jet_data)