In [14]:
# Uncomment to install the dependencies into the kernel's environment
# (awkward is imported by this notebook too, so it is listed explicitly):
# %pip install numpy uproot awkward

In [15]:
import awkward as ak
import numpy as np
import uproot as uproot

In [16]:
# Open the ROOT file (path is relative to the current working directory)
# and list the keys of the objects stored in it.
file = uproot.open('merged.root')
file.keys()

['lCToTSTsAssoc;1', 'lCToTSTsAssoc/lCTo3simTS_tree;1']

In [17]:
# Select the tree by its key and print a table of its branches
# (name, C++ type name, and the interpretation uproot uses to read each one).
events = file['lCToTSTsAssoc/lCTo3simTS_tree']
events.show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
run                  | int32_t                  | AsDtype('>i4')
event                | int32_t                  | AsDtype('>i4')
lumi                 | int32_t                  | AsDtype('>i4')
id                   | std::vector<int32_t>     | AsJagged(AsDtype('>i4'), he...
pos_x                | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
pos_y                | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
pos_z                | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
energy               | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
time                 | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
time_error           | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
eta                  | std::vector<float>       | AsJagged(AsDtype('>f4'), h

## Input Features

 - `event`: the index of the event in the file
 
Each event contains a varying number of points. For each point:
 - `pos_x`, `pos_y`, `pos_z`: (x,y,z) coordinates of the point
 - `time`: time of the point (not all points have timing information -- a value of -99 indicates no timing information)
 - `eta`, `phi`: [pseudorapidity](https://en.wikipedia.org/wiki/Pseudorapidity), azimuth angle of the point
 - `layer`: layer number of the point
 - `energy`: energy of the point

**[Note]** `simTst_idx` and `enFraction` are the truth labels.

The per-point features are jagged arrays, read as [Awkward Arrays](https://awkward-array.org). Each can be thought of as an array of arrays, where the first level of indices corresponds to the events, and the second level corresponds to the points (LCs) within each event.

In [18]:
# To get the total number of events in the file: read the entry count from the
# tree's metadata instead of loading the whole `event` branch just to take its length.
events.num_entries

3800

In [19]:
# To get the energies of all points from all events in the file.
# The result is a jagged array: one variable-length list of energies per event.
energy = events['energy'].array()
energy

<Array [[0.0447, 0.0599, ... 0.105, 0.0917]] type='3800 * var * float32'>

In [20]:
# To access the energies of all points from the first event. Indexing a single
# event returns a flat (1D) awkward Array, not a NumPy array; convert it with
# ak.to_numpy(energy[0]) if a NumPy array is needed.
energy[0]

<Array [0.0447, 0.0599, ... 0.0501, 0.363] type='550 * float32'>

In [21]:
# Count the points in each event, i.e. the length of every inner list
# (axis=1 is the per-event level, which is also ak.num's default).
ak.num(energy, axis=1)

<Array [550, 562, 624, 440, ... 683, 505, 613] type='3800 * int64'>

In [22]:
# Force every event to exactly 2 entries, giving a regular (n_events x 2) shape:
# longer events are truncated (clip=True) and shorter ones are padded with None.
ak.pad_none(energy, target=2, axis=1, clip=True)

<Array [[0.0447, 0.0599], ... [0.0546, 0.0148]] type='3800 * 2 * ?float32'>

In [23]:
# Remove the per-event nesting (axis=1, the default) to get a single 1D array
# holding the energies of all points from all events.
ak.flatten(energy, axis=1)

<Array [0.0447, 0.0599, ... 0.105, 0.0917] type='1864424 * float32'>

In [24]:
# Total number of points summed over all events:
# the length of the array obtained by flattening away the per-event level.
len(ak.flatten(energy, axis=1))

1864424

### Truth definition:

The target is to assign each point to up to 3 particle showers (a.k.a. Tracksters), sorted in decreasing order of the fraction of the point's energy they contain. The truth information is as follows:

 - `simTst_idx`: index of the (up to 3) showers associated with the point at hand; if the point is associated with fewer than 3 showers, the remaining array elements are -1;
 - `enFraction`: the fraction of the point's energy that should be assigned to each of the (up to 3) showers.

In [25]:
# Load the truth shower indices and show those of the first event:
# one list per point, holding the indices of its associated showers (-1 = unused slot).
simTst_idx = events['simTst_idx'].array()
simTst_idx[0]

<Array [[1, -1, -1], [0, ... -1], [0, -1, -1]] type='550 * var * int64'>

In [26]:
# Load the truth energy fractions and show those of the first event:
# one list per point, giving the fraction of its energy assigned to each shower.
# As the output shows, unused slots hold -1 here as well.
enFraction = events['enFraction'].array()
enFraction[0]

<Array [[1, -1, -1], [1, ... -1], [1, -1, -1]] type='550 * var * float64'>