In [1]:
#%pip install numpy uproot awkward

In [2]:
import awkward as ak
import numpy as np
import uproot as uproot

In [3]:
# Open the ROOT file and list the keys of the objects stored in it.
# `file` is kept as a notebook-level handle because the next cell reads the tree from it.
file = uproot.open('merged.root')
file.keys()

['lCToTSTsAssoc;1', 'lCToTSTsAssoc/lCTo3simTS_tree;1']

In [4]:
# Fetch the tree that holds the per-event branches and print a table of its
# branches (name, C++ type, uproot interpretation)
events = file['lCToTSTsAssoc/lCTo3simTS_tree']
events.show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
run                  | int32_t                  | AsDtype('>i4')
event                | int32_t                  | AsDtype('>i4')
lumi                 | int32_t                  | AsDtype('>i4')
id                   | std::vector<int32_t>     | AsJagged(AsDtype('>i4'), he...
pos_x                | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
pos_y                | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
pos_z                | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
energy               | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
time                 | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
time_error           | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
eta                  | std::vector<float>       | AsJagged(AsDtype('>f4'), h

## Input Features

 - `event`: the index of the event in the file
 
Each event contains a varying number of points. For each point:
 - `pos_x`, `pos_y`, `pos_z`: (x,y,z) coordinates of the point
 - `time`: time of the point (not all points have timing information -- a value of -99 indicates no timing information)
 - `eta`, `phi`: [pseudorapidity](https://en.wikipedia.org/wiki/Pseudorapidity), azimuth angle of the point
 - `layer`: layer number of the point
 - `energy`: energy of the point

**[Note]** `simTst_idx` and `enFraction` are the truth labels.

These features are jagged [Awkward Arrays](https://github.com/scikit-hep/awkward). A jagged array can be thought of as an array of arrays, where the first level of indices corresponds to the event index, and the second level corresponds to the points (LCs) within each event.

In [5]:
# To get the total number of events in the file:
# read the per-event `event` branch and count its entries
totEvents = len(events['event'].array())
totEvents

3800

In [6]:
# Read the per-point branches for all events in the file.
# Each name below ends up bound to one jagged array (first index: event, second index: point).
# NOTE: `id` shadows the Python builtin of the same name for the rest of the notebook.
id, pos_x, pos_y, pos_z, energy, eta, phi = (
    events[branch].array()
    for branch in ('id', 'pos_x', 'pos_y', 'pos_z', 'energy', 'eta', 'phi')
)


In [7]:
# To access the energy of all points from the first event: this returns a 1-D
# awkward Array (see the recorded output), not a NumPy array
energy[0]

<Array [0.0447, 0.0599, ... 0.0501, 0.363] type='550 * float32'>

In [8]:
# To get the number of points in each event (one count per event)
nPoints_perEv = ak.num(energy)
nPoints_perEv

<Array [550, 562, 624, 440, ... 683, 505, 613] type='3800 * int64'>

In [9]:
# To pad/truncate to a regular 2D array: with clip=True every event is cut to exactly 2 entries;
# events with fewer points are padded with None, hence the `?float32` option type in the output
ak.pad_none(energy, 2, clip=True)

<Array [[0.0447, 0.0599], ... [0.0546, 0.0148]] type='3800 * 2 * ?float32'>

In [10]:
# To flatten the jagged array to a 1d array (the per-event grouping is dropped)
ak.flatten(energy)

<Array [0.0447, 0.0599, ... 0.105, 0.0917] type='1864424 * float32'>

In [11]:
# To get the total number of points, summed over all events
len(ak.flatten(energy))

1864424

### Truth definition

The target is to assign each point to up to 3 particle showers - aka Tracksters - sorted by decreasing fraction of the point's energy that they contain. The truth information is as follows:

 - `simTst_idx`: indices of the (up to 3) showers associated with the point at hand; if the point is associated with fewer than 3 showers, the remaining array elements are -1;
 - `enFraction`: the fraction of the point's energy that should be assigned to each of the (up to 3) showers; as the output below shows, unused slots are also filled with -1.

In [12]:
# Truth labels: for each point, the indices of the (up to 3) associated showers.
# Display them for the first event.
simTst_idx = events['simTst_idx'].array()
simTst_idx[0]

<Array [[1, -1, -1], [0, ... -1], [0, -1, -1]] type='550 * var * int64'>

In [13]:
# Truth labels: for each point, the fraction of its energy assigned to each of the
# (up to 3) associated showers. Display them for the first event.
enFraction = events['enFraction'].array()
enFraction[0]

<Array [[1, -1, -1], [1, ... -1], [1, -1, -1]] type='550 * var * float64'>

## Pattern Recognition

An example of pattern recognition is provided. 
To a first approximation, points belonging to the same particle appear to have very similar eta and phi coordinates.
Maybe we can just create a big 2D tiling structure, and cluster together all those points that lie in the same tile?

In [15]:
# Pack the per-point features of every event into one 2-D float array per event
# (one row per point). Column order: id, pos_x, pos_y, pos_z, eta, phi, energy.
# Whole columns are converted and stacked at once instead of zipping the awkward
# arrays point by point in Python, which is far slower for the same result.
# An event without points yields an empty (0, 7) array.
points = [
    np.column_stack([
        ak.to_numpy(column[evt])
        for column in (id, pos_x, pos_y, pos_z, eta, phi, energy)
    ])
    for evt in range(totEvents)
]
    


In [16]:
from math import pi

# Eta window covered by the tiling:
# points are never smaller than 1.5 and never exceed 3.2 in eta
minEta = 1.5
maxEta = 3.2
etaRange = maxEta - minEta

# Number of tiles along eta and along phi
nEtaBins = 48
nPhiBins = 256
# The 2-D tiling is addressed through a single linear index, so this is the total tile count
nBins = nPhiBins * nEtaBins

# Tile widths; phi goes from -pi to pi, so the phi range is 2*pi
etaBinSize = etaRange / nEtaBins
phiBinSize = 2 * pi / nPhiBins

In [18]:
def clamp(num, min_value, max_value):
    """Limit `num` to the interval [min_value, max_value] and truncate it to an int.

    The upper bound is applied first, then the lower bound; the bounded value is
    finally converted with int().
    """
    bounded = num
    if max_value < bounded:
        bounded = max_value
    if min_value > bounded:
        bounded = min_value
    return int(bounded)

def etaBin(eta):
    """Return the eta index of the tile that contains `eta`.

    The distance from `minEta` is expressed in units of `etaBinSize` and clamped to
    [0, nEtaBins - 1], so values outside the tiled eta window land in the first or
    last eta bin.
    """
    return clamp((eta - minEta) / etaBinSize, 0, nEtaBins - 1)
    

def phiBin(phi):
    """Return the phi index of the tile that contains `phi`.

    `phi` is shifted by pi so that it starts at zero, expressed in units of
    `phiBinSize`, and clamped to [0, nPhiBins - 1].
    """
    return clamp((phi + pi) / phiBinSize, 0, nPhiBins - 1)


def globalBin(eta, phi):
    """Return the linearized tile index for a point at (`eta`, `phi`).

    Tiles are numbered row by row: each eta bin owns `nPhiBins` consecutive
    indices, and the phi bin selects the position inside that row.
    """
    phiIndex = phiBin(phi)
    etaIndex = etaBin(eta)
    return etaIndex * nPhiBins + phiIndex

# Minimum number of points a tile must contain to be promoted to a trackster
minPointsPerTrackster = 5

# Per-event results, appended in event order:
#   output_idx[evt][point]  -> [trackster id, -1, -1]  (all -1 when the point is unclustered)
#   output_enFr[evt][point] -> [1.0, 0.0, 0.0]         (all 0.0 when the point is unclustered)
output_idx = []
output_enFr = []

for evt in range(totEvents):
    # the histogram contains a list of point ids per tile
    histogram = [[] for _ in range(nBins)]
    for p in points[evt]:
        # columns of p: id, pos_x, pos_y, pos_z, eta, phi, energy
        # given eta and phi of a point, append its id to the tile it falls into
        histogram[globalBin(p[4], p[5])].append(int(p[0]))

    # now that we have filled the histogram, every tile that contains enough points
    # becomes a trackster made of those ids.
    # Slot 0 is an empty placeholder, so the ids of real tracksters start at 1.
    tracksters = [[]]
    tracksters.extend(
        list(tile) for tile in histogram if len(tile) >= minPointsPerTrackster
    )

    # create output lists
    nPoints = len(points[evt])
    # for every point we want as a result the three top tracksters it contributed to
    output_idx_tmp = [[-1, -1, -1] for _ in range(nPoints)]
    # and the fraction of the energy that was contributed to it
    output_enFr_tmp = [[0.0, 0.0, 0.0] for _ in range(nPoints)]

    for tracksterId, trackster in enumerate(tracksters):
        # at most one trackster is found per point, which gets fraction 1.0
        for lcId in trackster:
            # NOTE(review): the `id` value read from the file is used here as the
            # position of the point inside the event -- confirm that they coincide.
            output_idx_tmp[lcId][0] = tracksterId
            output_enFr_tmp[lcId][0] = 1.0

    # Adding the results of this event to the global output for validation
    output_idx.append(output_idx_tmp)
    output_enFr.append(output_enFr_tmp)