# Format PFN inputs

In [1]:
import uproot as ur
import awkward as ak
import numpy as np

## Read dataset trees for pi0 and pipm

In [2]:
data_path = '/fast_scratch/atlas_images/v01-45/'
!ls -a /fast_scratch/atlas_images/v01-45/
#X_4.npy is a 180k long array of clusters; first 90k are Pi+ and last 90k are pi0

.      XY_tracks_small.npz  pi0		     pi0_small.root   pipm_medium.root
..     X_4.npy		    pi0_medium.npz   pipm	      pipm_small.root
X.npy  delta		    pi0_medium.root  pipm_medium.npz  rho


In [3]:
path_pipm = data_path + 'pipm_medium.root'
path_pi0  = data_path + 'pi0_medium.root'

In [4]:
tree_pipm = ur.open(path_pipm)['EventTree']
tree_pi0  = ur.open(path_pi0)['EventTree']

In [5]:
geoDict_tree_pipm = ur.open(path_pipm)['CellGeo']
addgeoDict_tree_pi0  = ur.open(path_pi0)["CellGeo"]

## Import useful functions

In [6]:
# my worspace and directories
path_prefix = '/home/dportill/LCStudies/'
plotpath = path_prefix+'classifier/Plots_EFN/'
modelpath = path_prefix+'classifier/Models_EFN/'

In [7]:
# import graph utilities
import sys
sys.path.append(path_prefix)
from  util import graph_util as gu
from  util import plot_util as pu

## Process the inputs 

* loadArrayBranchFlat(branchName, tree, padLength): Gives a flat list of clusters padded with equal number of cells

ak.flatten: flatten the event index, to generate a list of clusters

pad_none: pad the cell axis to the specified length of padLength

In [8]:
#pipm
cell_id_pipm = gu.loadArrayBranchFlat('cluster_cell_ID', tree_pipm, 2000)
cell_e_pipm = gu.loadArrayBranchFlat('cluster_cell_E', tree_pipm, 2000)

In [9]:
#pi0
cell_id_pi0 = gu.loadArrayBranchFlat('cluster_cell_ID', tree_pi0, 2000)
cell_e_pi0 = gu.loadArrayBranchFlat('cluster_cell_E', tree_pi0, 2000)

In [10]:
print("Number of pipm clusters:", len(cell_e_pipm)) 
print("Number of pi0 clusters:", len(cell_e_pi0)) 
print("Number of padded cells has to be equal to padLength:", len(cell_e_pipm[0])) 

Number of pipm clusters: 672281
Number of pi0 clusters: 421481
Number of padded cells has to be equal to padLength: 2000


* loadGraphDictionary(tree): converting the geoTree into a dictionary for each branch that has each one cell_geo_ID as key 

? I dont understand the "mask" part :S

In [11]:
#load dictionaries 
geoDict_pipm = gu.loadGraphDictionary(geoDict_tree_pipm)
geoDict_pi0  = gu.loadGraphDictionary(geoDict_tree_pi0)

In [12]:
geoDict_pipm.keys()

dict_keys(['cell_geo_sampling', 'cell_geo_eta', 'cell_geo_phi', 'cell_geo_rPerp', 'cell_geo_deta', 'cell_geo_dphi', 'cell_geo_volume', 'cell_geo_sigma'])

In [13]:
print("Sampling ID of the cell with ID 740294656:",geoDict_pipm['cell_geo_sampling'][740294656])

Sampling ID of the cell with ID 740294656: 6


* convertIDToGeo(cellID, geoString, globalDict)

Connects the cell ID from the event tree (cellID) that has all the cells in order with the cell ID on the geo tree (cluster_cell_ID) for a cell variable using the global dictionary


[Translate every element in numpy array according to key](https://stackoverflow.com/questions/16992713/translate-every-element-in-numpy-array-according-to-key)

np.vectorize(my_dict.get)(array)

In [14]:
cell_phi_pipm = gu.convertIDToGeo(cell_id_pipm, 'cell_geo_phi', geoDict_pipm)
cell_eta_pipm = gu.convertIDToGeo(cell_id_pipm, 'cell_geo_eta', geoDict_pipm)

cell_phi_pi0 = gu.convertIDToGeo(cell_id_pi0, 'cell_geo_phi', geoDict_pi0)
cell_eta_pi0 = gu.convertIDToGeo(cell_id_pi0, 'cell_geo_eta', geoDict_pi0)

* loadVectorBranchFlat(branchName, tree): 

 converts to numpy a flatten array for the desired branch (no event index). But in this case is not padded because it is at cluster level

In [15]:
clus_eta_pipm = gu.loadVectorBranchFlat('cluster_Eta', tree_pipm)
clus_E_pipm = gu.loadVectorBranchFlat('cluster_E', tree_pipm)
clus_phi_pipm = gu.loadVectorBranchFlat('cluster_Phi', tree_pipm)

clus_eta_pi0 = gu.loadVectorBranchFlat('cluster_Eta', tree_pi0)
clus_E_pi0 = gu.loadVectorBranchFlat('cluster_E', tree_pi0)
clus_phi_pi0  = gu.loadVectorBranchFlat('cluster_Phi', tree_pi0)

* Filter Events  $|\eta|<0.7$

In [16]:
#Central Clusters
eta_mask_pipm = abs(clus_eta_pipm) < 0.7
eta_mask_pi0  = abs(clus_eta_pi0) < 0.7

#High energy clusters
energy_mask_pipm = clus_E_pipm > 0.5
energy_mask_pi0 = clus_E_pi0 > 0.5

In [17]:
selection_pipm = eta_mask_pipm & energy_mask_pipm
selection_pi0 = eta_mask_pi0 & energy_mask_pi0

* Normalise eta, phi  ???

In [18]:
cell_eta_norm_pipm = np.nan_to_num(cell_eta_pipm - clus_eta_pipm[:, None])
cell_eta_norm_pi0 = np.nan_to_num(cell_eta_pi0 - clus_eta_pi0[:, None])
cell_phi_norm_pipm = np.nan_to_num(cell_phi_pipm - clus_phi_pipm[:, None])
cell_phi_norm_pi0 = np.nan_to_num(cell_phi_pi0 - clus_phi_pi0[:, None])

* Log(Energy)

In [19]:
cell_e_norm_pipm = np.nan_to_num(np.log(cell_e_pipm), posinf = 0, neginf=0)
cell_e_norm_pi0 = np.nan_to_num(np.log(cell_e_pi0), posinf = 0, neginf=0)

  cell_e_norm_pipm = np.nan_to_num(np.log(cell_e_pipm), posinf = 0, neginf=0)
  cell_e_norm_pi0 = np.nan_to_num(np.log(cell_e_pi0), posinf = 0, neginf=0)


* Normalise sampling ID ?

In [20]:
cell_samp_pipm = gu.convertIDToGeo(cell_id_pipm, 'cell_geo_sampling', geoDict_pipm)
cell_samp_pi0 = gu.convertIDToGeo(cell_id_pi0, 'cell_geo_sampling', geoDict_pi0)

In [21]:
#normalize to a lower value fairly arbitrarily
cell_samp_norm_pipm = cell_samp_pipm * 0.1
cell_samp_norm_pi0 = cell_samp_pi0 * 0.1

* Add other cell-level features: cell_geo_volume , cell_geo_sigma

In [22]:
cell_vol_pipm = gu.convertIDToGeo(cell_id_pipm, 'cell_geo_volume', geoDict_pipm)
cell_vol_pi0  = gu.convertIDToGeo(cell_id_pi0,  'cell_geo_volume', geoDict_pi0)

In [23]:
cell_sig_pipm = gu.convertIDToGeo(cell_id_pipm, 'cell_geo_sigma', geoDict_pipm)
cell_sig_pi0  = gu.convertIDToGeo(cell_id_pi0,  'cell_geo_sigma', geoDict_pi0)

In [24]:
cell_vol_norm_pipm = np.nan_to_num(cell_vol_pipm)
cell_vol_norm_pi0  = np.nan_to_num(cell_vol_pi0)

cell_sig_norm_pipm = np.nan_to_num(cell_sig_pipm)
cell_sig_norm_pi0  = np.nan_to_num(cell_sig_pi0)

## Energy Flow inputs 

Inputs consists of two components:
- `X` : has X_pipm first and then X_pi0. It has shape `(num_clusters,max_num_cells,4)`  The cells are given as `(log(E),eta,phi,samplerID)` values
- `Y` : a numpy array of quark/gluon jet labels (pipm=`1` and pi0=`0`). 

In [25]:
#X_pipm = np.stack((cell_e_norm_pipm[selection_pipm], cell_eta_norm_pipm[selection_pipm], cell_phi_norm_pipm[selection_pipm], cell_samp_norm_pipm[selection_pipm]), axis = 2)
#X_pi0  = np.stack((cell_e_norm_pi0[selection_pi0], cell_eta_norm_pi0[selection_pi0], cell_phi_norm_pi0[selection_pi0], cell_samp_norm_pi0[selection_pi0]), axis = 2)

In [26]:
#X = np.concatenate((X_pipm[:90000], X_pi0[:90000]))

In [27]:
#np.save(path_prefix+'X_baseline.npy', X)

In [28]:
X_pipm = np.stack((cell_e_norm_pipm[selection_pipm], cell_eta_norm_pipm[selection_pipm], cell_phi_norm_pipm[selection_pipm], cell_samp_norm_pipm[selection_pipm], cell_sig_norm_pipm[selection_pipm], cell_vol_norm_pipm[selection_pipm]), axis = 2)
X_pi0  = np.stack((cell_e_norm_pi0[selection_pi0], cell_eta_norm_pi0[selection_pi0], cell_phi_norm_pi0[selection_pi0], cell_samp_norm_pi0[selection_pi0], cell_sig_norm_pi0[selection_pi0], cell_vol_norm_pi0[selection_pi0]), axis = 2)

In [29]:
X = np.concatenate((X_pipm[:90000], X_pi0[:90000]))

In [32]:
np.save(path_prefix+'X_added.npy', X)