In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import uproot

In [2]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [3]:
# Pieces of the input location: base data directory, dataset folder, file name.
path_to_data = '../data/aods/'
folder = 'data18_13TeV.00364292.calibration_DataScouting_05_Jets.deriv.DAOD_TRIG6.r10657_p3592_p3754/'
fname = 'DAOD_TRIG6.16825104._000035.pool.root.1'
filePath = ''.join([path_to_data, folder, fname])

# Open the ROOT file and select the tree that the rest of the notebook reads from.
# (Alternative tree layout used previously: uproot.open(filePath)['outTree']['nominal'])
root_file = uproot.open(filePath)
tree = root_file['CollectionTree']

In [4]:
# List the branch names stored in the tree to see which variables can be read.
tree.keys()

[b'ByteStreamEventInfo',
 b'TrigConfKeys',
 b'EventInfoAux.',
 b'xTrigDecisionAux.',
 b'EventInfo',
 b'xTrigDecision',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAux.',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollection',
 b'EventInfoAuxDyn.streamTagRobs',
 b'EventInfoAuxDyn.streamTagDets',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.eta',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.phi',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.m',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentLinks',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentWeights',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ConstituentScale',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum_pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum

In [5]:
# Quick size check: read a single branch as an array and inspect its shape.
tree.array('HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.NegativeE').shape

(2260895,)

In [6]:
# Every selected branch lives in the same jet collection, so only the variable
# part of the name is spelled out below; the full branch name is
# '<collection>.<variable>'.
JET_COLLECTION = 'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn'

jet_variables = [
    # 4-momentum
    'pt', 'eta', 'phi', 'm',
    # Energy deposition in each calorimeter layer
    'EnergyPerSampling',
    # Area of jet, used for pile-up suppression (4-vector)
    'ActiveArea',
    'ActiveArea4vec_eta',
    'ActiveArea4vec_m',
    'ActiveArea4vec_phi',
    'ActiveArea4vec_pt',
    'JetGhostArea',
    # Variables related to quality of jet
    'AverageLArQF',
    'BchCorrCell',
    'NegativeE',
    'HECQuality',
    'LArQuality',
    # Shape and position, most energetic cluster
    'Width',
    'WidthPhi',
    'CentroidR',
    'DetectorEta',
    'LeadingClusterCenterLambda',
    'LeadingClusterPt',
    'LeadingClusterSecondLambda',
    'LeadingClusterSecondR',
    'N90Constituents',
    # Energy released in each calorimeter
    'EMFrac',
    'HECFrac',
    # Variables related to the time of arrival of a jet
    'Timing',
    'OotFracClusters10',
    'OotFracClusters5',
]

# Full branch names, in the same order as the variables listed above.
branchnames = [JET_COLLECTION + '.' + var_name for var_name in jet_variables]

In [7]:
# Number of branches selected for the dataset.
len(branchnames)

30

In [8]:
# Confirm that position 4 is the EnergyPerSampling branch, which is unpacked
# separately from the other variables further down.
branchnames[4]

'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.EnergyPerSampling'

In [9]:
# Select the branch by its variable name rather than by position, so that
# reordering or extending `branchnames` cannot silently pick a different branch.
energy_branch = next(
    name for name in branchnames if name.endswith('.EnergyPerSampling')
)
EnergyPerSampling = tree.array(energy_branch)

# Length of the branch array; used below for progress reporting.
n_events = len(EnergyPerSampling)
n_events

2260895

In [10]:
# Copy the first (leading) jet's per-layer energies of each event into a dense
# 2-D array with one row per event and one column per sampling layer.
#
# Events that contain no jet are skipped and their rows removed afterwards.
# This mirrors the `counts > 0` selection in get_leading(), so the rows here
# stay aligned with the other leading-jet variables instead of raising an
# IndexError on `samp[0]`.
n_layers = 28  # width of each jet's EnergyPerSampling vector, as hard-coded originally
arr = np.zeros((len(EnergyPerSampling), n_layers))
has_jet = np.ones(len(EnergyPerSampling), dtype=bool)
for ii, samp in enumerate(EnergyPerSampling):
    if len(samp) > 0:
        arr[ii, :] = np.array(samp[0])
    else:
        has_jet[ii] = False
    # Coarse progress report: one line every 300000 events.
    if ii % 300000 == 0:
        print(str((100 * ii) // n_events) + '%')
print('100%')
# Only pay for the filtered copy when there is actually something to drop.
if not has_jet.all():
    arr = arr[has_jet]
arr.shape

0%
13%
26%
39%
53%
66%
79%
92%
100%


(2260895, 28)

In [11]:
# Wrap the per-layer energies in a DataFrame with columns EnergyPerSampling0..N-1.
# N is taken from the array itself rather than repeating the literal 28, so the
# column names cannot get out of step with the data.
layer_columns = ['EnergyPerSampling%d' % kk for kk in range(arr.shape[1])]
e_samp_df = pd.DataFrame(data=arr, columns=layer_columns)
e_samp_df

Unnamed: 0,EnergyPerSampling0,EnergyPerSampling1,EnergyPerSampling2,EnergyPerSampling3,EnergyPerSampling4,EnergyPerSampling5,EnergyPerSampling6,EnergyPerSampling7,EnergyPerSampling8,EnergyPerSampling9,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,4463.817383,23248.812500,32758.755859,175.359970,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,700.465637,2787.192871,39659.492188,3555.119873,1273.725708,1467.259521,1702.037476,25.156708,0.000000,0.000000,...,57268.914062,39543.589844,1214.614746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4481.879883,17487.234375,31605.128906,819.519958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,1268.775146,25783.349609,123290.929688,11596.035156,52947.421875,38970.355469,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,26491.923828,142611.765625,23353.757812,59680.585938,26988.935547,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260890,298.998657,322.053650,174.562683,0.000000,7434.597656,36035.804688,94699.312500,6494.367676,7786.124023,17215.060547,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2260891,14950.682617,16882.427734,55113.578125,15053.662109,204.860580,518.438904,14.297218,0.000000,0.000000,0.000000,...,8076.918457,1731.564697,306.276306,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2260892,5381.773926,13260.756836,48842.007812,2307.945801,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-113.000000,224.189087,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2260893,12455.039062,20173.287109,84696.546875,2507.416504,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,3838.447754,3840.161621,913.936890,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
def get_leading(jaggedX):
    """Return the first element of every non-empty entry of a jagged array.

    Entries whose ``counts`` value is 0 are dropped, so the result can be
    shorter than the input.

    Parameters
    ----------
    jaggedX : jagged array
        Object exposing a ``counts`` attribute and supporting
        ``[mask, index]`` indexing.

    Returns
    -------
    The result of indexing ``jaggedX`` with the non-empty mask and position 0.
    """
    non_empty = jaggedX.counts > 0
    return jaggedX[non_empty, 0]

In [13]:
# Collect the leading-jet value of every branch except EnergyPerSampling
# (which is unpacked into per-layer columns elsewhere), keyed by the part of
# the branch name after the first '.'.
ENERGY_BRANCH = 'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.EnergyPerSampling'
n_branches = len(branchnames)

df_dict = {}
for idx, name in enumerate(branchnames):
    column = name.split('.')[1]
    if name != ENERGY_BRANCH:
        df_dict[column] = get_leading(tree.array(name))
    # Progress report on every 7th branch, including a skipped one.
    if idx % 7 == 0:
        print(str((idx * 100) // n_branches) + '%')
print('100%')

partial_df = pd.DataFrame(data=df_dict)

0%
23%
46%
70%
93%
100%


In [14]:
# Display the frame built from the leading-jet variables (rendered truncated).
partial_df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,JetGhostArea,...,LeadingClusterCenterLambda,LeadingClusterPt,LeadingClusterSecondLambda,LeadingClusterSecondR,N90Constituents,EMFrac,HECFrac,Timing,OotFracClusters10,OotFracClusters5
0,90306.070312,0.308789,-1.921283,6840.930176,0.498666,0.318246,0.140055,-1.924426,0.489299,0.01,...,178.461990,60102.636719,15984.612305,13437.013672,3.0,0.888970,0.000000,1.078428,0.004533,0.117972
1,123364.226562,-1.344482,1.154525,10920.023438,0.488692,-1.343978,0.136743,1.147211,0.479669,0.01,...,5170.814453,44042.378906,165553.234375,76716.382812,3.0,0.324276,0.000000,0.707971,0.019895,0.026191
2,78104.867188,0.472603,-2.727408,7270.276855,0.478719,0.468219,0.133986,-2.730499,0.470848,0.01,...,269.738586,28077.353516,108412.906250,10236.585938,4.0,0.905376,0.000000,0.363695,0.006068,0.006068
3,110898.640625,1.884189,-2.434126,13122.060547,0.528585,1.914493,0.151015,-2.437690,0.516863,0.01,...,627.649658,16306.850586,139001.937500,2557.068359,11.0,0.587340,0.412595,0.525843,0.006622,0.023643
4,101825.992188,-2.036684,-3.113684,8041.627441,0.478719,-2.056497,0.132225,-3.115452,0.469632,0.01,...,869.004639,36063.585938,166999.343750,17401.626953,5.0,0.663741,0.335038,-0.329519,0.011073,0.011073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260890,91249.914062,-1.745303,-2.623895,6592.313965,0.468746,-1.774570,0.125520,-2.642066,0.459769,0.01,...,339.393738,33378.828125,166918.890625,3192.991211,3.0,0.771015,0.228985,-1.044461,0.004217,0.004217
2260891,101260.921875,-1.253928,-2.815772,9903.192383,0.528585,-1.253075,0.151641,-2.804190,0.518223,0.01,...,263.608734,30561.958984,29329.003906,2992.535400,5.0,0.865037,0.000000,0.036002,0.006210,0.006210
2260892,81322.601562,0.825275,-2.666970,9852.224609,0.478719,0.857738,0.134604,-2.664757,0.469947,0.01,...,243.027466,45031.343750,14059.222656,5731.004883,3.0,0.974370,0.000000,-0.120335,0.016170,0.023511
2260893,112891.523438,1.165162,1.432389,8749.023438,0.498666,1.187786,0.141278,1.419347,0.489429,0.01,...,257.263977,64777.597656,25375.058594,3136.577393,3.0,0.850187,0.000000,-0.226453,0.030247,0.030247


In [15]:
# DataFrame.join aligns rows on the index. Both frames are built without an
# explicit index, so they are matched purely by position; a differing row count
# would silently introduce NaNs or pair values from different events.
# Fail loudly instead.
assert len(partial_df) == len(e_samp_df), (
    'row count mismatch: partial_df has %d rows, e_samp_df has %d'
    % (len(partial_df), len(e_samp_df))
)
full_df = partial_df.join(e_samp_df)

In [16]:
# Display the combined frame: leading-jet variables plus the per-layer energy columns.
full_df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,JetGhostArea,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,90306.070312,0.308789,-1.921283,6840.930176,0.498666,0.318246,0.140055,-1.924426,0.489299,0.01,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,123364.226562,-1.344482,1.154525,10920.023438,0.488692,-1.343978,0.136743,1.147211,0.479669,0.01,...,57268.914062,39543.589844,1214.614746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,78104.867188,0.472603,-2.727408,7270.276855,0.478719,0.468219,0.133986,-2.730499,0.470848,0.01,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110898.640625,1.884189,-2.434126,13122.060547,0.528585,1.914493,0.151015,-2.437690,0.516863,0.01,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,101825.992188,-2.036684,-3.113684,8041.627441,0.478719,-2.056497,0.132225,-3.115452,0.469632,0.01,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260890,91249.914062,-1.745303,-2.623895,6592.313965,0.468746,-1.774570,0.125520,-2.642066,0.459769,0.01,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2260891,101260.921875,-1.253928,-2.815772,9903.192383,0.528585,-1.253075,0.151641,-2.804190,0.518223,0.01,...,8076.918457,1731.564697,306.276306,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2260892,81322.601562,0.825275,-2.666970,9852.224609,0.478719,0.857738,0.134604,-2.664757,0.469947,0.01,...,0.000000,-113.000000,224.189087,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2260893,112891.523438,1.165162,1.432389,8749.023438,0.498666,1.187786,0.141278,1.419347,0.489429,0.01,...,3838.447754,3840.161621,913.936890,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Hold out 10% of the rows of partial_df as a test set; random_state is fixed
# so the split can be reproduced.
partial_train, partial_test = train_test_split(partial_df, test_size=0.1, random_state=41)

In [18]:
# Same 90/10 split settings (test_size and random_state) applied to full_df.
# NOTE(review): presumably this selects the same rows as the partial split,
# since both frames have the same length -- confirm if that is relied upon.
full_train, full_test = train_test_split(full_df, test_size=0.1, random_state=41)

In [19]:
# Check the (rows, columns) of the partial train and test splits.
print(partial_train.shape, partial_test.shape)

(2034805, 29) (226090, 29)


In [20]:
# Check the (rows, columns) of the full train and test splits.
print(full_train.shape, full_test.shape)

(2034805, 57) (226090, 57)


In [22]:
# Save train and test sets
#partial_train.to_pickle('processed_data/aod/partial_train.pkl')
#partial_test.to_pickle('processed_data/aod/partial_test.pkl')
#full_train.to_pickle('processed_data/aod/full_train.pkl')
#full_test.to_pickle('processed_data/aod/full_test.pkl')