In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import uproot

In [2]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [3]:
# Pieces of the input location: base directory, dataset folder, ROOT file name
path_to_data = '../data/aods/'
folder = 'data18_13TeV.00364292.calibration_DataScouting_05_Jets.deriv.DAOD_TRIG6.r10657_p3592_p3754/'
fname = 'DAOD_TRIG6.16825104._000035.pool.root.1'
filePath = ''.join((path_to_data, folder, fname))

# Open the ROOT file and pick the tree that the rest of the notebook reads from
#ttree = uproot.open(filePath)['outTree']['nominal']
root_file = uproot.open(filePath)
tree = root_file['CollectionTree']

In [4]:
# List the names of the branches stored in the tree
tree.keys()

[b'ByteStreamEventInfo',
 b'TrigConfKeys',
 b'EventInfoAux.',
 b'xTrigDecisionAux.',
 b'EventInfo',
 b'xTrigDecision',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAux.',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollection',
 b'EventInfoAuxDyn.streamTagRobs',
 b'EventInfoAuxDyn.streamTagDets',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.eta',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.phi',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.m',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentLinks',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentWeights',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ConstituentScale',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum_pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum

In [5]:
# Length of the jet pt branch (it has as many entries as `n_events` computed further down)
tree.array('HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt').shape

(2260895,)

In [6]:
# Total number of jets in the file: `.counts` holds the number of jets in each
# entry of the jagged pt branch. Reduce it with the vectorised NumPy `.sum()`;
# the built-in sum() walks the ~2.26M counts one Python object at a time.
n_jets = tree.array('HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt').counts.sum()

In [7]:
# Show the total jet count (used below as the number of rows to pre-allocate)
n_jets

11951922

In [8]:
# Full names of the jet branches to read. Every branch lives in the same
# container, so only the variable suffix differs; the order is kept as-is
# because a later cell indexes this list by position.
branchnames = [
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.' + suffix
    for suffix in (
        # 4-momentum
        'pt',
        'eta',
        'phi',
        'm',
        # Energy deposition in each calorimeter layer
        'EnergyPerSampling',
        # Area of jet, used for pile-up suppression (4-vector)
        'ActiveArea',
        'ActiveArea4vec_eta',
        'ActiveArea4vec_m',
        'ActiveArea4vec_phi',
        'ActiveArea4vec_pt',
        'JetGhostArea',
        # Variables related to quality of jet
        'AverageLArQF',
        'BchCorrCell',
        'NegativeE',
        'HECQuality',
        'LArQuality',
        # Shape and position, most energetic cluster
        'Width',
        'WidthPhi',
        'CentroidR',
        'DetectorEta',
        'LeadingClusterCenterLambda',
        'LeadingClusterPt',
        'LeadingClusterSecondLambda',
        'LeadingClusterSecondR',
        'N90Constituents',
        # Energy released in each calorimeter
        'EMFrac',
        'HECFrac',
        # Variables related to the time of arrival of a jet
        'Timing',
        'OotFracClusters10',
        'OotFracClusters5',
    )
]

In [9]:
# Number of branches selected above
len(branchnames)

30

In [10]:
# Read the per-layer energy branch by its explicit name rather than by its
# position in `branchnames`, so that reordering or extending that list cannot
# silently load a different branch here.
EnergyPerSampling = tree.array('HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.EnergyPerSampling')
# The branch has one entry per event
n_events = len(EnergyPerSampling)

In [11]:
# Show the number of entries in the EnergyPerSampling branch
n_events

2260895

In [12]:
# Flatten EnergyPerSampling into one row per jet and one column per sampling
# layer. Rows are pre-filled with -1 and overwritten event by event, in file order.
arr = -np.ones(shape=(n_jets, 28))
n_layers = arr.shape[1]
n_entries = len(EnergyPerSampling)  # loop-invariant, used for the progress printout
curr_i = 0  # index of the next row of `arr` to fill
for ii, layers in enumerate(EnergyPerSampling):
    # reshape(-1, n_layers) keeps an entry without jets as an empty (0, n_layers)
    # block; a bare empty array has shape (0,) and cannot be broadcast into the
    # (0, n_layers) slice below.
    layers = np.asarray(layers).reshape(-1, n_layers)
    n_jets_curr = layers.shape[0]
    arr[curr_i:curr_i + n_jets_curr, :] = layers
    curr_i += n_jets_curr
    if ii % 300000 == 0:
        print(str((ii * 100) // n_entries) + '%')
print('100%')
# Every pre-allocated row must have been overwritten; otherwise -1 placeholder
# rows would silently end up in the DataFrame built from `arr`.
assert curr_i == n_jets, 'filled %d rows, expected %d' % (curr_i, n_jets)
arr.shape

0%
13%
26%
39%
53%
66%
79%
92%
100%


(11951922, 28)

In [13]:
# Free up memory: the values have already been copied into `arr`
del EnergyPerSampling

In [14]:
# Wrap the per-jet layer energies in a DataFrame with columns
# EnergyPerSampling0 ... EnergyPerSampling27
e_samp_df = pd.DataFrame(
    data=arr,
    columns=['EnergyPerSampling' + str(layer) for layer in range(28)],
)

In [15]:
# Drop the notebook-level name for the raw array now that it is wrapped in e_samp_df
del arr

In [16]:
# Display the per-layer energy table (one row per jet)
e_samp_df

Unnamed: 0,EnergyPerSampling0,EnergyPerSampling1,EnergyPerSampling2,EnergyPerSampling3,EnergyPerSampling4,EnergyPerSampling5,EnergyPerSampling6,EnergyPerSampling7,EnergyPerSampling8,EnergyPerSampling9,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,4463.817383,23248.812500,32758.755859,175.359970,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,5393.272461,396.338531,124.353470,0.000000,1352.804565,12997.504883,29439.746094,1339.140869,574.071777,475.165771,...,3180.364258,-56.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,646.858521,913.609863,6966.411621,491.769287,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,148891.0,-1982.236328,0.0,0.0,0.0,0.0,0.0
4,700.465637,2787.192871,39659.492188,3555.119873,1273.725708,1467.259521,1702.037476,25.156708,0.000000,0.000000,...,57268.914062,39543.589844,1214.614746,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11951917,4110.071289,16758.707031,35582.375000,1644.797485,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951918,7386.522949,1641.392212,5150.359863,-16.984421,8081.199707,19011.074219,42899.199219,4962.972656,12542.588867,1560.480591,...,10682.195312,6861.119141,53.500000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951919,6112.090820,14053.548828,43245.312500,3891.802734,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951920,3078.421631,14908.059570,22810.492188,1308.642822,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [17]:
# Build one flat per-jet column for every selected branch except
# EnergyPerSampling, which has its own DataFrame (e_samp_df).
df_dict = {}
for pp, branchname in enumerate(branchnames):
    if branchname != 'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.EnergyPerSampling':
        # Column name is the part after the container prefix, e.g. 'pt'
        variable = branchname.split('.')[1]
        # flatten() concatenates the per-event sub-arrays in file order in one
        # vectorised step, replacing a Python-level append of every single value.
        df_dict[variable] = tree.array(branchname).flatten()
    if pp % 3 == 0:
        print((pp * 100) // len(branchnames), '%')
print('100%')
print('Creating DataFrame...')
partial_df = pd.DataFrame(data=df_dict)
print('done.')

0 %
10 %
20 %
30 %
40 %
50 %
60 %
70 %
80 %
90 %
100%
Creating DataFrame...
done.


In [18]:
# partial_df has been built from it; drop the dict of columns
del df_dict

In [19]:
# Display the table of per-jet variables (everything except EnergyPerSampling)
partial_df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,JetGhostArea,...,LeadingClusterCenterLambda,LeadingClusterPt,LeadingClusterSecondLambda,LeadingClusterSecondR,N90Constituents,EMFrac,HECFrac,Timing,OotFracClusters10,OotFracClusters5
0,90306.070312,0.308789,-1.921283,6840.930176,0.498666,0.318246,0.140055,-1.924426,0.489299,0.01,...,178.461990,60102.636719,15984.612305,13437.013672,3.0,0.888970,0.000000,1.078428,0.004533,0.117972
1,38400.785156,1.652859,1.518780,6869.742676,0.528585,1.665649,0.153715,1.519727,0.517078,0.01,...,783.411194,7884.223633,4201.925293,18003.751953,12.0,0.863779,0.017756,-0.063829,0.045438,0.045438
2,23870.822266,-0.104421,0.650640,4165.587891,0.468746,-0.099568,0.127942,0.659688,0.460727,0.01,...,247.017471,8997.914062,8866.488281,7871.465332,5.0,0.696391,0.000000,0.870779,0.075354,0.075354
3,20065.375000,3.540296,-0.125675,0.000000,0.468746,3.598612,0.128710,-0.120809,0.460141,0.01,...,224.390015,9869.475586,481.461517,388.865387,1.0,1.013493,0.000000,0.162639,0.000000,0.000000
4,123364.226562,-1.344482,1.154525,10920.023438,0.488692,-1.343978,0.136743,1.147211,0.479669,0.01,...,5170.814453,44042.378906,165553.234375,76716.382812,3.0,0.324276,0.000000,0.707971,0.019895,0.026191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11951917,114079.367188,-0.394999,-2.638226,16422.587891,0.518612,-0.406506,0.149505,-2.651042,0.507893,0.01,...,372.346924,40667.167969,191986.656250,9667.271484,6.0,0.705080,0.000000,0.798371,0.000000,0.003736
11951918,99418.195312,1.514059,1.055971,14655.088867,0.528585,1.521448,0.152478,1.062383,0.517441,0.01,...,73.819290,34442.792969,122756.203125,18290.041016,6.0,0.659999,0.100663,-0.496756,0.002235,0.019293
11951919,96017.484375,0.754035,1.596357,6563.273926,0.528585,0.755031,0.153312,1.598242,0.517397,0.01,...,229.239685,45135.984375,16261.896484,8211.402344,3.0,0.840508,0.000000,1.625721,0.022706,0.206683
11951920,65371.445312,-0.684227,1.889014,7944.513672,0.488692,-0.693882,0.137595,1.888648,0.479640,0.01,...,187.929688,24728.742188,18120.142578,4310.512695,6.0,0.800413,0.000000,-0.010618,0.036225,0.053988


In [20]:
# Append the EnergyPerSampling columns to the per-jet variables.
# DataFrame.join matches rows on the index; both frames were created without an
# explicit index, one row per jet.
full_df = partial_df.join(e_samp_df)

In [21]:
# Its columns are now part of full_df; drop the separate frame
del e_samp_df

In [22]:
# Display the combined table (per-jet variables plus EnergyPerSampling columns)
full_df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,JetGhostArea,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,90306.070312,0.308789,-1.921283,6840.930176,0.498666,0.318246,0.140055,-1.924426,0.489299,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,38400.785156,1.652859,1.518780,6869.742676,0.528585,1.665649,0.153715,1.519727,0.517078,0.01,...,3180.364258,-56.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,23870.822266,-0.104421,0.650640,4165.587891,0.468746,-0.099568,0.127942,0.659688,0.460727,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,20065.375000,3.540296,-0.125675,0.000000,0.468746,3.598612,0.128710,-0.120809,0.460141,0.01,...,0.000000,0.000000,0.000000,148891.0,-1982.236328,0.0,0.0,0.0,0.0,0.0
4,123364.226562,-1.344482,1.154525,10920.023438,0.488692,-1.343978,0.136743,1.147211,0.479669,0.01,...,57268.914062,39543.589844,1214.614746,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11951917,114079.367188,-0.394999,-2.638226,16422.587891,0.518612,-0.406506,0.149505,-2.651042,0.507893,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951918,99418.195312,1.514059,1.055971,14655.088867,0.528585,1.521448,0.152478,1.062383,0.517441,0.01,...,10682.195312,6861.119141,53.500000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951919,96017.484375,0.754035,1.596357,6563.273926,0.528585,0.755031,0.153312,1.598242,0.517397,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951920,65371.445312,-0.684227,1.889014,7944.513672,0.488692,-0.693882,0.137595,1.888648,0.479640,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [23]:
# Hold out 10% of the jets as a test set. Both frames are split with the same
# seed and test fraction.
partial_train, partial_test = train_test_split(
    partial_df,
    random_state=41,
    test_size=0.1,
)
full_train, full_test = train_test_split(
    full_df,
    random_state=41,
    test_size=0.1,
)
# Report (rows, columns) of each train/test pair
print(*(frame.shape for frame in (partial_train, partial_test)))
print(*(frame.shape for frame in (full_train, full_test)))

(10756729, 29) (1195193, 29)
(10756729, 57) (1195193, 57)


In [24]:
# Save train and test sets
#partial_train.to_pickle('processed_data/aod/all_jets_partial_train.pkl')
#partial_test.to_pickle('processed_data/aod/all_jets_partial_test.pkl')
#full_train.to_pickle('processed_data/aod/all_jets_full_train.pkl')
#full_test.to_pickle('processed_data/aod/all_jets_full_test.pkl')
# train = pd.read_pickle('processed_data/train.pkl')
# test = pd.read_pickle('processed_data/test.pkl')