In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import uproot

In [2]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [3]:
path_to_data = '../data/aods/'

# Load a ROOT file
folder = 'data18_13TeV.00364292.calibration_DataScouting_05_Jets.deriv.DAOD_TRIG6.r10657_p3592_p3754/'
fname = 'DAOD_TRIG6.16825104._000035.pool.root.1'
filePath = path_to_data + folder + fname
#ttree = uproot.open(filePath)['outTree']['nominal']
tree = uproot.open(filePath)['CollectionTree']

In [4]:
tree.keys()

[b'ByteStreamEventInfo',
 b'TrigConfKeys',
 b'EventInfoAux.',
 b'xTrigDecisionAux.',
 b'EventInfo',
 b'xTrigDecision',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAux.',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollection',
 b'EventInfoAuxDyn.streamTagRobs',
 b'EventInfoAuxDyn.streamTagDets',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.eta',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.phi',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.m',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentLinks',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentWeights',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ConstituentScale',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum_pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum

In [5]:
tree.array('HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt').shape

(2260895,)

In [6]:
branchnames = [
    # 4-momentum
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.eta',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.phi',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.m',
    # Energy deposition in each calorimeter layer
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.EnergyPerSampling',
    # Area of jet,used for pile-up suppression (4-vector)
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ActiveArea',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ActiveArea4vec_eta',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ActiveArea4vec_m',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ActiveArea4vec_phi',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ActiveArea4vec_pt',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetGhostArea',
    # Variables related to quality of jet
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.AverageLArQF',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.BchCorrCell',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.NegativeE',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.HECQuality',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.LArQuality',
    # Shape and position, most energetic cluster
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.Width',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.WidthPhi',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.CentroidR',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.DetectorEta',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.LeadingClusterCenterLambda',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.LeadingClusterPt',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.LeadingClusterSecondLambda',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.LeadingClusterSecondR',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.N90Constituents',
    # Energy released in each calorimeter
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.EMFrac',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.HECFrac',
    # Variables related to the time of arrival of a jet
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.Timing',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.OotFracClusters10',
    'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.OotFracClusters5',
]

In [7]:
len(branchnames)

30

In [17]:
EnergyPerSampling = tree.array(branchnames[4])
n_events = len(EnergyPerSampling)
counts = EnergyPerSampling.counts
n_events

2260895

In [24]:
len(EnergyPerSampling[0])

4

In [39]:
arr = -np.ones(shape=(1, 28))
for ii, layers in enumerate(EnergyPerSampling):
    tmp = np.array(layers)
    arr = np.concatenate((arr, tmp))
    if ii % 300000 == 0:
        print(str((ii * 100) // len(EnergyPerSampling)) + '%')
print('100%')
arr = arr[1:, :]
arr.shape

0%


KeyboardInterrupt: 

In [38]:
e_samp_df2 = pd.DataFrame(data=arr, columns=['EnergyPerSampling%d' % kk for kk in np.arange(28)])
e_samp_df2.head()

Unnamed: 0,EnergyPerSampling0,EnergyPerSampling1,EnergyPerSampling2,EnergyPerSampling3,EnergyPerSampling4,EnergyPerSampling5,EnergyPerSampling6,EnergyPerSampling7,EnergyPerSampling8,EnergyPerSampling9,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,60041.632812,45780.460938,60179.554688,464.675781,265.497253,15695.404297,59999.566406,730.045166,2815.054443,-247.458008,...,51526.328125,7077.004395,-7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4143.124023,19081.357422,63302.40625,4952.949707,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7071.717773,1991.560059,3096.794922,-8.109276,-200.751373,33.934982,358.383514,0.0,0.0,0.0,...,702.696899,2830.165527,223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,13921.581055,34272.847656,11452.476562,10521.869141,17.560242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
e_samp_df = pd.DataFrame(data=arr1, columns=['EnergyPerSampling%d' % kk for kk in np.arange(28)])
e_samp_df.head()

Unnamed: 0,EnergyPerSampling0,EnergyPerSampling1,EnergyPerSampling2,EnergyPerSampling3,EnergyPerSampling4,EnergyPerSampling5,EnergyPerSampling6,EnergyPerSampling7,EnergyPerSampling8,EnergyPerSampling9,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,60041.632812,45780.460938,60179.554688,464.675781,265.497253,15695.404297,59999.566406,730.045166,2815.054443,-247.458008,...,51526.328125,7077.004395,-7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4143.124023,19081.357422,63302.40625,4952.949707,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7071.717773,1991.560059,3096.794922,-8.109276,-200.751373,33.934982,358.383514,0.0,0.0,0.0,...,702.696899,2830.165527,223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,13921.581055,34272.847656,11452.476562,10521.869141,17.560242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,63568.835938,19498.246094,35779.4375,28265.224609,...,0.0,0.0,0.0,-153.426895,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
def get_leading(jaggedX):
    return jaggedX[jaggedX.counts > 0, 0]

In [61]:
df_dict = {}
for pp, branchname in enumerate(branchnames):
    if branchname == 'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.EnergyPerSampling':
        pass
    else:
        variable = branchname.split('.')[1]
        df_dict[variable] = []
        jaggedX = tree.array(branchname)
        for ii, arr in enumerate(jaggedX):
            for kk, val in enumerate(arr):
                df_dict[variable].append(val)
    if pp % 8 == 0:
        print((pp * 100) // len(branchnames), '%')
print('100%')
df = pd.DataFrame(data=df_dict)

0 %
26 %
53 %
80 %
100%


In [62]:
df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,JetGhostArea,...,LeadingClusterCenterLambda,LeadingClusterPt,LeadingClusterSecondLambda,LeadingClusterSecondR,N90Constituents,EMFrac,HECFrac,Timing,OotFracClusters10,OotFracClusters5
0,228545.531250,-1.438315,2.711860,23883.824219,0.478719,-1.456703,0.134253,2.713455,0.468385,0.01,...,1252.707642,40018.570312,477528.312500,60510.593750,8.0,0.702820,0.007421,-0.236629,0.000762,0.006073
1,198106.546875,-0.439318,-0.421546,18192.466797,0.518612,-0.442390,0.149487,-0.441860,0.508819,0.01,...,549.581177,123063.882812,220408.500000,30249.955078,3.0,0.595072,0.000000,1.847262,0.005879,0.027711
2,23818.916016,-1.322502,1.463485,4526.073730,0.508639,-1.312496,0.141916,1.468495,0.498780,0.01,...,63.709171,6762.835938,57776.781250,8843.713867,7.0,0.532229,0.000000,-10.493912,0.269108,0.269108
3,22430.296875,2.188443,-1.832321,4748.524414,0.468746,2.185287,0.127859,-1.825896,0.459480,0.01,...,428.162018,5631.054688,40596.738281,3203.826172,10.0,0.849836,0.150164,1.192391,0.000000,0.116424
4,22399.556641,2.959124,-2.578426,4280.812988,0.548532,2.945786,0.160414,-2.583863,0.536613,0.01,...,678.729553,8764.081055,16676.707031,12093.711914,4.0,0.554876,0.445124,-2.428177,0.010300,0.504123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139718,23053.271484,-0.831095,2.690083,6197.927734,0.568479,-0.778817,0.172646,2.664963,0.556712,0.01,...,100.431648,5807.024414,16544.058594,5922.724609,13.0,0.703291,0.000000,-0.640880,0.307621,0.307621
139719,22777.830078,-1.317536,-2.151809,5117.765137,0.528585,-1.320784,0.151807,-2.155239,0.517295,0.01,...,30.502775,4636.035156,30781.945312,11139.019531,10.0,0.787942,0.055297,-7.067620,0.457355,0.584896
139720,22062.843750,-0.765144,-1.261964,5821.605469,0.578452,-0.779594,0.174805,-1.257300,0.565945,0.01,...,1406.793457,5279.791016,117778.898438,36705.746094,13.0,0.677984,0.000000,-4.346131,0.345292,0.402175
139721,20447.623047,0.802867,-1.422507,5019.024414,0.508639,0.784966,0.149479,-1.420204,0.498070,0.01,...,122.402893,5611.245605,26428.572266,6384.049316,8.0,0.801634,0.000000,-2.179024,0.253344,0.344438


In [63]:
df = df.join(e_samp_df)

In [64]:
df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,JetGhostArea,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,228545.531250,-1.438315,2.711860,23883.824219,0.478719,-1.456703,0.134253,2.713455,0.468385,0.01,...,51526.328125,7077.004395,-7.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,198106.546875,-0.439318,-0.421546,18192.466797,0.518612,-0.442390,0.149487,-0.441860,0.508819,0.01,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,23818.916016,-1.322502,1.463485,4526.073730,0.508639,-1.312496,0.141916,1.468495,0.498780,0.01,...,702.696899,2830.165527,223.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,22430.296875,2.188443,-1.832321,4748.524414,0.468746,2.185287,0.127859,-1.825896,0.459480,0.01,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,22399.556641,2.959124,-2.578426,4280.812988,0.548532,2.945786,0.160414,-2.583863,0.536613,0.01,...,0.000000,0.000000,0.0,-153.426895,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139718,23053.271484,-0.831095,2.690083,6197.927734,0.568479,-0.778817,0.172646,2.664963,0.556712,0.01,...,0.000000,198.500000,682.5,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
139719,22777.830078,-1.317536,-2.151809,5117.765137,0.528585,-1.320784,0.151807,-2.155239,0.517295,0.01,...,1108.077271,-43.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
139720,22062.843750,-0.765144,-1.261964,5821.605469,0.578452,-0.779594,0.174805,-1.257300,0.565945,0.01,...,0.000000,0.000000,-49.5,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
139721,20447.623047,0.802867,-1.422507,5019.024414,0.508639,0.784966,0.149479,-1.420204,0.498070,0.01,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
train, test = train_test_split(df, test_size=0.2, random_state=41)

In [66]:
print(train.shape, test.shape)

(111778, 57) (27945, 57)


In [32]:
# Save train and test sets
#df1.to_pickle('processed_data/dijetISRphoton36k_jets.pkl')
#df2.to_pickle('processed_data/dijetISRphoton36k_photons.pkl')
# train = pd.read_pickle('processed_data/train.pkl')
# test = pd.read_pickle('processed_data/test.pkl')