In [1]:
%matplotlib inline

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import uproot

In [2]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [3]:
# Location of the Monte Carlo DAOD_EXOT2 sample, relative to this notebook.
path_to_data = '../data/MC/DAOD_EXOT2/'
# Dataset directory followed by the ROOT file inside it; the literal is split
# over several lines purely for readability (adjacent strings are concatenated).
fname = (
    'mc16_13TeV.307791.MGPy8EG_N30LO_A14N23LO_DMsA_dijet_mR0p6_gSM0p05'
    '.deriv.DAOD_EXOT2.e5687_a875_r9364_p3654'
    '/DAOD_EXOT2.15738457._000001.pool.root'
)
filePath = path_to_data + fname
# Alternative tree layout, currently unused:
# ttree = uproot.open(filePath)['outTree']['nominal']
# Open the ROOT file and keep a handle on its 'CollectionTree'.
tree = uproot.open(filePath)['CollectionTree']

In [4]:
# List the names of everything stored in the CollectionTree, to see which
# branches are available for reading.
tree.keys()

[b'McEventInfo',
 b'TrigConfKeys',
 b'EventInfoAux.',
 b'xTrigDecisionAux.',
 b'EventInfo',
 b'xTrigDecision',
 b'Kt4EMTopoOriginEventShapeAux.',
 b'Kt4EMTopoOriginEventShape',
 b'PrimaryVertices',
 b'MET_Track',
 b'MuonSegments',
 b'LVL1JetRoIs',
 b'TruthEvents',
 b'BTagging_AntiKt4EMTopoAux.',
 b'HLT_xAOD__JetContainer_GSCJetAux.',
 b'HLT_xAOD__JetContainer_a4tcemsubjesISFSAux.',
 b'LVL1JetRoIsAux.',
 b'MET_TrackAux.',
 b'MuonSegmentsAux.',
 b'PrimaryVerticesAux.',
 b'TruthEventsAux.',
 b'TruthParticlesAux.',
 b'TruthParticles',
 b'AntiKt4EMTopoJets',
 b'AntiKt4TruthJets',
 b'HLT_xAOD__JetContainer_GSCJet',
 b'HLT_xAOD__JetContainer_a4tcemsubjesISFS',
 b'BTagging_AntiKt4EMTopo',
 b'AntiKt4EMTopoJetsAux.',
 b'AntiKt4TruthJetsAux.',
 b'AntiKt4EMTopoJetsAuxDyn.JetEMScaleMomentum_pt',
 b'AntiKt4EMTopoJetsAuxDyn.JetEMScaleMomentum_eta',
 b'AntiKt4EMTopoJetsAuxDyn.JetEMScaleMomentum_phi',
 b'AntiKt4EMTopoJetsAuxDyn.JetEMScaleMomentum_m',
 b'AntiKt4EMTopoJetsAuxDyn.JetConstitScaleMomentum_p

In [6]:
# Total number of jets in the file: `.counts` gives the number of jets in each
# event, and the array's own `.sum()` adds them up in compiled code instead of
# iterating over every event in Python as the builtin `sum` would.
n_jets = int(tree.array('HLT_xAOD__JetContainer_a4tcemsubjesISFSAuxDyn.pt').counts.sum())

In [7]:
# Display the total jet count.
n_jets

331851

In [16]:
# Jet collection whose branches are read. Exactly one prefix is active; the
# alternative collection is kept as a comment so switching is a one-line change
# (previously both were assigned and the first value was immediately overwritten).
# prefix = 'HLT_xAOD__JetContainer_GSCJetAuxDyn'
prefix = 'HLT_xAOD__JetContainer_a4tcemsubjesISFSAuxDyn'

# Full branch names (prefix + '.' + variable) of the per-jet quantities to load.
# Commented-out entries are available options that are currently not read.
branchnames = [
    # 4-momentum
    prefix + '.pt',
    prefix + '.eta',
    prefix + '.phi',
    prefix + '.m',
    # Energy deposition in each calorimeter layer
    # prefix + '.EnergyPerSampling',
    # Area of jet, used for pile-up suppression (4-vector)
    prefix + '.ActiveArea',
    prefix + '.ActiveArea4vec_eta',
    prefix + '.ActiveArea4vec_m',
    prefix + '.ActiveArea4vec_phi',
    prefix + '.ActiveArea4vec_pt',
    # prefix + '.JetGhostArea',
    # Variables related to quality of jet
    prefix + '.AverageLArQF',
    # prefix + '.BchCorrCell',
    prefix + '.NegativeE',
    prefix + '.HECQuality',
    prefix + '.LArQuality',
    # Shape and position, most energetic cluster
    # prefix + '.Width',
    # prefix + '.WidthPhi',
    prefix + '.CentroidR',
    prefix + '.DetectorEta',
    prefix + '.LeadingClusterCenterLambda',
    prefix + '.LeadingClusterPt',
    prefix + '.LeadingClusterSecondLambda',
    prefix + '.LeadingClusterSecondR',
    prefix + '.N90Constituents',
    # Energy released in each calorimeter
    prefix + '.EMFrac',
    prefix + '.HECFrac',
    # Variables related to the time of arrival of a jet
    prefix + '.Timing',
    prefix + '.OotFracClusters10',
    prefix + '.OotFracClusters5',
]

In [17]:
# Sanity check: how many branches are selected for reading.
len(branchnames)

25

In [11]:
# Read the per-layer energies by branch name rather than by position in
# `branchnames`: with '.EnergyPerSampling' commented out of that list,
# branchnames[4] is '.ActiveArea', so positional indexing reads the wrong branch.
EnergyPerSampling = tree.array(prefix + '.EnergyPerSampling')
# The array returned by tree.array has one entry per event.
n_events = len(EnergyPerSampling)

In [11]:
# Display the number of entries in EnergyPerSampling (one per event).
n_events

2260895

In [12]:
# Unpack the per-event EnergyPerSampling entries into one dense (n_jets, 28)
# array with one row per jet. Rows start at -1 so that any row that is never
# written stands out.
arr = -np.ones(shape=(n_jets, 28))
curr_i = 0  # index of the next unwritten row of `arr`
for ii, layers in enumerate(EnergyPerSampling):
    layers = np.array(layers)
    n_jets_curr = layers.shape[0]
    # An event without jets is expected to convert to an array of shape (0,),
    # which cannot be broadcast into the (0, 28) slice below; skip the write
    # for such events instead of raising.
    if n_jets_curr > 0:
        arr[curr_i:curr_i + n_jets_curr, :] = layers
        curr_i = curr_i + n_jets_curr
    # Coarse progress report every 300000 events.
    if ii % 300000 == 0:
        print(str((ii * 100) // len(EnergyPerSampling)) + '%')
print('100%')
# n_jets is computed from a different branch; if it disagrees with the number
# of rows actually written, some rows would silently keep their -1 filler.
assert curr_i == n_jets, (
    'filled %d rows but n_jets is %d; n_jets and EnergyPerSampling '
    'must come from the same jet collection' % (curr_i, n_jets)
)
arr.shape

0%
13%
26%
39%
53%
66%
79%
92%
100%


(11951922, 28)

In [13]:
# The values have been copied row by row into `arr`, so the per-event array is
# no longer needed.
del EnergyPerSampling  # Free up memory

In [14]:
# Wrap the dense per-jet array in a DataFrame whose 28 columns are named
# EnergyPerSampling0 ... EnergyPerSampling27.
e_samp_df = pd.DataFrame(
    data=arr,
    columns=['EnergyPerSampling%d' % layer for layer in range(28)],
)

In [15]:
# Drop the `arr` name now that the data is reachable through e_samp_df.
# NOTE(review): whether this actually releases memory depends on whether the
# DataFrame copied the buffer - not verified here.
del arr

In [16]:
# Preview the per-layer energy table: one row per jet, one column per
# EnergyPerSampling index.
e_samp_df

Unnamed: 0,EnergyPerSampling0,EnergyPerSampling1,EnergyPerSampling2,EnergyPerSampling3,EnergyPerSampling4,EnergyPerSampling5,EnergyPerSampling6,EnergyPerSampling7,EnergyPerSampling8,EnergyPerSampling9,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,4463.817383,23248.812500,32758.755859,175.359970,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,5393.272461,396.338531,124.353470,0.000000,1352.804565,12997.504883,29439.746094,1339.140869,574.071777,475.165771,...,3180.364258,-56.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,646.858521,913.609863,6966.411621,491.769287,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,148891.0,-1982.236328,0.0,0.0,0.0,0.0,0.0
4,700.465637,2787.192871,39659.492188,3555.119873,1273.725708,1467.259521,1702.037476,25.156708,0.000000,0.000000,...,57268.914062,39543.589844,1214.614746,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11951917,4110.071289,16758.707031,35582.375000,1644.797485,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951918,7386.522949,1641.392212,5150.359863,-16.984421,8081.199707,19011.074219,42899.199219,4962.972656,12542.588867,1560.480591,...,10682.195312,6861.119141,53.500000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951919,6112.090820,14053.548828,43245.312500,3891.802734,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951920,3078.421631,14908.059570,22810.492188,1308.642822,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [18]:
# Build one flat, per-jet column for every selected branch: each branch is read
# as a per-event collection of jets and unrolled, event by event, into a list.
df_dict = {}
for pp, branchname in enumerate(branchnames):
    # EnergyPerSampling is handled in its own cells, so it is skipped here.
    if 'EnergyPerSampling' not in branchname:
        # Column name = branch name without the collection prefix.
        variable = branchname.split('.')[1]
        jaggedX = tree.array(branchname)
        df_dict[variable] = [val for jets in jaggedX for val in jets]
    # Progress report after every third branch.
    if pp % 3 == 0:
        print((pp * 100) // len(branchnames), '%')
print('100%')
print('Creating DataFrame...')
partial_df = pd.DataFrame(data=df_dict)
print('done.')

0 %
12 %
24 %
36 %
48 %
60 %
72 %
84 %
96 %
100%
Creating DataFrame...
done.


In [18]:
# partial_df has been built from the column lists; drop the intermediate dict.
del df_dict

In [19]:
# Preview the flat per-jet table (one row per jet, one column per branch).
partial_df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,AverageLArQF,...,LeadingClusterCenterLambda,LeadingClusterPt,LeadingClusterSecondLambda,LeadingClusterSecondR,N90Constituents,EMFrac,HECFrac,Timing,OotFracClusters10,OotFracClusters5
0,282143.906250,0.357106,2.052783,17815.330078,0.478719,0.387840,0.129280,2.034354,0.470565,43.899895,...,207.413467,184662.375000,13741.747070,4770.598145,3.0,0.791751,0.000000,-0.064632,0.004992,0.004992
1,268090.562500,1.286533,-1.037295,10843.919922,0.488692,1.316783,0.138009,-1.020096,0.479335,24.749546,...,348.538910,208915.296875,224177.031250,3444.697021,2.0,0.860485,0.000000,0.002606,0.000000,0.000000
2,18386.099609,4.203372,-0.821869,0.008074,0.538559,4.247578,0.152571,-0.836452,0.527718,730.156921,...,250.539703,20994.333984,6809.212891,1958.305054,1.0,1.015144,0.000000,-2.422421,0.000000,0.000000
3,26614.486328,1.413496,-2.393104,4689.978516,0.508639,1.365276,0.142169,-2.394876,0.498595,16.903849,...,226.911224,6079.817871,21261.664062,2442.945068,9.0,0.872794,0.082173,-0.170058,0.098625,0.098625
4,21262.230469,-0.447654,0.755479,5188.740234,0.558505,-0.492631,0.167659,0.790334,0.546490,81.829330,...,123.314041,3226.527832,19384.222656,6827.977539,10.0,0.875294,0.000000,0.009573,0.120956,0.168909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331846,10438.790039,-0.460442,1.021830,2340.786377,0.428852,-0.484422,0.114543,1.048390,0.421416,144.674011,...,267.776001,1762.479370,7203.634277,1960.649292,11.0,1.000000,0.000000,-4.382562,0.271414,0.271414
331847,13412.863281,0.178752,0.780650,2090.023926,0.339093,0.082899,0.082987,0.747995,0.332849,19.225246,...,225.900711,6032.415527,25921.080078,9205.497070,5.0,0.977427,0.000000,0.885477,0.000000,0.000000
331848,14762.843750,0.658686,-0.273598,3612.109619,0.349066,0.597585,0.089635,-0.243135,0.342878,36.474689,...,210.558228,3615.485596,38916.886719,7059.975586,8.0,0.850200,0.000000,0.023317,0.000000,0.000000
331849,3734.305176,3.817188,1.131771,628.705444,0.478719,3.771985,0.132169,1.124418,0.468688,315.682465,...,388.142609,8128.330078,57445.238281,1519.085205,2.0,0.713273,0.000000,5.877642,0.000000,0.805893


In [20]:
# Column names are the branch names with the collection prefix stripped.
partial_df.columns

Index(['pt', 'eta', 'phi', 'm', 'ActiveArea', 'ActiveArea4vec_eta',
       'ActiveArea4vec_m', 'ActiveArea4vec_phi', 'ActiveArea4vec_pt',
       'AverageLArQF', 'NegativeE', 'HECQuality', 'LArQuality', 'CentroidR',
       'DetectorEta', 'LeadingClusterCenterLambda', 'LeadingClusterPt',
       'LeadingClusterSecondLambda', 'LeadingClusterSecondR',
       'N90Constituents', 'EMFrac', 'HECFrac', 'Timing', 'OotFracClusters10',
       'OotFracClusters5'],
      dtype='object')

In [20]:
# DataFrame.join is an index-aligned left join, so it only pairs each jet with
# its own sampling energies when both tables hold the same jets in the same
# order. Tables of different length would be joined without any error (rows
# silently misaligned or padded), so check the lengths explicitly first.
if len(partial_df) != len(e_samp_df):
    raise ValueError(
        'Cannot join per-jet tables of different length: '
        '%d rows in partial_df vs %d rows in e_samp_df'
        % (len(partial_df), len(e_samp_df))
    )
full_df = partial_df.join(e_samp_df)

In [21]:
# The sampling-energy columns are now part of full_df; drop the standalone table.
del e_samp_df

In [22]:
# Preview the combined table (scalar jet variables + per-layer energies).
# NOTE(review): the saved output of this cell shows a JetGhostArea column, which
# is commented out in `branchnames` - the output appears to come from an earlier
# run and should be refreshed by re-executing the notebook from the top.
full_df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,JetGhostArea,...,EnergyPerSampling18,EnergyPerSampling19,EnergyPerSampling20,EnergyPerSampling21,EnergyPerSampling22,EnergyPerSampling23,EnergyPerSampling24,EnergyPerSampling25,EnergyPerSampling26,EnergyPerSampling27
0,90306.070312,0.308789,-1.921283,6840.930176,0.498666,0.318246,0.140055,-1.924426,0.489299,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,38400.785156,1.652859,1.518780,6869.742676,0.528585,1.665649,0.153715,1.519727,0.517078,0.01,...,3180.364258,-56.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,23870.822266,-0.104421,0.650640,4165.587891,0.468746,-0.099568,0.127942,0.659688,0.460727,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,20065.375000,3.540296,-0.125675,0.000000,0.468746,3.598612,0.128710,-0.120809,0.460141,0.01,...,0.000000,0.000000,0.000000,148891.0,-1982.236328,0.0,0.0,0.0,0.0,0.0
4,123364.226562,-1.344482,1.154525,10920.023438,0.488692,-1.343978,0.136743,1.147211,0.479669,0.01,...,57268.914062,39543.589844,1214.614746,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11951917,114079.367188,-0.394999,-2.638226,16422.587891,0.518612,-0.406506,0.149505,-2.651042,0.507893,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951918,99418.195312,1.514059,1.055971,14655.088867,0.528585,1.521448,0.152478,1.062383,0.517441,0.01,...,10682.195312,6861.119141,53.500000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951919,96017.484375,0.754035,1.596357,6563.273926,0.528585,0.755031,0.153312,1.598242,0.517397,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
11951920,65371.445312,-0.684227,1.889014,7944.513672,0.488692,-0.693882,0.137595,1.888648,0.479640,0.01,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [21]:
# Hold out 20% of the jets as a test set; the fixed random_state makes the
# split reproducible between runs.
partial_train, partial_test = train_test_split(
    partial_df,
    test_size=0.2,
    random_state=41,
)
# Same split for the table that includes the sampling energies (disabled):
# full_train, full_test = train_test_split(full_df, test_size=0.2, random_state=41)
print(partial_train.shape, partial_test.shape)
# print(full_train.shape, full_test.shape)

(265480, 25) (66371, 25)


In [25]:
# Save train and test sets
#
# Every statement in this cell is commented out, so running it writes and reads
# nothing. The *_20percent / *_10percent / *_5percent frames referenced below
# are not created in the visible part of this notebook; they would have to be
# defined before the corresponding lines are re-enabled.

#partial_train_20percent.to_pickle('processed_data/aod/all_jets_partial_train_20percent.pkl')
#partial_test_20percent.to_pickle('processed_data/aod/all_jets_partial_test_20percent.pkl')

#partial_train_10percent.to_pickle('processed_data/aod/all_jets_partial_train_10percent.pkl')
#partial_test_10percent.to_pickle('processed_data/aod/all_jets_partial_test_10percent.pkl')

#partial_train_5percent.to_pickle('processed_data/aod/all_jets_partial_train_5percent.pkl')
#partial_test_5percent.to_pickle('processed_data/aod/all_jets_partial_test_5percent.pkl')

#partial_train.to_pickle('processed_data/aod/all_jets_partial_train.pkl')
#partial_test.to_pickle('processed_data/aod/all_jets_partial_test.pkl')
#full_train.to_pickle('processed_data/aod/all_jets_full_train.pkl')
#full_test.to_pickle('processed_data/aod/all_jets_full_test.pkl')
# train = pd.read_pickle('processed_data/train.pkl')
# test = pd.read_pickle('processed_data/test.pkl')

In [23]:
# Persist the flat per-jet table. to_pickle does not create missing parent
# directories, so make sure the output folder exists first; exist_ok=True makes
# this a no-op when the folder is already there.
output_path = 'processed_data/aod/mc_1.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
partial_df.to_pickle(output_path)