In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import uproot

In [2]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [4]:
# Directory that holds the AOD ntuples (relative to this notebook)
path_to_data = '../data/aods/'

# Dataset folder and the single ROOT file that is read from it
folder = 'data18_13TeV.00364292.calibration_DataScouting_05_Jets.deriv.DAOD_TRIG6.r10657_p3592_p3754/'
fname = 'DAOD_TRIG6.16825104._000035.pool.root'
# The pieces already carry their trailing slashes, so plain concatenation is enough
filePath = ''.join((path_to_data, folder, fname))

# Open the file and keep a handle on its 'CollectionTree'
#ttree = uproot.open(filePath)['outTree']['nominal']
tree = uproot.open(filePath)['CollectionTree']

In [5]:
# Inspect the names of the branches available in the tree.
tree.keys()

[b'ByteStreamEventInfo',
 b'TrigConfKeys',
 b'EventInfoAux.',
 b'xTrigDecisionAux.',
 b'EventInfo',
 b'xTrigDecision',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAux.',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollection',
 b'EventInfoAuxDyn.streamTagRobs',
 b'EventInfoAuxDyn.streamTagDets',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.eta',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.phi',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.m',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentLinks',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.constituentWeights',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.ConstituentScale',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum_pt',
 b'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.JetEMScaleMomentum

In [6]:
# Total number of jets in the file: `.counts` gives the length of each
# sub-array of the jagged pt branch, so their sum is the overall jet count.
# Use the array's own .sum() so the reduction runs inside NumPy rather than
# iterating over the counts one by one with the Python builtin.
n_jets = tree.array('HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn.pt').counts.sum()

In [7]:
# Show the total jet count computed in the previous cell.
n_jets

11951922

In [18]:
# Branch-name prefix shared by every per-jet variable read below.
prefix = 'HLT_xAOD__JetContainer_TrigHLTJetDSSelectorCollectionAuxDyn'
#prefix = 'HLT_xAOD__JetContainer_a4tcemsubjesISFSAuxDyn'

# Per-jet variables to read, grouped by what they describe.
# Entries left commented out are deliberately not read.
_jet_variables = (
    # 4-momentum
    'pt',
    'eta',
    'phi',
    'm',
    # Energy deposition in each calorimeter layer
    # 'EnergyPerSampling',
    # Area of jet, used for pile-up suppression (4-vector)
    'ActiveArea',
    'ActiveArea4vec_eta',
    'ActiveArea4vec_m',
    'ActiveArea4vec_phi',
    'ActiveArea4vec_pt',
    # 'JetGhostArea',
    # Variables related to quality of jet
    'AverageLArQF',
    # 'BchCorrCell',
    'NegativeE',
    'HECQuality',
    'LArQuality',
    # Shape and position, most energetic cluster
    'Width',
    'WidthPhi',
    'CentroidR',
    'DetectorEta',
    'LeadingClusterCenterLambda',
    'LeadingClusterPt',
    'LeadingClusterSecondLambda',
    'LeadingClusterSecondR',
    'N90Constituents',
    # Energy released in each calorimeter
    'EMFrac',
    'HECFrac',
    # Variables related to the time of arrival of a jet
    'Timing',
    'OotFracClusters10',
    'OotFracClusters5',
)

# Full branch names: "<prefix>.<variable>", in the order listed above.
branchnames = ['{}.{}'.format(prefix, name) for name in _jet_variables]

In [19]:
# Number of branches selected for reading.
len(branchnames)

27

In [20]:
# Turn every selected jagged branch into one flat column (one value per jet)
# and assemble the columns into a single DataFrame.
df_dict = {}
for pp, branchname in enumerate(branchnames):
    # EnergyPerSampling branches are never turned into a column.
    if 'EnergyPerSampling' not in branchname:
        # Column name = the part of the branch name after the container prefix.
        variable = branchname.split('.')[1]
        jaggedX = tree.array(branchname)
        # flatten() concatenates the sub-arrays in order in one vectorised call,
        # instead of appending every value to a Python list one at a time.
        # NOTE(review): the column keeps the dtype returned by flatten() rather
        # than whatever pandas infers from a list of scalars; confirm that
        # downstream code does not rely on a particular dtype.
        df_dict[variable] = jaggedX.flatten()
    # Coarse progress report: one line every third branch.
    if pp % 3 == 0:
        print((pp * 100) // len(branchnames), '%')
print('100%')
print('Creating DataFrame...')
partial_df = pd.DataFrame(data=df_dict)
print('done.')

0 %
11 %
22 %
33 %
44 %
55 %
66 %
77 %
88 %
100%
Creating DataFrame...
done.


In [21]:
# Drop the intermediate dict now that its contents live in partial_df.
del df_dict

In [22]:
# Display the assembled per-jet table.
partial_df

Unnamed: 0,pt,eta,phi,m,ActiveArea,ActiveArea4vec_eta,ActiveArea4vec_m,ActiveArea4vec_phi,ActiveArea4vec_pt,AverageLArQF,...,LeadingClusterCenterLambda,LeadingClusterPt,LeadingClusterSecondLambda,LeadingClusterSecondR,N90Constituents,EMFrac,HECFrac,Timing,OotFracClusters10,OotFracClusters5
0,90306.070312,0.308789,-1.921283,6840.930176,0.498666,0.318246,0.140055,-1.924426,0.489299,160.179611,...,178.461990,60102.636719,15984.612305,13437.013672,3.0,0.888970,0.000000,1.078428,0.004533,0.117972
1,38400.785156,1.652859,1.518780,6869.742676,0.528585,1.665649,0.153715,1.519727,0.517078,959.112549,...,783.411194,7884.223633,4201.925293,18003.751953,12.0,0.863779,0.017756,-0.063829,0.045438,0.045438
2,23870.822266,-0.104421,0.650640,4165.587891,0.468746,-0.099568,0.127942,0.659688,0.460727,96.914841,...,247.017471,8997.914062,8866.488281,7871.465332,5.0,0.696391,0.000000,0.870779,0.075354,0.075354
3,20065.375000,3.540296,-0.125675,0.000000,0.468746,3.598612,0.128710,-0.120809,0.460141,122.971771,...,224.390015,9869.475586,481.461517,388.865387,1.0,1.013493,0.000000,0.162639,0.000000,0.000000
4,123364.226562,-1.344482,1.154525,10920.023438,0.488692,-1.343978,0.136743,1.147211,0.479669,170.411270,...,5170.814453,44042.378906,165553.234375,76716.382812,3.0,0.324276,0.000000,0.707971,0.019895,0.026191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11951917,114079.367188,-0.394999,-2.638226,16422.587891,0.518612,-0.406506,0.149505,-2.651042,0.507893,163.736084,...,372.346924,40667.167969,191986.656250,9667.271484,6.0,0.705080,0.000000,0.798371,0.000000,0.003736
11951918,99418.195312,1.514059,1.055971,14655.088867,0.528585,1.521448,0.152478,1.062383,0.517441,241.464371,...,73.819290,34442.792969,122756.203125,18290.041016,6.0,0.659999,0.100663,-0.496756,0.002235,0.019293
11951919,96017.484375,0.754035,1.596357,6563.273926,0.528585,0.755031,0.153312,1.598242,0.517397,269.018890,...,229.239685,45135.984375,16261.896484,8211.402344,3.0,0.840508,0.000000,1.625721,0.022706,0.206683
11951920,65371.445312,-0.684227,1.889014,7944.513672,0.488692,-0.693882,0.137595,1.888648,0.479640,1473.081909,...,187.929688,24728.742188,18120.142578,4310.512695,6.0,0.800413,0.000000,-0.010618,0.036225,0.053988


In [23]:
# List the column names of the per-jet table.
partial_df.columns

Index(['pt', 'eta', 'phi', 'm', 'ActiveArea', 'ActiveArea4vec_eta',
       'ActiveArea4vec_m', 'ActiveArea4vec_phi', 'ActiveArea4vec_pt',
       'AverageLArQF', 'NegativeE', 'HECQuality', 'LArQuality', 'Width',
       'WidthPhi', 'CentroidR', 'DetectorEta', 'LeadingClusterCenterLambda',
       'LeadingClusterPt', 'LeadingClusterSecondLambda',
       'LeadingClusterSecondR', 'N90Constituents', 'EMFrac', 'HECFrac',
       'Timing', 'OotFracClusters10', 'OotFracClusters5'],
      dtype='object')

In [24]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
partial_train, partial_test = train_test_split(
    partial_df,
    test_size=0.2,
    random_state=41,
)
#full_train, full_test = train_test_split(full_df, test_size=0.2, random_state=41)

# Report the (rows, columns) shape of both parts on one line.
print(*(part.shape for part in (partial_train, partial_test)))
#print(full_train.shape, full_test.shape)

(9561537, 27) (2390385, 27)


In [25]:
# Draw a 10% random subsample of each split (same seed for both) and renumber
# the rows from 0 so the subsamples carry a clean index.
partial_train_10percent, partial_test_10percent = (
    subset.sample(frac=0.1, random_state=42).reset_index(drop=True)
    for subset in (partial_train, partial_test)
)

In [25]:
# Save train and test sets

#partial_train_20percent.to_pickle('processed_data/aod/all_jets_partial_train_20percent.pkl')
#partial_test_20percent.to_pickle('processed_data/aod/all_jets_partial_test_20percent.pkl')

#partial_train_10percent.to_pickle('processed_data/aod/all_jets_partial_train_10percent.pkl')
#partial_test_10percent.to_pickle('processed_data/aod/all_jets_partial_test_10percent.pkl')

#partial_train_5percent.to_pickle('processed_data/aod/all_jets_partial_train_5percent.pkl')
#partial_test_5percent.to_pickle('processed_data/aod/all_jets_partial_test_5percent.pkl')

#partial_train.to_pickle('processed_data/aod/all_jets_partial_train.pkl')
#partial_test.to_pickle('processed_data/aod/all_jets_partial_test.pkl')
#full_train.to_pickle('processed_data/aod/all_jets_full_train.pkl')
#full_test.to_pickle('processed_data/aod/all_jets_full_test.pkl')
# train = pd.read_pickle('processed_data/train.pkl')
# test = pd.read_pickle('processed_data/test.pkl')

In [23]:
# Persist the complete (unsplit) per-jet table as a pickle file.
# NOTE(review): the file name says "mc_1", but the ROOT file loaded above sits
# in a `data18_13TeV...` folder — confirm the intended output name.
# NOTE(review): this cell's execution count is lower than that of the cells
# before it, so it was run out of order; re-check with Restart & Run All.
partial_df.to_pickle('processed_data/aod/mc_1.pkl')