In [1]:
import os
import glob
import awkward as ak
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import itertools

In [2]:
from coffea.nanoevents import NanoEventsFactory, PFNanoAODSchema
# Switch off the schema's class-level `warn_missing_crossrefs` flag before any file is read.
# NOTE(review): presumably this silences warnings about cross-reference branches that are
# absent from these files — confirm against the coffea documentation.
PFNanoAODSchema.warn_missing_crossrefs = False
import warnings

In [3]:
# Base directory from which the raw-input and preprocessed-output paths are built.
data_dir = '/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn'

In [4]:
# ROOT files are read from `in_dir`; converted output is written under `out_dir`.
in_dir = os.path.join(data_dir, 'raw/dev')
out_dir = os.path.join(data_dir, 'preprocessed/dev')

# glob.glob returns matches in arbitrary (filesystem-dependent) order. Sort the list so
# that it is identical on every run; anything that numbers or pairs the files by list
# position then gets a reproducible input -> output mapping.
root_files = sorted(glob.glob(os.path.join(in_dir, '*.root')))
num_files = len(root_files)

In [5]:
# Create the output directory together with any missing parents. `exist_ok=True` makes a
# re-run a no-op when the directory already exists, but still raises FileExistsError when
# the path exists and is NOT a directory, instead of hiding that until the first write.
os.makedirs(out_dir, exist_ok=True)

In [6]:
import uproot

# Open one raw development file directly with uproot and list the class of every object
# stored in it. `file` is kept at module level because the following cells index into it.
file = uproot.open(os.path.join(data_dir, 'raw/dev/1.root'))
file.classnames()

{'tag': 'TObjString',
 'Events': 'TTree',
 'LuminosityBlocks': 'TTree',
 'Runs': 'TTree',
 'MetaData': 'TTree',
 'ParameterSets': 'TTree'}

In [7]:
# Read every branch of the 'Events' tree whose name matches 'Jet*' and display the result
# as a pandas DataFrame.
ak.to_pandas(file['Events'].arrays(filter_name='Jet*'))

Unnamed: 0_level_0,Unnamed: 1_level_0,JetPFCands_pt,JetPFCands_btagEtaRel,JetPFCands_btagPtRatio,JetPFCands_btagPParRatio,JetPFCands_btagSip3dVal,JetPFCands_btagSip3dSig,JetPFCands_btagJetDistVal,JetPFCands_pFCandsIdx,JetPFCands_jetIdx,JetSVs_mass,...,JetCalo_genJetIdx,JetCalo_hadronFlavour,JetCalo_partonFlavour,JetPuppi_genJetIdx,JetPuppi_hadronFlavour,JetPuppi_partonFlavour,Jet_genJetIdx,Jet_hadronFlavour,Jet_partonFlavour,Jet_cleanmask
entry,subentry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0.613281,2.164062,0.185425,0.982422,1.360352,196.625000,-0.783203,8,0,2.960938,...,1,0,0,1,0,21,1,0,21,1
0,1,2.363281,2.998047,0.091553,0.995605,3.470703,10.468750,-2.367188,14,0,0.595215,...,0,0,0,0,0,2,0,0,2,1
0,2,6.078125,3.232422,0.076904,0.997070,0.005253,1.998047,-0.003897,26,0,2.134766,...,2,0,0,2,0,21,2,0,21,1
1,0,0.533203,1.989258,0.170776,0.985352,-0.009315,-0.952148,-0.008171,6,0,1.037109,...,0,0,0,1,0,2,0,0,2,1
1,1,0.731934,2.035156,0.206665,0.978516,-0.001961,-0.219360,-0.001163,7,0,2.535156,...,1,0,0,0,0,2,1,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,1,0.529785,1.714844,0.280029,0.959961,-0.014038,-1.437500,-0.013992,9,0,0.526367,...,1,0,0,1,0,1,1,0,1,0
99,2,0.593262,1.864258,0.214478,0.976562,-0.004753,-0.503418,-0.003695,10,0,5.605469,...,2,0,0,2,0,21,2,0,21,1
99,3,0.883301,1.724609,0.312256,0.950195,0.005814,0.785156,-0.005394,11,0,7.648438,...,3,0,0,3,0,21,3,0,21,1
99,4,0.586914,1.652344,0.296631,0.955078,-3.994141,-345.500000,-1.142578,24,0,1.969727,...,4,0,0,4,0,2,4,0,2,1


In [8]:
# Same inspection restricted to the branches matching 'JetPFCands*'; the displayed table
# carries both a `pFCandsIdx` and a `jetIdx` column for every row.
ak.to_pandas(file['Events'].arrays(filter_name='JetPFCands*'))

Unnamed: 0_level_0,Unnamed: 1_level_0,JetPFCands_pt,JetPFCands_btagEtaRel,JetPFCands_btagPtRatio,JetPFCands_btagPParRatio,JetPFCands_btagSip3dVal,JetPFCands_btagSip3dSig,JetPFCands_btagJetDistVal,JetPFCands_pFCandsIdx,JetPFCands_jetIdx
entry,subentry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0.613281,2.164062,0.185425,0.982422,1.360352,196.625000,-0.783203,8,0
0,1,2.363281,2.998047,0.091553,0.995605,3.470703,10.468750,-2.367188,14,0
0,2,6.078125,3.232422,0.076904,0.997070,0.005253,1.998047,-0.003897,26,0
0,3,2.050781,3.166016,0.067932,0.997559,0.024399,6.585938,-0.005470,30,0
0,4,14.437500,3.689453,0.049408,0.998535,0.006115,2.253906,-0.005650,31,0
...,...,...,...,...,...,...,...,...,...,...
99,307,1.137695,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,159,12
99,308,1.628906,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,160,12
99,309,0.192383,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,163,12
99,310,1.119141,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,164,12


In [9]:
def read_nanoaod(path):
    """Load the two leading gen-matched jets per event, plus their PF constituents.

    Selection steps, in order:

    1. Open ``path`` through ``NanoEventsFactory`` with ``PFNanoAODSchema``, ignoring
       warnings whose message matches 'found duplicate branch'.
    2. Keep events where ``ak.count`` of ``Jet.matched_gen.pt`` is at least 2
       (``ak.count`` skips missing values, so presumably this means "at least two
       gen-matched jets" — confirm).
    3. Order the jets of each event by descending ``matched_gen.pt`` and take the
       first two, concatenated into one flat collection (all firsts, then all seconds).
    4. Require ``matched_gen.pt > 30`` and ``|matched_gen.eta| < 5``.
    5. Drop entries whose ``matched_gen.pt`` is missing.
    6. Drop jets having any constituent with ``dz``, ``dzErr``, ``d0`` or ``d0Err``
       equal to ``np.inf``. Only positive infinity is tested.

    Parameters
    ----------
    path : path of the ROOT file handed to ``NanoEventsFactory.from_root``.

    Returns
    -------
    tuple
        ``(jets, jets.constituents.pf)`` for the jets that survive the selection.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='found duplicate branch')
        factory = NanoEventsFactory.from_root(path, schemaclass=PFNanoAODSchema)
        events = factory.events()

    # Event-level cut on the number of counted matched-gen pt values.
    n_matched = ak.count(events.Jet.matched_gen.pt, axis=1)
    jets = events.Jet[n_matched >= 2]

    # Rank the jets inside each event by matched gen pt, highest first.
    gen_pt_order = ak.argsort(jets.matched_gen.pt, ascending=False, axis=1)
    ranked = jets[gen_pt_order]

    # Flatten: leading jets of every event followed by the sub-leading ones.
    top_two = ak.concatenate((ranked[:, 0], ranked[:, 1]), axis=0)

    passes_pt = top_two.matched_gen.pt > 30
    passes_eta = abs(top_two.matched_gen.eta) < 5
    kept = top_two[passes_pt & passes_eta]

    has_gen_match = ~ak.is_none(kept.matched_gen.pt)
    kept = kept[has_gen_match]

    # Apply the per-field veto one field at a time, narrowing `kept` on each pass.
    for name in ['dz', 'dzErr', 'd0', 'd0Err']:
        no_pos_inf = ak.all(kept.constituents.pf[name] != np.inf, axis=1)
        kept = kept[no_pos_inf]

    return kept, kept.constituents.pf

In [10]:
def preprocess(jet, pf):
    """Attach the regression target and derived input features to jets and PF candidates.

    Both arguments are modified in place (new fields are assigned on them) and are also
    returned for convenience.

    Fields added to ``jet``:
      * ``target`` - ``jet.matched_gen.pt / jet.pt``, one value per jet.
      * ``pt_log`` - natural logarithm of ``jet.pt``.

    Fields added to ``pf``:
      * ``rel_eta`` - ``pf.eta - jet.eta``, sign-flipped for jets with negative eta.
      * ``rel_pt``  - ``pf.pt / jet.pt``.
      * ``rel_phi`` - ``pf.phi - jet.phi`` wrapped into the interval [-pi, pi).

    Parameters
    ----------
    jet : record array exposing ``pt``, ``eta``, ``phi`` and ``matched_gen.pt``.
    pf : record array of the constituents, exposing ``pt``, ``eta`` and ``phi``; it must
        broadcast against the fields of ``jet``.

    Returns
    -------
    tuple
        ``(jet, pf)`` - the same objects that were passed in, with the new fields set.
    """
    # The target is a jet-level quantity: matched generator pt over reconstructed jet pt.
    # It was previously computed from the per-constituent `pf.pt`, which stored one ratio
    # per PF candidate on the jet record instead of one label per jet.
    # NOTE(review): the direction (gen over reco) is inferred from the `rel_pt`
    # convention below — confirm against the training code.
    jet['target'] = jet.matched_gen.pt / jet.pt
    jet['pt_log'] = np.log(jet.pt)
    # Multiplying by sign(jet.eta) mirrors the offset for jets at negative eta.
    pf['rel_eta'] = (pf.eta - jet.eta) * np.sign(jet.eta)
    pf['rel_pt'] = pf.pt / jet.pt
    # Shift by pi, wrap modulo 2*pi, shift back: maps the difference into [-pi, pi).
    pf['rel_phi'] = (pf.phi - jet.phi + np.pi) % (2 * np.pi) - np.pi
    return jet, pf

In [11]:
def create_dataset(root_file, parquet_dir):
    """Convert one ROOT file into a pair of parquet files.

    The jets and their PF constituents are read with ``read_nanoaod``, passed through
    ``preprocess``, and written to ``jet.parquet`` and ``pf.parquet`` inside
    ``parquet_dir``. The directory is created if it does not exist yet.

    Parameters
    ----------
    root_file : path of the input ROOT file.
    parquet_dir : directory that receives the two parquet files.

    Returns
    -------
    None

    Raises
    ------
    FileExistsError
        If ``parquet_dir`` already exists but is not a directory.
    """
    # Progress marker; printed before any work so that it appears even if reading fails.
    print(parquet_dir + '\n')

    jet, pf = read_nanoaod(root_file)
    jet, pf = preprocess(jet, pf)

    # exist_ok=True tolerates an existing directory (re-runs) but, unlike a blanket
    # `except FileExistsError: pass`, still reports a non-directory occupying the path.
    os.makedirs(parquet_dir, exist_ok=True)

    ak.to_parquet(jet, os.path.join(parquet_dir, 'jet.parquet'))
    ak.to_parquet(pf, os.path.join(parquet_dir, 'pf.parquet'))

In [12]:
# One output directory per input file, numbered from 1 in the order of `root_files`.
parquet_dirs = [os.path.join(out_dir, str(index)) for index in range(1, num_files + 1)]

with ProcessPoolExecutor(max_workers=None) as executor:
    # Executor.map hands back a lazy iterator: an exception raised inside a worker is only
    # re-raised when the corresponding result is retrieved. Consuming the iterator here
    # makes a failed file surface in this cell instead of being silently discarded.
    results = list(executor.map(create_dataset, root_files, parquet_dirs))

/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/1
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/3

/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/2
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/4



/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/5



## cheers