In [1]:
import os
import glob
import awkward as ak
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import itertools

In [2]:
from coffea.nanoevents import NanoEventsFactory, PFNanoAODSchema
PFNanoAODSchema.warn_missing_crossrefs = False
import warnings

In [3]:
# Base directory (under /eos) from which the 'raw' input and 'preprocessed'
# output locations are derived in the next cell.
data_dir = '/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn'

In [4]:
# Name of the sample sub-directory to process.
dataset = 'dev'

in_dir = os.path.join(data_dir, 'raw', dataset)
out_dir = os.path.join(data_dir, 'preprocessed', dataset)

# glob.glob returns matches in arbitrary (filesystem-dependent) order. The
# numbered output directories are assigned by list position, so sort the list
# to keep the file -> directory mapping reproducible across runs.
root_files = sorted(glob.glob(os.path.join(in_dir, '*.root')))
num_files = len(root_files)

In [5]:
# Create the output directory if it is missing. exist_ok=True tolerates an
# existing directory but still raises if the path exists and is not a directory.
os.makedirs(out_dir, exist_ok=True)

In [6]:
def read_nanoaod(path):
    """Read one ROOT file and return the selected jets with their PF candidates.

    Selection steps, in order:
      1. keep events where ``ak.count`` of the jets' matched gen pt is >= 2;
      2. sort the jets of each event by matched gen pt, descending;
      3. take the first two jets of every event and concatenate them into one
         collection (all first jets, followed by all second jets);
      4. keep jets with matched gen pt > 30 and |matched gen eta| < 5;
      5. drop entries whose matched gen pt is None;
      6. drop jets having any PF constituent whose dz, dzErr, d0 or d0Err
         equals +inf (the comparison is against ``np.inf`` only).

    Parameters
    ----------
    path : str
        Location of the ROOT file, passed to ``NanoEventsFactory.from_root``.

    Returns
    -------
    tuple
        ``(jets, jets.constituents.pf)`` for the jets surviving the selection.
    """
    # The 'found duplicate branch' warning is suppressed only while opening the file.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='found duplicate branch')
        factory = NanoEventsFactory.from_root(path, schemaclass=PFNanoAODSchema)
        events = factory.events()

    # Step 1: per-event multiplicity of matched gen pt values.
    n_matched = ak.count(events.Jet.matched_gen.pt, axis=1)
    jets = events.Jet[n_matched >= 2]

    # Step 2: order jets inside each event by matched gen pt, highest first.
    gen_pt_order = ak.argsort(jets.matched_gen.pt, ascending=False, axis=1)
    jets_by_gen_pt = jets[gen_pt_order]

    # Step 3: the two leading jets of each event, stacked along the first axis.
    first_jets = jets_by_gen_pt[:, 0]
    second_jets = jets_by_gen_pt[:, 1]
    leading = ak.concatenate((first_jets, second_jets), axis=0)

    # Step 4: kinematic cuts on the matched gen jet.
    passes_pt = leading.matched_gen.pt > 30
    passes_eta = abs(leading.matched_gen.eta) < 5
    selected = leading[passes_pt & passes_eta]

    # Step 5: remove entries without a matched gen pt.
    has_gen_pt = ~ak.is_none(selected.matched_gen.pt)
    result = selected[has_gen_pt]

    # Step 6: one impact-parameter field at a time, reject jets containing
    # a constituent whose value equals +inf.
    for field in ['dz', 'dzErr', 'd0', 'd0Err']:
        all_not_inf = ak.all(result.constituents.pf[field] != np.inf, axis=1)
        result = result[all_not_inf]

    return result, result.constituents.pf

In [7]:
# Open the first input file at notebook level, suppressing the
# 'found duplicate branch' warning while doing so.
# NOTE(review): `events` is not referenced by any later cell visible here
# (read_nanoaod builds its own local copy) - confirm this cell is still needed.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', message='found duplicate branch')
    events = NanoEventsFactory.from_root(root_files[0], schemaclass=PFNanoAODSchema).events()

In [8]:
def preprocess(jet, pf):
    """Attach derived fields to the jets and to their PF candidates.

    Fields added to ``jet``:
      * ``target`` - matched gen pt divided by the jet pt;
      * ``log_pt`` - natural logarithm of the jet pt.

    Fields added to ``pf``:
      * ``rel_eta`` - candidate eta minus jet eta, multiplied by the sign of
        the jet eta;
      * ``rel_pt``  - candidate pt divided by the jet pt;
      * ``rel_phi`` - candidate phi minus jet phi, wrapped into [-pi, pi).

    Both inputs are modified in place (new fields are assigned on them) and
    the same two objects are returned as ``(jet, pf)``.
    """
    jet_pt = jet.pt
    jet_eta = jet.eta
    jet_phi = jet.phi
    full_turn = 2 * np.pi

    # Jet-level quantities.
    jet['target'] = jet.matched_gen.pt / jet_pt
    jet['log_pt'] = np.log(jet_pt)

    # Candidate quantities expressed relative to the parent jet.
    eta_offset = pf.eta - jet_eta
    pf['rel_eta'] = eta_offset * np.sign(jet_eta)
    pf['rel_pt'] = pf.pt / jet_pt

    # Shift by pi, reduce modulo a full turn, shift back: result lies in [-pi, pi).
    phi_shifted = pf.phi - jet_phi + np.pi
    pf['rel_phi'] = phi_shifted % full_turn - np.pi

    return jet, pf

### Embeddings

In [9]:
import tensorflow as tf

# Toy illustration of an embedding layer: six integer ids drawn from {0, 1, 2}
# are passed through Embedding(3, 5), which yields one 5-component vector per id.
# The model is never trained, so the printed numbers are just the layer's
# initial weights; identical ids produce identical vectors.
input_array = np.array([0, 1, 2, 1, 2, 0])
model = tf.keras.Sequential([tf.keras.layers.Embedding(3, 5)])
model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
print(output_array)

[[[ 0.01463919  0.04959511  0.01747364  0.04252677 -0.02428002]]

 [[-0.02592992 -0.03550162  0.045031   -0.04034467  0.03529284]]

 [[-0.00379109  0.02771599 -0.03886534  0.03927812 -0.04328855]]

 [[-0.02592992 -0.03550162  0.045031   -0.04034467  0.03529284]]

 [[-0.00379109  0.02771599 -0.03886534  0.03927812 -0.04328855]]

 [[ 0.01463919  0.04959511  0.01747364  0.04252677 -0.02428002]]]


### One-hot encoding

In [10]:
# Load the selected jets and their PF candidates from the first input file,
# then add the derived fields computed by preprocess().
jet, pf = read_nanoaod(root_files[0])
jet, pf = preprocess(jet, pf)

In [11]:
def one_hot_encode(array, categories):
    """One-hot encode the values of ``array`` against ``categories``.

    Parameters
    ----------
    array : array-like
        Values to encode. Every value must appear in ``categories``.
        The input is never modified.
    categories : sequence
        Hashable category values. Column ``k`` of the result corresponds to
        ``categories[k]``; if a value is listed more than once, its last
        position is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``array.shape + (len(categories),)`` holding a
        single 1.0 per encoded value and 0.0 elsewhere. An empty input gives
        an array with zero rows.

    Raises
    ------
    KeyError
        If ``array`` contains a value that is not in ``categories``.
    """
    cardinality = len(categories)
    category_map = dict(zip(categories, range(cardinality)))

    # Translate each distinct value once instead of looping over every element,
    # and build a separate integer index array rather than overwriting the
    # caller's data (which also keeps float or empty inputs usable as indices).
    values = np.asarray(array).ravel()
    unique_values, inverse = np.unique(values, return_inverse=True)
    codes = np.array(
        [category_map[value] for value in unique_values.tolist()],
        dtype=np.intp,
    )
    indices = codes[inverse].reshape(np.shape(array))
    return np.eye(cardinality)[indices]

In [13]:
# Category values assumed for jet.puId - TODO confirm they cover every value
# present in the data (a value missing from this list raises KeyError).
categories = [0, 4, 6, 7]
# Encode once; each column becomes its own jet field puId_0 ... puId_3.
encoded_matrix = one_hot_encode(np.array(jet.puId), categories)
for i in range(len(categories)):
    jet[f'puId_{i}'] = encoded_matrix[:,i]

In [15]:
# one_hot_encode looks up every element as a scalar, so the nested PF
# collection is flattened before encoding and re-nested afterwards using the
# per-entry counts recorded up front.
categories = [-1, 0, 1]
counts = ak.num(pf)
flat_pf = ak.flatten(pf)
encoded_matrix = one_hot_encode(np.array(flat_pf.charge), categories)
# One new field per category column: charge_0, charge_1, charge_2.
for i in range(encoded_matrix.shape[1]):
    flat_pf[f'charge_{i}'] = encoded_matrix[:, i]
pf = ak.unflatten(flat_pf, counts)

### Write Parquet datasets

In [18]:
def create_dataset(root_file, parquet_dir):
    """Convert one ROOT file into ``jet.parquet`` and ``pf.parquet``.

    The file is read with ``read_nanoaod``, the derived fields are added by
    ``preprocess``, and the two resulting arrays are written into
    ``parquet_dir`` (created if it does not exist yet).

    NOTE(review): only ``read_nanoaod`` and ``preprocess`` are applied here;
    no one-hot encoded fields are added before writing - confirm this is
    intended.

    Parameters
    ----------
    root_file : str
        Path of the input ROOT file.
    parquet_dir : str
        Directory that receives ``jet.parquet`` and ``pf.parquet``.
    """
    # Progress output: one line per processed file.
    print(parquet_dir)

    jet, pf = read_nanoaod(root_file)
    jet, pf = preprocess(jet, pf)

    # exist_ok=True tolerates an existing directory but still raises if the
    # path exists and is not a directory.
    os.makedirs(parquet_dir, exist_ok=True)

    ak.to_parquet(jet, os.path.join(parquet_dir, 'jet.parquet'))
    ak.to_parquet(pf, os.path.join(parquet_dir, 'pf.parquet'))

In [19]:
# One numbered output directory per input file: <out_dir>/1, <out_dir>/2, ...
parquet_dirs = [f'{out_dir}/{index}' for index in range(1, num_files + 1)]
# Process the files sequentially, pairing each input with its output directory.
for root_file, parquet_dir in zip(root_files, parquet_dirs):
    create_dataset(root_file, parquet_dir)

/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/1
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/2
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/3
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/4
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/5


In [None]:
# Parallel alternative to the sequential loop above (currently disabled):
# with ProcessPoolExecutor(max_workers=None) as executor:
#     parquet_dirs = ['/'.join((path, str(index))) for index, path in enumerate(itertools.repeat(out_dir, num_files), start=1)]
#     results = executor.map(create_dataset, root_files, parquet_dirs)

## cheers