In [1]:
import os
import glob
import json
import awkward as ak
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import itertools

In [2]:
from coffea.nanoevents import NanoEventsFactory, PFNanoAODSchema
# Class-level switch on the schema, set once before any file is opened.
# NOTE(review): presumably this silences coffea's warnings about cross-reference
# targets that are absent from the input file -- confirm against the coffea docs.
PFNanoAODSchema.warn_missing_crossrefs = False
import warnings

In [3]:
# Root directory that holds the raw ROOT files and receives the preprocessed output.
# The JEC_DNN_DATA_DIR environment variable overrides the default location so the
# notebook can run on another machine without editing this cell.
data_dir = os.environ.get('JEC_DNN_DATA_DIR', '/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn')

In [4]:
# Input (raw ROOT) and output (preprocessed) locations below data_dir.
in_dir = os.path.join(data_dir, 'raw/dev')
out_dir = os.path.join(data_dir, 'preprocessed/dev')

# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# root_files[0] and the file -> output-index pairing reproducible between runs.
root_files = sorted(glob.glob(os.path.join(in_dir, '*.root')))
num_files = len(root_files)

In [5]:
# Create the output directory (and any missing parents); a no-op if it already exists.
os.makedirs(out_dir, exist_ok=True)

In [6]:
# Open one sample file up front; its Jet / PF-candidate field names are used
# to build the feature lists.
events = NanoEventsFactory.from_root(
    os.path.join(data_dir, 'raw/dev/1.root'),
    schemaclass=PFNanoAODSchema,
).events()

In [8]:
# Take every jet / PF-candidate field whose name does not contain 'IdxG', then
# append the derived columns that preprocess() adds to each collection.
all_jet_fields = [field for field in events.Jet.fields if 'IdxG' not in field]
all_jet_fields = all_jet_fields + ['target', 'log_pt']

all_pf_fields = [field for field in events.Jet.constituents.pf.fields if 'IdxG' not in field]
all_pf_fields = all_pf_fields + ['rel_eta', 'rel_phi', 'rel_pt']

# Feature names used inside the TFRecord: the field name with a collection prefix.
all_jet_keys = [f'jet_{field}' for field in all_jet_fields]
all_pf_keys = [f'pf_{field}' for field in all_pf_fields]

In [9]:
def read_nanoaod(path):
    """Read one NanoAOD file and return the selected jets with their PF constituents.

    Selection, in order:
      1. keep events with at least two non-missing ``matched_gen.pt`` entries;
      2. sort the jets of each event by ``matched_gen.pt`` (descending) and take
         the first two, stacked into one flat jet array;
      3. require ``matched_gen.pt > 30`` and ``|matched_gen.eta| < 5``;
      4. drop jets whose ``matched_gen.pt`` is missing;
      5. drop jets with any PF constituent equal to ``+inf`` in
         ``dz``, ``dzErr``, ``d0`` or ``d0Err``.

    Args:
        path: location of the ROOT file handed to ``NanoEventsFactory.from_root``.

    Returns:
        Tuple ``(jets, pf)`` where ``pf`` is ``jets.constituents.pf``.
    """
    # Only the factory call is wrapped: the duplicate-branch warning is filtered there.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='found duplicate branch')
        events = NanoEventsFactory.from_root(path, schemaclass=PFNanoAODSchema).events()

    # Events with at least two gen-matched jets.
    n_matched = ak.count(events.Jet.matched_gen.pt, axis=1)
    jets = events.Jet[n_matched >= 2]

    # Order jets inside each event by generator-level pt, highest first.
    gen_pt_order = ak.argsort(jets.matched_gen.pt, ascending=False, axis=1)
    ordered = jets[gen_pt_order]

    # Leading jets of all events followed by the sub-leading jets of all events.
    top_two = ak.concatenate((ordered[:, 0], ordered[:, 1]), axis=0)

    gen = top_two.matched_gen
    in_acceptance = (gen.pt > 30) & (abs(gen.eta) < 5)
    kept = top_two[in_acceptance]

    has_gen_match = ~ak.is_none(kept.matched_gen.pt)
    kept = kept[has_gen_match]

    # Note: only positive infinity is tested here, field by field.
    for name in ['dz', 'dzErr', 'd0', 'd0Err']:
        no_inf = ak.all(kept.constituents.pf[name] != np.inf, axis=1)
        kept = kept[no_inf]

    return kept, kept.constituents.pf

In [10]:
def preprocess(jet, pf):
    """Attach the regression target and the derived input features.

    Both arguments are modified through item assignment and then returned.

    Adds to ``jet``:
        target  -- ``matched_gen.pt / pt``
        log_pt  -- natural logarithm of ``pt``
    Adds to ``pf``:
        rel_eta -- ``(pf.eta - jet.eta)`` multiplied by the sign of ``jet.eta``
        rel_pt  -- ``pf.pt / jet.pt``
        rel_phi -- ``pf.phi - jet.phi`` wrapped into ``[-pi, pi)``

    Args:
        jet: jet records exposing ``pt``, ``eta``, ``phi`` and ``matched_gen.pt``.
        pf: constituent records exposing ``pt``, ``eta`` and ``phi``.

    Returns:
        The same ``(jet, pf)`` objects with the new fields attached.
    """
    two_pi = 2 * np.pi

    # Jet-level quantities.
    jet['target'] = jet.matched_gen.pt / jet.pt
    jet['log_pt'] = np.log(jet.pt)

    # Constituent quantities expressed relative to the parent jet.
    eta_offset = pf.eta - jet.eta
    pf['rel_eta'] = eta_offset * np.sign(jet.eta)
    pf['rel_pt'] = pf.pt / jet.pt

    # Shift by pi, wrap with the modulo, shift back: result lies in [-pi, pi).
    shifted_dphi = pf.phi - jet.phi + np.pi
    pf['rel_phi'] = shifted_dphi % two_pi - np.pi

    return jet, pf

In [None]:
def create_dataset(root_file, parquet_dir):
    """Convert one ROOT file into a pair of parquet files.

    Reads and selects jets with ``read_nanoaod``, adds the derived columns with
    ``preprocess`` and writes ``jet.parquet`` and ``pf.parquet`` into
    ``parquet_dir``.

    Args:
        root_file: path of the input ROOT file.
        parquet_dir: directory that receives the two parquet files; it is
            created (with parents) when missing.

    Returns:
        None.
    """
    # Progress marker; the trailing newline keeps lines from concurrent workers apart.
    print(parquet_dir + '\n')

    jet, pf = read_nanoaod(root_file)
    jet, pf = preprocess(jet, pf)

    # exist_ok avoids the try/except dance and is safe when workers race on creation.
    os.makedirs(parquet_dir, exist_ok=True)

    ak.to_parquet(jet, os.path.join(parquet_dir, 'jet.parquet'))
    ak.to_parquet(pf, os.path.join(parquet_dir, 'pf.parquet'))

In [None]:
# One output directory per input file, numbered from 1 in root_files order.
parquet_dirs = [os.path.join(out_dir, str(index)) for index in range(1, num_files + 1)]

with ProcessPoolExecutor(max_workers=None) as executor:
    # executor.map is lazy about results: an exception raised in a worker is only
    # re-raised when its result is fetched. Materialising the iterator here makes
    # a failed conversion fail this cell instead of passing silently.
    results = list(executor.map(create_dataset, root_files, parquet_dirs))

## Testing TFRecord

In [None]:
# Load the selected jets and their PF constituents from the first ROOT file
# so the TFRecord helpers can be tried out interactively.
jet, pf = read_nanoaod(root_files[0])

In [12]:
import tensorflow as tf

In [13]:
def float_feature(value):
    """Wrap float values in a tf.train.Feature holding a FloatList.

    Args:
        value: a single number or an iterable of numbers. FloatList needs an
            iterable, so a bare scalar is wrapped in a one-element list.

    Returns:
        tf.train.Feature with its ``float_list`` populated.
    """
    if np.isscalar(value):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

In [14]:
def int64_feature(value):
    """Wrap integer values in a tf.train.Feature holding an Int64List.

    Args:
        value: a single bool / enum / int / uint or an iterable of them.
            Int64List needs an iterable, so a bare scalar is wrapped in a
            one-element list.

    Returns:
        tf.train.Feature with its ``int64_list`` populated.
    """
    if np.isscalar(value):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

In [15]:
def serialize_example(jet, flat_pf, row_lengths):
    """
    Build a single serialized tf.train.Example from one file's jets.

    Every field listed in all_jet_fields / all_pf_fields becomes a float-list
    feature named by the matching entry of all_jet_keys / all_pf_keys, and
    row_lengths is stored as an int64-list feature under 'row_lengths'.

    Args:
        jet: jet records providing the fields in all_jet_fields.
        flat_pf: PF records flattened over jets, providing all_pf_fields.
        row_lengths: number of PF records per jet.

    Returns:
        The Example message serialized to a byte string.
    """
    float_features = {}

    # Jet-level columns, one feature per field.
    jet_columns = ak.unzip(jet[all_jet_fields])
    for key, column in zip(all_jet_keys, jet_columns):
        float_features[key] = float_feature(column)

    # Flattened constituent columns, one feature per field.
    pf_columns = ak.unzip(flat_pf[all_pf_fields])
    for key, column in zip(all_pf_keys, pf_columns):
        float_features[key] = float_feature(column)

    feature = {'row_lengths': int64_feature(row_lengths)}
    feature.update(float_features)

    features_msg = tf.train.Features(feature=feature)
    return tf.train.Example(features=features_msg).SerializeToString()

In [16]:
def create_record(root_file, record_file):
    """Convert one ROOT file into a TFRecord file containing a single Example.

    Args:
        root_file: path of the input ROOT file.
        record_file: path of the TFRecord file to write.

    Returns:
        None.
    """
    # Progress marker for the parallel run.
    print(record_file + '\n')

    jet, pf = preprocess(*read_nanoaod(root_file))

    # Store the jagged constituents as flat values plus per-jet counts.
    pf_counts = ak.num(pf, axis=1)
    pf_values = ak.flatten(pf, axis=1)
    serialized = serialize_example(jet, pf_values, pf_counts)

    with tf.io.TFRecordWriter(record_file) as writer:
        writer.write(serialized)

In [17]:
# Derive one '<stem>.tfrecords' output path in out_dir for every input ROOT file.
root_names = list(map(os.path.basename, root_files))
record_names = [f'{os.path.splitext(file)[0]}.tfrecords' for file in root_names]
record_files = [os.path.join(out_dir, name) for name in record_names]

In [18]:
# Convert every ROOT file to its TFRecord counterpart in parallel worker processes
# (default worker count). `results` is the iterator returned by map.
with ProcessPoolExecutor() as executor:
    results = executor.map(create_record, root_files, record_files)

/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/1.tfrecords
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/2.tfrecords
/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/4.tfrecords


/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/3.tfrecords


/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev/5.tfrecords



In [19]:
# Drain the executor.map iterator: this re-raises any exception from a worker
# and displays the return value of each create_record call.
list(results)

[None, None, None, None, None]

In [20]:
# Pick up whatever TFRecord files exist on disk; sorted because glob order is
# arbitrary and the dataset should read files in a reproducible order.
record_files = sorted(glob.glob(os.path.join(out_dir, '*.tfrecords')))

In [24]:
# record_files is already a flat list of paths, so pass it as-is rather than
# wrapping it in a second list.
raw_ds = tf.data.TFRecordDataset(filenames=record_files)

In [22]:
# Parsing spec: every jet / PF feature is read as a variable-length float list.
features = {key: tf.io.VarLenFeature(dtype=tf.float32) for key in all_jet_keys + all_pf_keys}
# The key must be spelled exactly as serialize_example writes it ('row_lengths');
# a mismatching name parses to an empty feature instead of the stored counts.
features['row_lengths'] = tf.io.VarLenFeature(dtype=tf.int64)

def parse_record(example_proto):
    """Decode one serialized tf.train.Example using the `features` spec.

    Args:
        example_proto: scalar string tensor holding a serialized Example.

    Returns:
        Dict mapping each feature name in `features` to a sparse tensor.
    """
    return tf.io.parse_single_example(example_proto, features=features)

In [25]:
# Decode each serialized record of raw_ds with parse_record.
parsed_ds = raw_ds.map(parse_record)

In [None]:
def parse_example(raw_example):
    """Decode a serialized Example with 'vals' / 'lens' features into a ragged tensor.

    NOTE(review): serialize_example in this notebook writes neither 'vals' nor
    'lens'; this helper appears to target a different record layout -- confirm
    before using it on the files produced here.

    Args:
        raw_example: scalar string tensor holding a serialized Example.

    Returns:
        tf.RaggedTensor whose rows split the 'vals' values according to 'lens'.
    """
    example = tf.io.parse_single_example(raw_example, {
        'vals': tf.io.VarLenFeature(dtype=tf.float32),
        'lens': tf.io.VarLenFeature(dtype=tf.float32)
    })
    # 'lens' is parsed as float32, but from_row_lengths only accepts integer
    # row lengths, so cast before building the ragged tensor.
    row_lengths = tf.cast(example['lens'].values, tf.int64)
    return tf.RaggedTensor.from_row_lengths(
        example['vals'].values, row_lengths=row_lengths
    )

In [62]:
# Pull the first parsed example out of the dataset for interactive inspection.
d = next(iter(parsed_ds))

In [63]:
# Show which feature names the parsed example contains.
d.keys()

dict_keys(['jet_DeepCSV_flightDistance2dSig', 'jet_DeepCSV_flightDistance2dVal', 'jet_DeepCSV_flightDistance3dSig', 'jet_DeepCSV_flightDistance3dVal', 'jet_DeepCSV_jetNSecondaryVertices', 'jet_DeepCSV_jetNSelectedTracks', 'jet_DeepCSV_jetNTracksEtaRel', 'jet_DeepCSV_trackDecayLenVal_0', 'jet_DeepCSV_trackDecayLenVal_1', 'jet_DeepCSV_trackDecayLenVal_2', 'jet_DeepCSV_trackDecayLenVal_3', 'jet_DeepCSV_trackDecayLenVal_4', 'jet_DeepCSV_trackDecayLenVal_5', 'jet_DeepCSV_trackDeltaR_0', 'jet_DeepCSV_trackDeltaR_1', 'jet_DeepCSV_trackDeltaR_2', 'jet_DeepCSV_trackDeltaR_3', 'jet_DeepCSV_trackDeltaR_4', 'jet_DeepCSV_trackDeltaR_5', 'jet_DeepCSV_trackEtaRel_0', 'jet_DeepCSV_trackEtaRel_1', 'jet_DeepCSV_trackEtaRel_2', 'jet_DeepCSV_trackEtaRel_3', 'jet_DeepCSV_trackJetDistVal_0', 'jet_DeepCSV_trackJetDistVal_1', 'jet_DeepCSV_trackJetDistVal_2', 'jet_DeepCSV_trackJetDistVal_3', 'jet_DeepCSV_trackJetDistVal_4', 'jet_DeepCSV_trackJetDistVal_5', 'jet_DeepCSV_trackJetPt', 'jet_DeepCSV_trackPtRatio_0'

In [64]:
# Each parsed feature is sparse (VarLenFeature); .values holds its stored entries.
d['jet_pt'].values

<tf.Tensor: shape=(200,), dtype=float32, numpy=
array([1332. , 1524. , 1607. , 1676. , 1541. , 1515. , 1585. , 1213. ,
       1581. , 1492. , 1514. , 1635. , 1760. , 1008. , 1468. , 1782. ,
       1426. , 1480. , 1306. , 1637. , 1437. , 1484. , 1387. , 1430. ,
       1317. , 1424. , 1391. , 1580. , 1410. , 1483. , 1487. , 1555. ,
       1632. , 1553. , 1454. , 1707. , 1459. , 1377. , 1309. , 1548. ,
       1489. , 1538. , 2064. , 1545. , 1392. , 1409. , 1433. , 1247. ,
       1553. , 1717. , 1640. , 1555. , 1622. , 1517. , 1455. , 1560. ,
       1486. , 1453. , 1489. , 1492. , 1651. , 1673. , 1493. , 1447. ,
       1574. , 1511. , 1370. , 1611. , 1051. , 1677. , 1651. , 1504. ,
       1495. , 1774. , 1568. , 1514. , 1731. , 1401. , 1581. , 1624. ,
       1774. , 1782. , 1611. , 1298. , 1337. , 1667. , 1630. , 1459. ,
       1776. , 1603. , 1470. , 1480. , 1653. , 1501. , 1475. , 1414. ,
       1433. , 1748. , 1561. , 2025. , 1681. , 1380. , 1546. , 1411. ,
       1604. , 1434. , 1492. 

In [29]:
# Subset of jet-level fields fed to the model, split by how they are treated.
jet_numerical = [
    'log_pt', 'eta', 'mass', 'phi', 'area',
    'qgl_axis2', 'qgl_ptD', 'qgl_mult',
]
jet_categorical = ['puId', 'partonFlavour']

# Subset of PF-constituent fields fed to the model.
pf_numerical = [
    'rel_pt', 'rel_eta', 'rel_phi',
    'd0', 'dz', 'd0Err', 'dzErr',
    'trkChi2', 'vtxChi2',
    'puppiWeight', 'puppiWeightNoLep',
]
pf_categorical = ['charge', 'lostInnerHits', 'pdgId', 'pvAssocQuality', 'trkQuality']

# Numerical fields first, categorical fields after.
jet_fields = [*jet_numerical, *jet_categorical]
pf_fields = [*pf_numerical, *pf_categorical]

# Matching TFRecord feature names (field name with a collection prefix).
jet_keys = [f'jet_{field}' for field in jet_fields]
pf_keys = [f'pf_{field}' for field in pf_fields]

In [55]:
def select_features(data):
    """Assemble the model inputs and the regression target from one parsed record.

    Args:
        data: dict of sparse tensors as returned by parse_record; must contain
            every key in jet_keys and pf_keys plus 'row_lengths' and 'jet_target'.

    Returns:
        ``((pf_data, jet_data), target)`` where
        jet_data is a dense tensor with one column per entry of jet_keys,
        pf_data is a ragged tensor whose rows group the constituent entries by
        jet, with one column per entry of pf_keys, and
        target holds the stored 'jet_target' values.
    """
    # Each feature's .values is 1-D; add a trailing axis and join column-wise.
    jet_data = tf.concat([tf.expand_dims(data[key].values, axis=1) for key in jet_keys], axis=1)

    # axis=1 here as well: the values are 1-D, so axis=2 would be out of range.
    pf_flat = tf.concat([tf.expand_dims(data[key].values, axis=1) for key in pf_keys], axis=1)

    # 'row_lengths' is parsed as a sparse tensor; from_row_lengths needs the
    # plain 1-D integer values to regroup the flat constituents per jet.
    pf_data = tf.RaggedTensor.from_row_lengths(pf_flat, row_lengths=data['row_lengths'].values)

    inputs = (pf_data, jet_data)
    # The target is stored under its prefixed name 'jet_target' (see all_jet_keys).
    return inputs, data['jet_target'].values

In [57]:
# ds = parsed_ds.map(select_features)