In [1]:
import tensorflow as tf
import awkward as ak
import numpy as np
import pickle
import glob
import os

In [2]:
# parquet_dir = os.path.join('/eos/cms/store/group/phys_jetmet/dholmber/jec-dnn/preprocessed/dev')

parquet_dir = '/ssd-home/hdaniel/lab/jec-dnn/data/test'
results_dir = '/ssd-home/hdaniel/lab/jec-dnn/results/notebook/deepset/1'

In [3]:
try:
    os.makedirs(results_dir)
except FileExistsError:
    pass

In [4]:
epochs = 10
batch_size = 256
loss = 'mean_absolute_error'
optimizer = 'adam'
lr = 1.e-3

activation = 'relu'
initializer = 'he_normal'
batch_norm = False
dropout = 0
units = [128, 128]

train_size = 0.6
test_size = 0.2
val_size = 0.2

In [5]:
jet_numerical = ['log_pt', 'eta', 'mass', 'phi', 'area', 'qgl_axis2', 'qgl_ptD', 'qgl_mult']
pf_numerical = ['rel_pt', 'rel_eta', 'rel_phi', 'd0', 'dz', 'd0Err', 'dzErr', 'trkChi2', 'vtxChi2', 'puppiWeight', 'puppiWeightNoLep']

In [6]:
categorical_map = {
    'jet': {
        'partonFlavour': [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 21]
    },
    'pf': {
        'charge': [-1, 0, 1],
        'lostInnerHits': [-1, 0, 1, 2],
        'pdgId': [-211, -13, -11, 1, 2, 11, 13, 22, 130, 211],
        'pvAssocQuality': [0, 1, 4, 5, 6, 7],
        'trkQuality': [0, 1, 5]
    }
}

In [7]:
jet_categorical = []
for key, categories in categorical_map['jet'].items():
    jet_categorical.extend([f'{key}_{i}' for i in range(len(categories))])
    
pf_categorical = []
for key, categories in categorical_map['pf'].items():
    pf_categorical.extend([f'{key}_{i}' for i in range(len(categories))])

In [8]:
jet_fields = jet_numerical + jet_categorical
pf_fields = pf_numerical + pf_categorical

jet_keys = [f'jet_{field}' for field in jet_fields]
pf_keys = [f'pf_{field}' for field in pf_fields]

num_jet = len(jet_keys)
num_pf = len(pf_keys)

In [9]:
dirs = glob.glob(os.path.join(parquet_dir, '*'))
num_dirs = len(dirs)
train_split = int(train_size * num_dirs)
test_split = int(test_size * num_dirs) + train_split

train_dirs = dirs[:train_split]
test_dirs = dirs[train_split:test_split]
val_dirs = dirs[test_split:]

In [10]:
train_dirs

['/ssd-home/hdaniel/lab/jec-dnn/data-selected/test/1',
 '/ssd-home/hdaniel/lab/jec-dnn/data-selected/test/5',
 '/ssd-home/hdaniel/lab/jec-dnn/data-selected/test/2']

In [11]:
def read_parquet(path):
    path = path.decode()

    jet = ak.from_parquet(os.path.join(path, 'jet.parquet'))
    pf = ak.from_parquet(os.path.join(path, 'pf.parquet'))
    
    row_lengths = ak.num(pf, axis=1)
    flat_pf = ak.flatten(pf, axis=1)
    
    data = [ak.to_numpy(row_lengths).astype(np.int32), ak.to_numpy(jet.target).astype(np.float32)]
    
    for field in jet_fields:
        data.append(ak.to_numpy(jet[field]).astype(np.float32))

    for field in pf_fields:
        data.append(ak.to_numpy(flat_pf[field]).astype(np.float32))
    
    return data

In [12]:
def read_parquet_wrapper(path):
    inp = [path]
    Tout = [tf.int32] + [tf.float32] + [tf.float32] * num_jet + [tf.float32] * num_pf
    
    cols = tf.numpy_function(read_parquet, inp=inp, Tout=Tout)
    
    keys = ['row_lengths'] + ['target'] + jet_keys + pf_keys
    data = {key: value for key, value in zip(keys, cols)}
    
    target = data.pop('target')
    target.set_shape((None,))
    
    row_lengths = data.pop('row_lengths')
    row_lengths.set_shape((None,))
    
    for key in jet_keys:
        # Shape from <unknown> to (None,)
        data[key].set_shape((None,))
        # Shape from (None,) to (None, 1)
        data[key] = tf.expand_dims(data[key], axis=1)
    
    for key in pf_keys:
        # Shape from <unknown> to (None,)
        data[key].set_shape((None,))
        # shape from (None,) to (None, None)
        data[key] = tf.RaggedTensor.from_row_lengths(data[key], row_lengths=row_lengths)
        # Shape from (None, None) to (None, None, 1)
        data[key] = tf.expand_dims(data[key], axis=2)
    
    jet_data = tf.concat([data[key] for key in jet_keys], axis=1)
    pf_data = tf.concat([data[key] for key in pf_keys], axis=2)
    inputs = (pf_data, jet_data)
    
    return inputs, target

In [13]:
def create_dataset(paths):
    ds = tf.data.Dataset.from_tensor_slices(paths)
    ds = ds.map(read_parquet_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.unbatch().batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [14]:
train_ds = create_dataset(train_dirs).shuffle(64)
val_ds = create_dataset(val_dirs)
test_ds = create_dataset(test_dirs)

In [15]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Activation, Dense, TimeDistributed, BatchNormalization, Dropout, Concatenate, Add, Layer

In [16]:
class Sum(Layer):
    def __init__(self, axis, **kwargs):
        super().__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return tf.math.reduce_sum(inputs, axis=self.axis)

In [17]:
def get_deepset():
    constituents = Input(shape=(None, num_pf), ragged=True, name='constituents')

    constituents_slice = Input(shape=(constituents.shape[-1],), name='constituents_slice')

    deepset_outputs_slice = mlp(constituents_slice, name='deepset')

    deepset_model_slice = Model(inputs=constituents_slice, outputs=deepset_outputs_slice, name='deepset_model_slice')

    deepset_outputs = TimeDistributed(deepset_model_slice, name='deepset_distributed')(constituents)

    constituents_head = Sum(axis=1, name='constituents_head')(deepset_outputs)

    globals = Input(shape=(num_jet,), name='globals')

    inputs_head = Concatenate(name='head')([constituents_head, globals])

    x = mlp(inputs_head, name='head')

    outputs = Dense(1, name='head_dense_output')(x)

    model = Model(inputs=[constituents, globals], outputs=outputs, name='dnn')

    model.summary()

    for layer in model.layers:
        if isinstance(layer, TimeDistributed):
            layer.layer.summary()

    return model


def mlp(x, name):
    for idx, n in enumerate(units, start=1):
        x = Dense(n, kernel_initializer=initializer, name=f'{name}_dense_{idx}')(x)
        if batch_norm:
            x = BatchNormalization(name=f'{name}_batch_normalization_{idx}')(x)
        x = Activation(activation, name=f'{name}_activation_{idx}')(x)
        if dropout:
            x = Dropout(dropout, name=f'{name}_dropout_{idx}')(x)
    return x

In [18]:
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    dnn = get_deepset()
    dnn.compile(optimizer=optimizer, loss=loss)
    dnn.optimizer.lr.assign(lr)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Model: "dnn"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
constituents (InputLayer)       [(None, None, 37)]   0                                            
__________________________________________________________________________________________________
deepset_distributed (TimeDistri (None, None, 128)    21376       constituents[0][0]               
__________________________________________________________________________________________________
constituents_head (Sum)         (None, 128)          0           deepset_distributed[0][0]        
__________________________________________________________________________________________________
globals (InputLayer)            [(None, 20)]         0                                       

In [19]:
# tf.keras.utils.plot_model(dnn, dpi=100, show_shapes=True, expand_nested=True)

In [22]:
def get_callbacks():
    # Reduce learning rate when nearing convergence
    reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=5, min_lr=1.0e-8,
        mode='auto', min_delta=1.0e-4, cooldown=0, verbose=1
    )
    # Stop early if the network stops improving
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', min_delta=1.0e-4, 
        patience=7, mode='auto', baseline=None, 
        restore_best_weights=True, verbose=1
    )

    return [reduce_lr_on_plateau, early_stopping]

In [23]:
fit = dnn.fit(train_ds, validation_data=val_ds, epochs=epochs, callbacks=get_callbacks())

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
predictions = dnn.predict(test_ds, use_multiprocessing=True, workers=os.cpu_count()-1)

# Save predictions and corresponding test files
with open(os.path.join(results_dir, 'predictions.pkl'), 'wb') as f:
    pickle.dump((predictions, test_dirs), f)

# Save training history
with open(os.path.join(results_dir, 'history.pkl'), 'wb') as f:
    pickle.dump(fit.history, f)