In [1]:
import os
import gc
import sys
import glob
import time
import math
import numpy as np
import uproot
import pandas
from functools import partial
from concurrent.futures import ThreadPoolExecutor

import tensorflow as tf
import keras
import keras.backend as K
from keras import regularizers
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, Conv2D, Dropout, AlphaDropout, Activation, BatchNormalization, Flatten, \
                                    Concatenate, PReLU, TimeDistributed, LSTM, Masking
from keras.callbacks import Callback, ModelCheckpoint, CSVLogger
from keras_tqdm import TQDMNotebookCallback

sys.path.insert(0, "../../python")
from common import *
from DataLoader import DataLoader, read_hdf_lock

Using TensorFlow backend.


In [2]:
class MaskedDense(Dense):
    def __init__(self, units, **kwargs):
        super(MaskedDense, self).__init__(units, **kwargs)
        
    def call(self, inputs, mask=None):
        base_out = super(MaskedDense, self).call(inputs)
        if mask is None:
            return base_out
        zeros = tf.zeros_like(base_out)
        return tf.where(mask, base_out, zeros)

In [3]:
class SafeModelCheckpoint(ModelCheckpoint):
    def __init__(self, filepath, **kwargs):
        super(SafeModelCheckpoint, self).__init__(filepath, **kwargs)
    
    def on_epoch_end(self, epoch, logs=None):
        read_hdf_lock.acquire()
        super(SafeModelCheckpoint, self).on_epoch_end(epoch, logs)
        read_hdf_lock.release()

In [4]:
class NetSetup:
    def __init__(self, activation, activation_shared_axes, dropout_rate, first_layer_size, last_layer_size, decay_factor,
                 kernel_regularizer, time_distributed):
        self.activation = activation
        self.activation_shared_axes = activation_shared_axes
        if activation == 'relu' or activation == 'PReLU' or activation == 'tanh':
            self.DropoutType = Dropout
            self.kernel_init = 'he_uniform'
            self.apply_batch_norm = True
        elif activation == 'selu':
            self.DropoutType = AlphaDropout
            self.kernel_init = 'lecun_normal'
            self.apply_batch_norm = False
        else:
            raise RuntimeError('Activation "{}" not supported.'.format(activation))
        self.dropout_rate = dropout_rate
        self.first_layer_size = first_layer_size
        self.last_layer_size = last_layer_size
        self.decay_factor = decay_factor
        self.kernel_regularizer = kernel_regularizer
        self.time_distributed = time_distributed
    
    def RecalcLayerSizes(self, n_input_features, width_factor, compression_factor):
        self.first_layer_size = int(math.ceil(n_input_features * (1 + self.dropout_rate) * width_factor))
        self.last_layer_size = int(math.ceil(n_input_features * (1 + self.dropout_rate) * compression_factor))
    
def add_block_ending(net_setup, name_format, layer):
    if net_setup.apply_batch_norm:
        norm_layer = BatchNormalization(name=name_format.format('norm'))
        if net_setup.time_distributed:
            norm_layer = TimeDistributed(norm_layer, name=name_format.format('norm'))
        norm_layer = norm_layer(layer)
    else:
        norm_layer = layer
    if net_setup.activation == 'PReLU':
        activation_layer = PReLU(shared_axes=net_setup.activation_shared_axes,
                                 name=name_format.format('activation'))(norm_layer)
    else:
        activation_layer = Activation(net_setup.activation, name=name_format.format('activation'))(norm_layer)
    if net_setup.dropout_rate > 0:
        return net_setup.DropoutType(net_setup.dropout_rate, name=name_format.format('dropout'))(activation_layer)
    return activation_layer

def dense_block(prev_layer, kernel_size, net_setup, block_name, n):
    DenseType = MaskedDense if net_setup.time_distributed else Dense
    dense = DenseType(kernel_size, name="{}_dense_{}".format(block_name, n),
                      kernel_initializer=net_setup.kernel_init,
                      kernel_regularizer=net_setup.kernel_regularizer)
    if net_setup.time_distributed:
        dense = TimeDistributed(dense, name="{}_dense_{}".format(block_name, n))
    dense = dense(prev_layer)
    return add_block_ending(net_setup, '{}_{{}}_{}'.format(block_name, n), dense)

def reduce_n_features_1d(input_layer, net_setup, block_name):
    prev_layer = input_layer
    current_size = net_setup.first_layer_size
    n = 1
    while True:
        prev_layer = dense_block(prev_layer, current_size, net_setup, block_name, n)
        if current_size == net_setup.last_layer_size: break
        current_size = max(net_setup.last_layer_size, int(current_size / net_setup.decay_factor))
        n += 1
    return prev_layer

def dense_block_sequence(input_layer, net_setup, n_layers, block_name):
    prev_layer = input_layer
    current_size = net_setup.first_layer_size
    for n in range(n_layers):
        prev_layer = dense_block(prev_layer, current_size, net_setup, block_name, n+1)
    return prev_layer


def create_model(net_config):
    tau_net_setup = NetSetup('PReLU', None, 0.2, 128, 128, 1.4, None, False)
    comp_net_setup = NetSetup('PReLU', [1], 0.2, 1024, 64, 1.4, None, True)
    #dense_net_setup = NetSetup('relu', 0, 512, 32, 1.4, keras.regularizers.l1(1e-5))
    dense_net_setup = NetSetup('PReLU', None, 0.2, 512, 64, 1.4, None, False)
            
    input_layers = []
    high_level_features = []

    if len(net_config.tau_branches) > 0:
        input_layer_tau = Input(name="input_tau", shape=(len(net_config.tau_branches),))
        input_layers.append(input_layer_tau)
        tau_net_setup.RecalcLayerSizes(len(net_config.tau_branches), 2, 2)
        processed_tau = dense_block_sequence(input_layer_tau, tau_net_setup, 4, 'tau')
        high_level_features.append(processed_tau)
    
    for loc in net_config.cell_locations:
        reduced_inputs = []
        for comp_id in range(len(net_config.comp_names)):
            comp_name = net_config.comp_names[comp_id]
            n_comp_features = len(input_cell_external_branches) + len(net_config.comp_branches[comp_id])
            input_layer_comp = Input(name="input_{}_{}".format(loc, comp_name),
                                     shape=(n_cells_eta[loc] * n_cells_phi[loc], n_comp_features))
            input_layers.append(input_layer_comp)
            comp_net_setup.RecalcLayerSizes(n_comp_features, 2, 2)
            input_layer_comp_masked = Masking(name="input_{}_{}_masking".format(loc, comp_name))(input_layer_comp)
            reduced_comp = dense_block_sequence(input_layer_comp_masked, comp_net_setup, 4, "{}_{}".format(loc, comp_name))
            reduced_inputs.append(reduced_comp)
            
        cell_output_size = 32
        if len(net_config.comp_names) > 1:
            cell_input = Concatenate(name="{}_cell_concat_0".format(loc), axis=2)(reduced_inputs)
        else:
            cell_input = reduced_inputs[0]
        
        cell_lstm_1 = LSTM(units=cell_output_size, return_sequences=True, name='{}_lstm_1'.format(loc))(cell_input)
        cell_lstm_1_drop = Dropout(comp_net_setup.dropout_rate, name='{}_lstm_1_dropout'.format(loc))(cell_lstm_1)
        
        cell_input_1 = Concatenate(name="{}_cell_concat_1".format(loc), axis=2)([cell_input, cell_lstm_1_drop])
        cell_lstm_2 = LSTM(units=cell_output_size, return_sequences=True, name='{}_lstm_2'.format(loc))(cell_input_1)
        cell_lstm_2_drop = Dropout(comp_net_setup.dropout_rate, name='{}_lstm_2_dropout'.format(loc))(cell_lstm_2)

        cell_input_2 = Concatenate(name="{}_cell_concat_2".format(loc), axis=2)([cell_input, cell_lstm_2_drop])
        cell_lstm_3 = LSTM(units=cell_output_size, return_sequences=False, name='{}_lstm_3'.format(loc))(cell_input_2)
        cell_lstm_3_drop = Dropout(comp_net_setup.dropout_rate, name='{}_lstm_3_dropout'.format(loc))(cell_lstm_3)

        high_level_features.append(cell_lstm_3_drop)
        
    if len(high_level_features) > 1:
        features_concat = Concatenate(name="features_concat")(high_level_features)
    else:
        features_concat = high_level_features[0]
    if net_config.final:
        dense_net_setup.first_layer_size = features_concat.shape[1]
        final_dense = dense_block_sequence(features_concat, dense_net_setup, 4, 'final')
        output_layer = Dense(n_outputs, name="final_dense_last",
                     kernel_initializer=dense_net_setup.kernel_init)(final_dense)

    else:
        final_dense = dense_block(features_concat, 1024, dense_net_setup,
                                  'tmp_{}'.format(net_config.name), 1)
        output_layer = Dense(n_outputs, name="tmp_{}_dense_last".format(net_config.name),
                             kernel_initializer=dense_net_setup.kernel_init)(final_dense)
    softmax_output = Activation("softmax", name="main_output")(output_layer)
    
    model = Model(input_layers, softmax_output, name="DeepTau2017v2")
    return model

In [5]:
def compile_model(model, learning_rate):
    opt = keras.optimizers.Adam(lr=learning_rate)
    #opt = keras.optimizers.Nadam(lr=learning_rate)
    #model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
    metrics = [ "accuracy", TauLosses.tau_crossentropy, TauLosses.tau_crossentropy_v2, TauLosses.Le, TauLosses.Lmu,
               TauLosses.Ljet, TauLosses.He, TauLosses.Hmu, TauLosses.Htau, TauLosses.Hjet ]
    model.compile(loss=TauLosses.tau_crossentropy_v2, optimizer=opt, metrics=metrics, weighted_metrics=metrics)

In [6]:
TauLosses.SetSFs(1, 1, 3, 1)
model_name = "DeepTau2017v2p4_preTau"
model = create_model(netConf_preTau)
compile_model(model, 1e-3)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_tau (InputLayer)       (None, 47)                0         
_________________________________________________________________
tau_dense_1 (Dense)          (None, 113)               5424      
_________________________________________________________________
tau_norm_1 (BatchNormalizati (None, 113)               452       
_________________________________________________________________
tau_activation_1 (PReLU)     (None, 113)               113       
_________________________________________________________________
tau_dropout_1 (Dropout)      (None, 113)               0         
_________________________________________________________________
tau_dense_2 (Dense)  

In [7]:
def GetNumberOfTrainableParams(model, exclude_prefix = 'tmp'):
    count = 0
    for w in model.trainable_weights:
        if not w.name.startswith(exclude_prefix):
            count += K.count_params(w)
    return count
GetNumberOfTrainableParams(model)

45426

In [8]:
def close_file(f_name):
    file_objs = [ obj for obj in gc.get_objects() if ("TextIOWrapper" in str(type(obj))) and (obj.name == f_name)]
    for obj in file_objs:
        obj.close()

In [9]:
class TimeCheckpoint(Callback):
    def __init__(self, time_interval, file_name_prefix):
        self.time_interval = time_interval
        self.file_name_prefix = file_name_prefix
        self.initial_time = time.time()
        self.last_check_time = self.initial_time
    
    def on_batch_end(self, batch, logs=None):
        if batch % 100 != 0: return
        current_time = time.time()
        delta_t = current_time - self.last_check_time
        if delta_t >= self.time_interval:
            abs_delta_t_h = (current_time - self.initial_time) / 60. / 60.
            read_hdf_lock.acquire()
            self.model.save('{}_b{}_{:.1f}h.h5'.format(self.file_name_prefix, batch, abs_delta_t_h))
            read_hdf_lock.release()
            self.last_check_time = current_time

In [10]:
def run_training(train_suffix, model_name, data_loader, epoch, n_epochs):

    train_name = '%s_%s' % (model_name, train_suffix)
    
    cb_acc = []
    for acc_name in ["acc", "weighted_acc"]:
        cb_acc.append(SafeModelCheckpoint("%s_acc.hdf5" % train_name, monitor="val_%s" % acc_name, save_best_only=True,
                                          save_weights_only=False, mode="max", verbose=1))
    
    losses_names = [ "loss" ]
    for w_suffix in [ "", "weighted_" ]:
        for l_name in [ "tau_crossentropy", "tau_crossentropy_v2", "Le", "Lmu", "Ljet", "He", "Hmu", "Htau", "Hjet" ]:
            losses_names.append(w_suffix + l_name)
    cb_losses = []
    for loss_name in losses_names:
        cb_losses.append(ModelCheckpoint("%s_%s.hdf5" % (train_name, loss_name), monitor="val_%s" % loss_name,
                                         save_best_only=True, save_weights_only=False, mode="min", verbose=1))

    log_name = "%s.log" % train_name
    if os.path.isfile(log_name):
        close_file(log_name)
        os.remove(log_name)
    csv_log = CSVLogger(log_name, append=True)

    time_checkpoint = TimeCheckpoint(4*60*60, '{}_historic'.format(train_name))
    pbar = TQDMNotebookCallback(leave_outer=True, show_outer=True, leave_inner = True)
    callbacks = [pbar, time_checkpoint, csv_log, *cb_acc, *cb_losses]
    fit_hist = model.fit_generator(data_loader.generator(True), validation_data=data_loader.generator(False),
                                   steps_per_epoch=data_loader.steps_per_epoch, validation_steps=data_loader.validation_steps,
                                   callbacks=callbacks, epochs=n_epochs, initial_epoch=epoch, verbose=0)

    model.save("%s_final.hdf5" % train_name)
    return fit_hist

In [11]:
loader = DataLoader('N:/tau-ml/tuples-v2-t3/training/part_*.h5', netConf_preTau, 10000, 1000000, validation_size=6100000,
                    max_queue_size=200, n_passes=-1, return_grid=False)

print(loader.total_size, loader.data_size, loader.validation_size)

72491602 66391602 6100000


In [12]:
fit_hist = run_training('step{}'.format(1), model_name, loader, 0, 2)

Instructions for updating:
Use tf.cast instead.


HBox(children=(IntProgress(value=0, description='Training', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=6640, style=ProgressStyle(description_width='in…


Epoch 00001: val_acc improved from -inf to 0.87661, saving model to DeepTau2017v2p4_preTau_step1_acc.hdf5

Epoch 00001: val_weighted_acc improved from -inf to 0.86736, saving model to DeepTau2017v2p4_preTau_step1_acc.hdf5

Epoch 00001: val_loss improved from inf to 0.37294, saving model to DeepTau2017v2p4_preTau_step1_loss.hdf5

Epoch 00001: val_tau_crossentropy improved from inf to 0.22055, saving model to DeepTau2017v2p4_preTau_step1_tau_crossentropy.hdf5

Epoch 00001: val_tau_crossentropy_v2 improved from inf to 0.34176, saving model to DeepTau2017v2p4_preTau_step1_tau_crossentropy_v2.hdf5

Epoch 00001: val_Le improved from inf to 0.08699, saving model to DeepTau2017v2p4_preTau_step1_Le.hdf5

Epoch 00001: val_Lmu improved from inf to 0.08879, saving model to DeepTau2017v2p4_preTau_step1_Lmu.hdf5

Epoch 00001: val_Ljet improved from inf to 0.15505, saving model to DeepTau2017v2p4_preTau_step1_Ljet.hdf5

Epoch 00001: val_He improved from inf to 0.09288, saving model to DeepTau2017v2p



HBox(children=(IntProgress(value=0, description='Epoch 1', max=6640, style=ProgressStyle(description_width='in…


Epoch 00002: val_acc improved from 0.87661 to 0.87697, saving model to DeepTau2017v2p4_preTau_step1_acc.hdf5

Epoch 00002: val_weighted_acc improved from 0.86736 to 0.87012, saving model to DeepTau2017v2p4_preTau_step1_acc.hdf5

Epoch 00002: val_loss improved from 0.37294 to 0.32965, saving model to DeepTau2017v2p4_preTau_step1_loss.hdf5

Epoch 00002: val_tau_crossentropy improved from 0.22055 to 0.21342, saving model to DeepTau2017v2p4_preTau_step1_tau_crossentropy.hdf5

Epoch 00002: val_tau_crossentropy_v2 improved from 0.34176 to 0.30920, saving model to DeepTau2017v2p4_preTau_step1_tau_crossentropy_v2.hdf5

Epoch 00002: val_Le did not improve from 0.08699

Epoch 00002: val_Lmu improved from 0.08879 to 0.08146, saving model to DeepTau2017v2p4_preTau_step1_Lmu.hdf5

Epoch 00002: val_Ljet improved from 0.15505 to 0.14485, saving model to DeepTau2017v2p4_preTau_step1_Ljet.hdf5

Epoch 00002: val_He did not improve from 0.09288

Epoch 00002: val_Hmu did not improve from 0.09193

Epoch 0