In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf 
import keras
# Report the versions of the core libraries, one per line, for reproducibility.
for module in (tf, keras, pd, np):
    print(module.__version__)

2.6.0
2.6.0
1.3.4
1.19.5


In [2]:
# custom imports 
import glob
from nn_globals import *
from nn_plotting import gaus, fit_gaus, corr_plot

In [3]:
def _handle_nan_in_x(x):
    x[np.isnan(x)] = 0.0
    x[x==-999.0] = 0.0
    return x

def _zero_out_x(x):
    x = 0.0
    return x
    
def _fixME1Ring(x):
    for i in range(len(x)):
        if (x[i,0] != 0.0): x[i,18] = x[i,18] + 1
    return x   

def muon_data(filename, reg_pt_scale=1.0, reg_dxy_scale=1.0, correct_for_eta=False):
    """Load muon inputs and regression targets from an ``.npz`` archive.

    The archive must contain the arrays ``'variables'`` and ``'parameters'``.
    Both are truncated to the first ``nentries`` rows, NaN / -999.0 entries
    are zeroed in place, and only rows whose ``variables[:, 23]`` equals
    11, 13, 14 or 15 are kept.

    Relies on the names ``logger`` and ``nentries`` being available at module
    level (presumably provided by ``from nn_globals import *`` -- verify).

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` file to read.
    reg_pt_scale : float
        Factor multiplied into ``parameters[:, 0]`` to build ``y``.
    reg_dxy_scale : float
        Accepted for interface compatibility; currently unused.
        NOTE(review): ``dxy`` is returned unscaled -- confirm this is intended.
    correct_for_eta : bool
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    x : ndarray
        Columns 0..22 of the selected variables.
    y : ndarray
        ``reg_pt_scale * parameters[:, 0]`` for the selected rows.
    dxy : ndarray
        ``vy * cos(phi) - vx * sin(phi)`` with ``phi``, ``vx``, ``vy`` taken
        from parameter columns 1, 3 and 4.

    Raises
    ------
    Exception
        Whatever ``np.load`` / key lookup raised; the failure is logged and
        then re-raised instead of being swallowed.
    """
    logger.info('Loading muon data from {0} ...'.format(filename))
    try:
        # The context manager closes the underlying archive once both arrays
        # have been read into memory.
        with np.load(filename) as loaded:
            the_variables = loaded['variables'][:nentries]
            the_parameters = loaded['parameters'][:nentries]
    except Exception:
        logger.error('Failed to load data from file: {0}'.format(filename))
        # Re-raise: continuing would only fail later with an unrelated
        # "variable referenced before assignment" error.
        raise
    logger.info('Loaded the variables with shape {0}'.format(the_variables.shape))
    logger.info('Loaded the parameters with shape {0}'.format(the_parameters.shape))

    assert the_variables.shape[0] == the_parameters.shape[0]
    _handle_nan_in_x(the_variables)
    #_fixME1Ring(the_variables)
    _handle_nan_in_x(the_parameters)

    # Keep only the rows whose column 23 holds one of the accepted values.
    mask = np.isin(the_variables[:, 23], (11, 13, 14, 15))
    the_variables = the_variables[mask]
    the_parameters = the_parameters[mask]
    assert the_variables.shape[0] == the_parameters.shape[0]

    x = the_variables[:, 0:23]
    y = reg_pt_scale * the_parameters[:, 0]

    phi = the_parameters[:, 1]
    vx = the_parameters[:, 3]
    vy = the_parameters[:, 4]
    dxy = vy * np.cos(phi) - vx * np.sin(phi)
    logger.info('Loaded the encoded variables with shape {0}'.format(x.shape))
    logger.info('Loaded the encoded parameters with shape {0}'.format(y.shape))

    return x, y, dxy

def muon_data_split(filename, reg_pt_scale=1.0, reg_dxy_scale=1.0, test_size=0.5, correct_for_eta=False, random_state=None):
    """Load the muon data and split it into training and testing subsets.

    Parameters
    ----------
    filename, reg_pt_scale, reg_dxy_scale, correct_for_eta
        Forwarded unchanged to ``muon_data``.
    test_size : float
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None
        Forwarded to ``train_test_split``.  ``None`` (the default) keeps the
        previous behaviour of an unseeded shuffle; pass an integer to make
        the split reproducible.

    Returns
    -------
    x_train, x_test, y_train, y_test, dxy_train, dxy_test

    Notes
    -----
    After splitting, two sanity checks warn when the final, partial training
    batch would hold fewer than 100 events, assuming a validation split of
    0.1 and a batch size of 128 (both hard-coded here, independent of what
    the training call actually uses).  A remainder of 0 means there is no
    partial batch at all, so no warning is issued in that case.
    """
    x, y, dxy = muon_data(filename, reg_pt_scale=reg_pt_scale, reg_dxy_scale=reg_dxy_scale, correct_for_eta=correct_for_eta)

    # Split dataset in training and testing
    x_train, x_test, y_train, y_test, dxy_train, dxy_test = train_test_split(
        x, y, dxy, test_size=test_size, random_state=random_state)
    logger.info('Loaded # of training and testing events: {0}'.format((x_train.shape[0], x_test.shape[0])))

    # Check for cases where the number of events in the last batch could be too few
    validation_split = 0.1
    batch_size = 128
    min_last_batch = 100

    train_num_samples = int(x_train.shape[0] * (1.0-validation_split))
    last_batch = train_num_samples % batch_size
    if 0 < last_batch < min_last_batch:
        logger.warning('The last batch for training could be too few! ({0}%{1})={2}. Please change test_size.'.format(train_num_samples, batch_size, last_batch))
        logger.warning('Try this formula: int(int({0}*{1})*{2}) % 128'.format(x.shape[0], 1.0-test_size, 1.0-validation_split))

    # Same check for a training sample of twice the size ("after mixing").
    train_num_samples = int(x_train.shape[0] * 2 * (1.0-validation_split))
    last_batch = train_num_samples % batch_size
    if 0 < last_batch < min_last_batch:
        logger.warning('The last batch for training after mixing could be too few! ({0}%{1})={2}. Please change test_size.'.format(train_num_samples, batch_size, last_batch))
        logger.warning('Try this formula: int(int({0}*{1})*2*{2}) % 128'.format(x.shape[0], 1.0-test_size, 1.0-validation_split))
    return x_train, x_test, y_train, y_test, dxy_train, dxy_test

In [4]:
# Import muon data.
# The x_* arrays hold the input variables, the y_* arrays the q/pT targets and
# the dxy_* arrays the dxy values returned by muon_data_split.
(x_train_displ, x_test_displ,
 y_train_displ, y_test_displ,
 dxy_train_displ, dxy_test_displ) = muon_data_split(
    infile_muon_displ,
    reg_pt_scale=reg_pt_scale,
    reg_dxy_scale=reg_dxy_scale,
    test_size=0.315,
)

# Only the magnitude of the first target is used from here on.
y_train_displ, y_test_displ = np.abs(y_train_displ), np.abs(y_test_displ)


[INFO    ] Loading muon data from ./data/NN_input_params_FlatXYZ.npz ...
[INFO    ] Loaded the variables with shape (19300000, 25)
[INFO    ] Loaded the parameters with shape (19300000, 6)
[INFO    ] Loaded the encoded variables with shape (3284620, 23)
[INFO    ] Loaded the encoded parameters with shape (3284620,)
[INFO    ] Loaded # of training and testing events: (2249964, 1034656)


In [5]:
from keras.models import Model
from keras.regularizers import L1L2
from tensorflow.keras.optimizers import Adam
from keras.layers import Input, BatchNormalization, Dense, Activation
from nn_evaluate import huber_loss
from nn_training import train_model, lr_schedule
from keras.callbacks import LearningRateScheduler, TerminateOnNaN, ModelCheckpoint

In [6]:
def create_model(nvariables, lr=0.001, clipnorm=10., initializer = "glorot_uniform",
                nodes1=64, nodes2=32, nodes3=16, outnodes=2,
                l1_reg = 0.0, l2_reg = 0.0):
    """Build and compile the fully connected "baseline-model".

    Architecture: input batch-normalisation, then up to three hidden stacks
    of bias-free Dense -> BatchNormalization -> tanh, then a Dense output
    layer with a linear activation.

    Parameters
    ----------
    nvariables : int
        Number of input features.
    lr : float
        Learning rate handed to the Adam optimizer.
    clipnorm : float
        Gradient-norm clipping value handed to the Adam optimizer.
    initializer : str
        Kernel initializer used by every Dense layer.
    nodes1, nodes2, nodes3 : int
        Widths of the hidden layers.  A falsy ``nodes2`` drops hidden stacks
        2 and 3; a falsy ``nodes3`` drops only stack 3.
    outnodes : int
        Width of the output layer.
    l1_reg, l2_reg : float
        L1 / L2 kernel-regularisation strengths for the hidden Dense layers.

    Returns
    -------
    The compiled Keras ``Model``; its summary is printed as a side effect.
    """
    regularizer = L1L2(l1=l1_reg, l2=l2_reg)
    bn_momentum = 0.9
    eps = 1e-4

    def hidden_stack(tensor, units, index):
        # One hidden stack; the layer names match the original hand-written
        # layers so existing checkpoints still map onto them.
        tensor = Dense(units,
                       kernel_initializer=initializer,
                       use_bias=False,
                       kernel_regularizer=regularizer,
                       name="hidden-dense-{0}".format(index))(tensor)
        tensor = BatchNormalization(epsilon=eps, momentum=bn_momentum,
                                    name="bn-{0}".format(index))(tensor)
        return Activation(activation="tanh", name="act_{0}".format(index))(tensor)

    x_in = Input((nvariables,))
    x = BatchNormalization(epsilon=eps, momentum=bn_momentum, name="bn-input")(x_in)

    x = hidden_stack(x, nodes1, 1)
    if nodes2:
        x = hidden_stack(x, nodes2, 2)
        # The third stack is only reachable when the second one exists.
        if nodes3:
            x = hidden_stack(x, nodes3, 3)

    x = Dense(outnodes, kernel_initializer=initializer, name="dense-output")(x)
    x = Activation("linear")(x)

    model = Model(inputs=x_in, outputs=x, name="baseline-model")

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases reject.
    adam = Adam(learning_rate=lr, clipnorm=clipnorm)
    # NOTE(review): 'acc' is kept so the training-history keys stay the same;
    # it is of doubtful use for a linear-output model -- confirm it is needed.
    model.compile(optimizer=adam,
                  loss=huber_loss,
                  metrics=['acc', 'mse', 'mae'])
    model.summary()

    return model
    

In [7]:
assert keras.backend.backend() == 'tensorflow'

# Training hyper-parameters.
normal_epochs = 300
normal_batch_size = 2000
l1_reg = 0.0
l2_reg = 0.0
learning_rate = 0.0063
gradient_clip_norm = 100.

# Callbacks: learning-rate schedule, abort on NaN loss, and a checkpoint
# written every 5 epochs into checkpoints/.
lr_decay = LearningRateScheduler(lr_schedule, verbose=1)
terminate_on_nan = TerminateOnNaN()
model_training_checkpoint = ModelCheckpoint(
    filepath="checkpoints/model_ckpt_epoch_{epoch:02d}.hdf5",
    monitor="val_loss",
    period=5,
    verbose=1,
)

model = create_model(
    nvariables=nvariables,
    lr=learning_rate,
    clipnorm=gradient_clip_norm,
    nodes1=20,
    nodes2=15,
    nodes3=10,
    outnodes=2,
    l1_reg=l1_reg,
    l2_reg=l2_reg,
)

logger.info('Training model with l1_reg: {0} l2_reg: {1}'.format(l1_reg, l2_reg))

# The network has two outputs, so the two target arrays are stacked
# column-wise into a single target matrix.
model, history = train_model(
    model,
    x_train_displ,
    np.column_stack((y_train_displ, dxy_train_displ)),
    save_model=False,
    epochs=normal_epochs,
    batch_size=normal_batch_size,
    callbacks=[lr_decay, terminate_on_nan, model_training_checkpoint],
    validation_split=0.1,
    verbose=True,
)

# Summarise the run: number of epochs actually recorded and the final losses.
loss_history = history.history['loss']
val_loss_history = history.history['val_loss']
metrics = [len(loss_history), loss_history[-1], val_loss_history[-1]]
logger.info('Epoch {0}/{0} - loss: {1} - val_loss: {2}'.format(*metrics))

2021-12-14 11:04:01.419570: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-14 11:04:01.420367: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
[INFO    ] Training model with l1_reg: 0.0 l2_reg: 0.0
[INFO    ] Begin training ...


Metal device set to: Apple M1
Model: "baseline-model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 23)]              0         
_________________________________________________________________
bn-input (BatchNormalization (None, 23)                92        
_________________________________________________________________
hidden-dense-1 (Dense)       (None, 20)                460       
_________________________________________________________________
bn-1 (BatchNormalization)    (None, 20)                80        
_________________________________________________________________
act_1 (Activation)           (None, 20)                0         
_________________________________________________________________
hidden-dense-2 (Dense)       (None, 15)                300       
_________________________________________________________________
bn-2 (BatchNormalizati

2021-12-14 11:04:01.640755: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-14 11:04:01.643887: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/300

Epoch 00001: LearningRateScheduler setting learning rate to 0.006300000008195639.


2021-12-14 11:04:01.994338: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-12-14 11:04:16.645959: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/300

Epoch 00002: LearningRateScheduler setting learning rate to 0.006300000008195639.
Epoch 3/300

Epoch 00003: LearningRateScheduler setting learning rate to 0.006300000008195639.
Epoch 4/300

Epoch 00004: LearningRateScheduler setting learning rate to 0.006300000008195639.
Epoch 5/300

Epoch 00005: LearningRateScheduler setting learning rate to 0.006300000008195639.

Epoch 00005: saving model to checkpoints/model_ckpt_epoch_05.hdf5
Epoch 6/300

Epoch 00006: LearningRateScheduler setting learning rate to 0.006300000008195639.
Epoch 7/300

Epoch 00007: LearningRateScheduler setting learning rate to 0.006300000008195639.
Epoch 8/300

Epoch 00008: LearningRateScheduler setting learning rate to 0.006300000008195639.
Epoch 9/300

Epoch 00009: LearningRateScheduler setting learning rate to 0.006300000008195639.
Epoch 10/300

Epoch 00010: LearningRateScheduler setting learning rate to 0.006300000008195639.

Epoch 00010: saving model to checkpoints/model_ckpt_epoch_10.hdf5
Epoch 11/3


Epoch 00055: saving model to checkpoints/model_ckpt_epoch_55.hdf5
Epoch 56/300

Epoch 00056: LearningRateScheduler setting learning rate to 0.0037200867664068937.
Epoch 57/300

Epoch 00057: LearningRateScheduler setting learning rate to 0.0037200867664068937.
Epoch 58/300

Epoch 00058: LearningRateScheduler setting learning rate to 0.0037200867664068937.
Epoch 59/300

Epoch 00059: LearningRateScheduler setting learning rate to 0.0037200867664068937.
Epoch 60/300

Epoch 00060: LearningRateScheduler setting learning rate to 0.0037200867664068937.

Epoch 00060: saving model to checkpoints/model_ckpt_epoch_60.hdf5
Epoch 61/300

Epoch 00061: LearningRateScheduler setting learning rate to 0.0033480780897662044.
Epoch 62/300

Epoch 00062: LearningRateScheduler setting learning rate to 0.0033480781130492687.
Epoch 63/300

Epoch 00063: LearningRateScheduler setting learning rate to 0.0033480781130492687.
Epoch 64/300

Epoch 00064: LearningRateScheduler setting learning rate to 0.00334807811304

Epoch 83/300

Epoch 00083: LearningRateScheduler setting learning rate to 0.002711943117901683.
Epoch 84/300

Epoch 00084: LearningRateScheduler setting learning rate to 0.002711943117901683.
Epoch 85/300

Epoch 00085: LearningRateScheduler setting learning rate to 0.002711943117901683.

Epoch 00085: saving model to checkpoints/model_ckpt_epoch_85.hdf5
Epoch 86/300

Epoch 00086: LearningRateScheduler setting learning rate to 0.002711943117901683.
Epoch 87/300

Epoch 00087: LearningRateScheduler setting learning rate to 0.002711943117901683.
Epoch 88/300

Epoch 00088: LearningRateScheduler setting learning rate to 0.002711943117901683.
Epoch 89/300

Epoch 00089: LearningRateScheduler setting learning rate to 0.002711943117901683.
Epoch 90/300

Epoch 00090: LearningRateScheduler setting learning rate to 0.002711943117901683.

Epoch 00090: saving model to checkpoints/model_ckpt_epoch_90.hdf5
Epoch 91/300

Epoch 00091: LearningRateScheduler setting learning rate to 0.0024407488061115147.
E

Epoch 137/300

Epoch 00137: LearningRateScheduler setting learning rate to 0.0016013751737773418.
Epoch 138/300

Epoch 00138: LearningRateScheduler setting learning rate to 0.0016013751737773418.
Epoch 139/300

Epoch 00139: LearningRateScheduler setting learning rate to 0.0016013751737773418.
Epoch 140/300

Epoch 00140: LearningRateScheduler setting learning rate to 0.0016013751737773418.

Epoch 00140: saving model to checkpoints/model_ckpt_epoch_140.hdf5
Epoch 141/300

Epoch 00141: LearningRateScheduler setting learning rate to 0.0014412376563996078.
Epoch 142/300

Epoch 00142: LearningRateScheduler setting learning rate to 0.0014412376331165433.
Epoch 143/300

Epoch 00143: LearningRateScheduler setting learning rate to 0.0014412376331165433.
Epoch 144/300

Epoch 00144: LearningRateScheduler setting learning rate to 0.0014412376331165433.
Epoch 145/300

Epoch 00145: LearningRateScheduler setting learning rate to 0.0014412376331165433.

Epoch 00145: saving model to checkpoints/model_ck

Epoch 164/300

Epoch 00164: LearningRateScheduler setting learning rate to 0.0011674024863168597.
Epoch 165/300

Epoch 00165: LearningRateScheduler setting learning rate to 0.0011674024863168597.

Epoch 00165: saving model to checkpoints/model_ckpt_epoch_165.hdf5
Epoch 166/300

Epoch 00166: LearningRateScheduler setting learning rate to 0.0011674024863168597.
Epoch 167/300

Epoch 00167: LearningRateScheduler setting learning rate to 0.0011674024863168597.
Epoch 168/300

Epoch 00168: LearningRateScheduler setting learning rate to 0.0011674024863168597.
Epoch 169/300

Epoch 00169: LearningRateScheduler setting learning rate to 0.0011674024863168597.
Epoch 170/300

Epoch 00170: LearningRateScheduler setting learning rate to 0.0011674024863168597.

Epoch 00170: saving model to checkpoints/model_ckpt_epoch_170.hdf5
Epoch 171/300

Epoch 00171: LearningRateScheduler setting learning rate to 0.0010506622376851738.
Epoch 172/300

Epoch 00172: LearningRateScheduler setting learning rate to 0.001

Epoch 218/300

Epoch 00218: LearningRateScheduler setting learning rate to 0.0006893394747748971.
Epoch 219/300

Epoch 00219: LearningRateScheduler setting learning rate to 0.0006893394747748971.
Epoch 220/300

Epoch 00220: LearningRateScheduler setting learning rate to 0.0006893394747748971.

Epoch 00220: saving model to checkpoints/model_ckpt_epoch_220.hdf5
Epoch 221/300

Epoch 00221: LearningRateScheduler setting learning rate to 0.0006204055272974074.
Epoch 222/300

Epoch 00222: LearningRateScheduler setting learning rate to 0.0006204055389389396.
Epoch 223/300

Epoch 00223: LearningRateScheduler setting learning rate to 0.0006204055389389396.
Epoch 224/300

Epoch 00224: LearningRateScheduler setting learning rate to 0.0006204055389389396.
Epoch 225/300

Epoch 00225: LearningRateScheduler setting learning rate to 0.0006204055389389396.

Epoch 00225: saving model to checkpoints/model_ckpt_epoch_225.hdf5
Epoch 226/300

Epoch 00226: LearningRateScheduler setting learning rate to 0.000

Epoch 245/300

Epoch 00245: LearningRateScheduler setting learning rate to 0.0005025284481234848.

Epoch 00245: saving model to checkpoints/model_ckpt_epoch_245.hdf5
Epoch 246/300

Epoch 00246: LearningRateScheduler setting learning rate to 0.0005025284481234848.
Epoch 247/300

Epoch 00247: LearningRateScheduler setting learning rate to 0.0005025284481234848.
Epoch 248/300

Epoch 00248: LearningRateScheduler setting learning rate to 0.0005025284481234848.
Epoch 249/300

Epoch 00249: LearningRateScheduler setting learning rate to 0.0005025284481234848.
Epoch 250/300

Epoch 00250: LearningRateScheduler setting learning rate to 0.0005025284481234848.

Epoch 00250: saving model to checkpoints/model_ckpt_epoch_250.hdf5
Epoch 251/300

Epoch 00251: LearningRateScheduler setting learning rate to 0.0004522756033111364.
Epoch 252/300

Epoch 00252: LearningRateScheduler setting learning rate to 0.0004522755916696042.
Epoch 253/300

Epoch 00253: LearningRateScheduler setting learning rate to 0.000

Epoch 272/300

Epoch 00272: LearningRateScheduler setting learning rate to 0.0003663432435132563.
Epoch 273/300

Epoch 00273: LearningRateScheduler setting learning rate to 0.0003663432435132563.
Epoch 274/300

Epoch 00274: LearningRateScheduler setting learning rate to 0.0003663432435132563.
Epoch 275/300

Epoch 00275: LearningRateScheduler setting learning rate to 0.0003663432435132563.

Epoch 00275: saving model to checkpoints/model_ckpt_epoch_275.hdf5
Epoch 276/300

Epoch 00276: LearningRateScheduler setting learning rate to 0.0003663432435132563.
Epoch 277/300

Epoch 00277: LearningRateScheduler setting learning rate to 0.0003663432435132563.
Epoch 278/300

Epoch 00278: LearningRateScheduler setting learning rate to 0.0003663432435132563.
Epoch 279/300

Epoch 00279: LearningRateScheduler setting learning rate to 0.0003663432435132563.
Epoch 280/300

Epoch 00280: LearningRateScheduler setting learning rate to 0.0003663432435132563.

Epoch 00280: saving model to checkpoints/model_ck

Epoch 299/300

Epoch 00299: LearningRateScheduler setting learning rate to 0.0002967380278278142.
Epoch 300/300

Epoch 00300: LearningRateScheduler setting learning rate to 0.0002967380278278142.


[INFO    ] Done training. Time elapsed: 1:16:55.644935 sec
[INFO    ] Epoch 300/300 - loss: 9.616619110107422 - val_loss: 9.491670608520508



Epoch 00300: saving model to checkpoints/model_ckpt_epoch_300.hdf5


In [8]:
import glob
import re
from keras.models import load_model
def checkpoint_epoch(path):
    """Return the first integer found in the file name of ``path``.

    Only the base name is inspected, so digits in a parent directory
    (e.g. ``run2/checkpoints/...``) cannot influence the ordering.
    Raises ``IndexError`` when the file name contains no digits.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return int(re.findall(r"\d+", os.path.basename(path))[0])


# All saved checkpoints, ordered by the epoch number in their file name.
filenames = glob.glob("checkpoints/model_ckpt_*.hdf5")
files = sorted(filenames, key=checkpoint_epoch)
files[:10]

['checkpoints/model_ckpt_epoch_05.hdf5',
 'checkpoints/model_ckpt_epoch_10.hdf5',
 'checkpoints/model_ckpt_epoch_15.hdf5',
 'checkpoints/model_ckpt_epoch_20.hdf5',
 'checkpoints/model_ckpt_epoch_25.hdf5',
 'checkpoints/model_ckpt_epoch_30.hdf5',
 'checkpoints/model_ckpt_epoch_35.hdf5',
 'checkpoints/model_ckpt_epoch_40.hdf5',
 'checkpoints/model_ckpt_epoch_45.hdf5',
 'checkpoints/model_ckpt_epoch_50.hdf5']

In [9]:
# import glob
# import os

# list_of_files = glob.glob('checkpoints/model_ckpt_*.hdf5') # * means all if need specific format then *.csv
# latest_file = max(list_of_files, key=os.path.getctime)
# print(latest_file)

checkpoints/model_ckpt_epoch_300.hdf5


In [16]:
# Collect summary statistics of the "dense-output" layer's first weight array
# (the kernel of a Dense layer) for every checkpoint in `files`.
# `idxs` holds the epoch parsed from each checkpoint's file name, so it always
# lines up with the statistics lists regardless of how many checkpoints exist.
idxs, maxs, mins, abs_mins, negs, pos, var = [], [], [], [], [], [], []
for checkpoint_path in files:
    # Only the weights are read, so skip restoring the loss/optimizer state;
    # this also avoids having to resolve the custom loss at load time.
    loaded_model = load_model(filepath=checkpoint_path, compile=False)
    epoch = int(re.findall(r"\d+", os.path.basename(checkpoint_path))[0])
    for layer in loaded_model.layers:
        if layer.name == "dense-output":
            w = layer.get_weights()[0]
            idxs.append(epoch)
            maxs.append(np.max(w))
            mins.append(np.min(w))
            # Absolute value of the minimum (not the minimum absolute value).
            abs_mins.append(abs(np.min(w)))
            negs.append(np.sum(w < 0))
            pos.append(np.sum(w >= 0))
            var.append(np.var(w))

In [17]:
# One row per checkpoint epoch, one column per statistic.  Building the frame
# from a dict keeps each column's own dtype (the counts stay integers) instead
# of coercing everything to a common type through a transpose.
df = pd.DataFrame(
    {
        "max": maxs,
        "min": mins,
        "abs_min": abs_mins,
        "variance": var,
        "negs": negs,
        "pos": pos,
    },
    index=idxs,
)
df.to_excel("dense-output.xlsx", index=True, index_label="epoch")
# Last expression of the cell, so the table is actually displayed.
df