In [1]:
from __future__ import absolute_import, division, print_function
from tensorflow.keras.layers import Lambda, Dense, Input, Layer, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
from tensorflow.keras.initializers import Constant
from tensorflow.keras.backend import concatenate
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K


import gc
import argparse


In [2]:
# Query the physical GPUs once and reuse the result below.
gpus = tf.config.list_physical_devices('GPU')

# Enable memory growth on every physical GPU.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Restrict TensorFlow to the first GPU. Skipped on CPU-only machines, where
# indexing gpus[0] would raise an IndexError.
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')

print("Num GPUs Available: ", len(gpus))


Num GPUs Available:  1


2024-06-12 09:04:34.494886: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-12 09:04:34.510641: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-12 09:04:34.510910: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [3]:
# standard numerical library imports
import numpy as np
import math
from math import *
import scipy as sp
import matplotlib.pyplot as plt
import os

# energyflow imports
import energyflow as ef
from energyflow.archs import PFN
from energyflow.utils import data_split, to_categorical


In [4]:
# Global plot settings
from matplotlib import rc
import matplotlib.font_manager

# Fonts: serif family at size 22, with LaTeX text rendering disabled.
rc('font', family='serif', size=22)
rc('text', usetex=False)

# Tick labels on both axes and legend entries all use size 15.
rc(('xtick', 'ytick'), labelsize=15)
rc('legend', fontsize=15)


# Process label; a raw string so the backslashes are kept literally.
pythia_text = r'$POWHEG \; (hvq) \; pp \to  t\bar{t}$'
def make_legend(ax, title):
    """Draw a frameless, left-aligned legend with a title on ``ax``.

    Parameters
    ----------
    ax : object
        Axes-like object whose ``legend(frameon=False)`` call creates the
        legend.
    title : str
        Legend title, set with font size 20.

    Returns
    -------
    None
        The legend is modified in place and ``plt.tight_layout()`` is called
        afterwards.
    """
    leg = ax.legend(frameon=False)
    leg.set_title(title, prop={'size': 20})

    # Only the first legend entry is enlarged. A legend without any entries
    # has an empty ``texts`` list, so guard the index access.
    if leg.texts:
        leg.texts[0].set_fontsize(20)

    # NOTE(review): ``_legend_box`` is a private attribute of the legend
    # object; confirm it still exists when upgrading matplotlib.
    leg._legend_box.align = "left"
    plt.tight_layout()


# Activate the CMS style sheet shipped with mplhep.
# NOTE(review): this runs after the rc(...) calls earlier in this cell; any
# rcParams keys the style sheet defines would presumably override those
# settings - confirm the ordering is intended.
import mplhep as hep
plt.style.use(hep.style.CMS)


In [5]:
# define training iter function and helpful setup_nn function

def setup_nn(Phi_sizes = (100,100,128), F_sizes = (100,100,100), input_dim=6, patience = 15, save_label = 'DCTR_training', out_dir = './saved_models'):
    """Create a PFN model and the Keras callbacks used while training it.

    Parameters
    ----------
    Phi_sizes, F_sizes : tuple of int
        Layer sizes forwarded to ``PFN``.
    input_dim : int
        Forwarded to ``PFN`` as ``input_dim``.
    patience : int
        Forwarded to the early-stopping callback as ``patience``.
    save_label : str
        File stem for the checkpoint (``<save_label>.h5``) and the loss log
        (``<save_label>_loss.csv``).
    out_dir : str
        Directory the files are written to; created if missing.

    Returns
    -------
    tuple
        ``(model, callbacks)`` where ``callbacks`` is the list
        ``[ModelCheckpoint, CSVLogger, EarlyStopping]``.
    """
    model_path = f'{out_dir}/{save_label}.h5'

    network = PFN(input_dim=input_dim, Phi_sizes=Phi_sizes, F_sizes=F_sizes, summary=False)

    # create output dir, if it doesn't exist
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # keep only the model with the lowest validation loss on disk
    best_model_saver = keras.callbacks.ModelCheckpoint(
        model_path,
        monitor='val_loss',
        verbose=2,
        save_best_only=True,
        mode='min',
    )

    # per-epoch metrics; the file is overwritten (append=False)
    loss_logger = keras.callbacks.CSVLogger(f'{out_dir}/{save_label}_loss.csv', append=False)

    # stop once val_loss has not improved for `patience` epochs
    early_stopper = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=patience,
        verbose=1,
        restore_best_weights=True,
    )

    return network, [best_model_saver, loss_logger, early_stopper]


def train_single_iteration(X0, X1, iteration, num_events = int(4e7), batch_size = (5*8192), save_label = 'DCTR_training', out_dir = './saved_models', epochs = 1000, val_fraction = 0.25):
    """Train one classifier on a random subsample of two datasets and save it.

    Parameters
    ----------
    X0, X1 : array
        Event arrays indexed along their first axis. Events drawn from ``X0``
        are labelled class 0, events drawn from ``X1`` class 1.
    iteration : int
        Iteration number; formatted as two digits into all output file names.
    num_events : int
        Number of events drawn from each dataset. ``np.random.choice`` is
        called without ``replace=False``, so the same event can be drawn more
        than once.
    batch_size : int
        Batch size passed to ``fit``.
    save_label : str
        Stem of the output file names.
    out_dir : str
        Output directory (created by ``setup_nn``).
    epochs : int
        Upper bound on the number of epochs passed to ``fit``; the callbacks
        from ``setup_nn`` include early stopping.
    val_fraction : float
        Fraction of the combined sample held out for validation (passed to
        ``data_split`` as ``test``).

    Returns
    -------
    None
        Writes ``<out_dir>/<save_label>_iter_XX.h5`` and
        ``<out_dir>/<save_label>_iter_XX_history.pdf``, in addition to whatever
        the callbacks created by ``setup_nn`` write.
    """
    iter_label = f'{save_label}_iter_{iteration:02d}'

    # take random num_events from each dataset
    rand0 = np.random.choice(len(X0), num_events) # num_events random indices
    rand1 = np.random.choice(len(X1), num_events)

    local_X0 = X0[rand0]
    local_X1 = X1[rand1]

    # create classifier array Y: 0 for X0 events, 1 for X1 events.
    # np.zeros/np.ones avoid building multi-million element Python lists.
    Y = np.concatenate((np.zeros(local_X0.shape[0]), np.ones(local_X1.shape[0])))
    Y = to_categorical(Y, num_classes=2)

    # create training array
    X = np.concatenate((local_X0, local_X1))

    X_train, X_val, Y_train, Y_val = data_split(X, Y, test=val_fraction, shuffle=True)

    del rand0, rand1, local_X0, local_X1, X, Y # delete the tmp variables to free memory
    gc.collect() # collect garbage to free memory

    # ready to start training
    dctr, callbacks = setup_nn(save_label = iter_label, out_dir = out_dir)

    print('Starting training')
    history = dctr.fit(X_train, Y_train,
                       epochs = epochs,
                       batch_size = batch_size,
                       validation_data = (X_val, Y_val),
                       verbose = 1,
                       callbacks = callbacks)

    # NOTE(review): this is the same path the ModelCheckpoint callback from
    # setup_nn writes to, so the checkpoint file is overwritten here.
    dctr.save(f'{out_dir}/{iter_label}.h5')

    # loss curves of this iteration
    # NOTE(review): the figure is not closed here; when calling this in a long
    # loop outside a notebook, confirm figures do not pile up.
    plt.figure(figsize=(6,5))
    plt.plot(history.history['loss'],     label = 'loss')
    plt.plot(history.history['val_loss'], label = 'val loss')
    plt.legend(loc=0)
    plt.ylabel('loss')
    plt.xlabel('Epochs')
    plt.savefig(f'{out_dir}/{iter_label}_history.pdf')

    del history, dctr, callbacks, X_train, X_val, Y_train, Y_val # delete vars to free memory
    K.clear_session()
    gc.collect() # collect garbage to free memory


In [6]:
# Root directory of the input samples. This is an absolute path, so it has to
# be adapted when the notebook runs on another machine.
data_dir = '/tf/data/BachelorThesis_Data/Valentinas_Samples'


In [7]:
# Inspect the 2.305 sample: print the first particle (index 0 on axis 1) of the
# first ten events, then the per-feature minimum and maximum over all events.
dataset_up = np.load(f'{data_dir}/test_samples_ready/X1_Train_13TeV_2.305_plot.npz')# hdamp value

# An .npz archive reads the array from disk again on every key access, so
# fetch it once and reuse the slice for all three printouts.
first_particle_up = dataset_up['a'][:, 0, :]
print(first_particle_up[:10])

print(np.min(first_particle_up, axis = 0))
print(np.max(first_particle_up, axis = 0))

# the slice is a view that keeps the full array alive; release it again
del first_particle_up


[[ 2.08148676  0.6992826  -0.59109546  0.70764628  0.1         1.379     ]
 [ 2.13661    -1.70731505  2.28662851  0.70556868  0.1         1.379     ]
 [ 2.16186714  2.03290375  1.99770953  0.7091455   0.1         1.379     ]
 [ 1.94549652  1.31229885  2.85453456  0.70742535  0.1         1.379     ]
 [ 2.11433274 -1.11956486 -2.14938315  0.70351428  0.1         1.379     ]
 [ 1.65173211 -1.6161456   0.87467413  0.7040604   0.1         1.379     ]
 [ 2.08502436 -1.30460162  1.77656825  0.70665806  0.1         1.379     ]
 [ 2.43647629  1.33334723 -1.61424266  0.7044473   0.1         1.379     ]
 [ 2.46671257 -0.5252575   0.54416391  0.71121148  0.1         1.379     ]
 [ 2.26424266  1.37205437  2.91405565  0.70220938  0.1         1.379     ]]
[-1.7948258  -4.17589557 -3.14159232  0.32964137  0.1         1.379     ]
[3.33542968 4.20943493 3.14159231 1.         0.1        1.379     ]


In [8]:
# Inspect the 1.379 sample: print the first particle (index 0 on axis 1) of the
# first ten events, then the per-feature minimum and maximum over all events.
dataset_nom = np.load(f'{data_dir}/test_samples_ready/X0_Train_13TeV_1.379_plot.npz')

# An .npz archive reads the array from disk again on every key access, so
# fetch it once and reuse the slice for all three printouts.
first_particle_nom = dataset_nom['a'][:, 0, :]
print(first_particle_nom[:10])

print(np.min(first_particle_nom, axis = 0))
print(np.max(first_particle_nom, axis = 0))

# the slice is a view that keeps the full array alive; release it again
del first_particle_nom



[[ 2.08036261  0.70378566 -0.59099605  0.70764629  0.1         1.379     ]
 [ 2.03421331 -1.7585162   2.25428646  0.70556868  0.1         1.379     ]
 [ 2.16052334  2.04118263  1.99775585  0.7091455   0.1         1.379     ]
 [ 2.11038967  1.36066105 -2.04403781  0.7001572   0.1         1.379     ]
 [ 1.90231044  0.94453788 -2.28350418  0.70036184  0.1         1.379     ]
 [ 2.09914199 -0.24509181 -2.2960895   0.72042176  0.1         1.379     ]
 [ 1.18093296  1.51109212 -2.6413216   0.72837334  0.1         1.379     ]
 [ 2.12871994 -0.24745518  2.03932809  0.79072319  0.1         1.379     ]
 [ 2.08599142 -3.03056926 -0.21397835  0.71161655  0.1         1.379     ]
 [ 1.95338316 -0.96071901  0.43494603  0.70625258  0.1         1.379     ]]
[-1.73507839 -4.13107874 -3.14159251  0.32964137  0.1         1.379     ]
[3.43662793 4.14941193 3.14159258 1.         0.1        1.379     ]


In [19]:
# Remove the names bound to the two inspected .npz archives.
# NOTE(review): this cell fails on a fresh kernel unless both inspection cells
# above have been run first.
del dataset_up, dataset_nom


In [7]:
# reverse engineering normalization function
# pt -> np.log10(pt) ?
# rapidity and phi aren't touched
# mass = mass / max(mass) ?
# pid: top:  6->0.1
#     atop: -6->0.2

# hdamp_val: 1.379

def normalize(X):
    """Return a normalised copy of an event array.

    The input is not modified, so calling the function twice on the same
    array no longer applies the logarithm twice. Only the first two entries
    along axis 1 are changed; any further entries are copied through.

    For each of those two entries the features on the last axis become:

    * index 0 (pt):    ``log10`` of the value
    * index 3 (mass):  value divided by the maximum of that column over the
      events in ``X`` (so the scale depends on the array passed in)
    * index 4 (pid):   constant ``0.1`` for entry 0, ``0.2`` for entry 1
    * index 5 (hdamp): constant ``1.379``

    Indices 1 and 2 are left unchanged.

    Parameters
    ----------
    X : array-like
        Indexed as ``X[event, particle, feature]`` with at least two
        particles and six features.

    Returns
    -------
    numpy.ndarray
        New floating-point array with the same shape as ``X``. Floating-point
        input keeps its dtype; other dtypes are converted to ``float`` so the
        fractional constants are not truncated.
    """
    out = np.array(X, copy=True)
    if not np.issubdtype(out.dtype, np.floating):
        out = out.astype(float)

    # nothing to scale; np.max would raise on an empty column
    if out.shape[0] == 0:
        return out

    for particle, pid_code in ((0, 0.1), (1, 0.2)):
        out[:, particle, 0] = np.log10(out[:, particle, 0]) # log pt
        out[:, particle, 3] = out[:, particle, 3] / np.max(out[:, particle, 3]) # mass
        out[:, particle, 4] = pid_code # pid
        out[:, particle, 5] = 1.379 # hdamp

    return out


In [7]:
# convert datasets to normed
# NOTE: the conversion loop below is wrapped in a triple-quoted string, so it
# is not executed when this cell runs; only data_dir is (re)assigned here.

data_dir     = '/tf/data/BachelorThesis_Data/Valentinas_Samples'
'''
nominal_list = []
up_list      = []

# X0: up
# X1: nominal

# shape of the datasets in valentinas script was (:, 2, 6)
# shape of the datasets passed to me are         (:, 3, 6)
# so removing 3. entry to be left with only top and anti-top


for i in range(1, 51):
    X0 = []
    X1 = []
    X0_nrm = []
    X1_nrm = []
    
    X0 = np.load(f'{data_dir}/up/Results{i}/2MeventsTrain_2.305_seed{i}_Base1000_13TeV_P4.npz')['a'][:,:-1,:]    
    X1 = np.load(f'{data_dir}/nominal/Results{i}/2MeventsTrain_1.379_seed{i}_Base1000_13TeV_P4.npz')['a'][:,:-1,:] # removing jet

    X0_nrm = np.array(normalize(X0))
    X1_nrm = np.array(normalize(X1))

    np.save(f'{data_dir}/up/Results{i}/2MeventsTrain_2.305_seed{i}_Base1000_13TeV_P4_normed.npy', X0_nrm)
    np.save(f'{data_dir}/nominal/Results{i}/2MeventsTrain_1.379_seed{i}_Base1000_13TeV_P4_normed.npy', X1_nrm)

    del X0, X1, X0_nrm, X1_nrm
    gc.collect()
    
    print(f'loaded and normalized nominal and up datasets: \n\
        up to Results{i}')
'''


loaded and normalized nominal and up datasets: 
        up to Results1
loaded and normalized nominal and up datasets: 
        up to Results2
loaded and normalized nominal and up datasets: 
        up to Results3
loaded and normalized nominal and up datasets: 
        up to Results4
loaded and normalized nominal and up datasets: 
        up to Results5
loaded and normalized nominal and up datasets: 
        up to Results6
loaded and normalized nominal and up datasets: 
        up to Results7
loaded and normalized nominal and up datasets: 
        up to Results8
loaded and normalized nominal and up datasets: 
        up to Results9
loaded and normalized nominal and up datasets: 
        up to Results10
loaded and normalized nominal and up datasets: 
        up to Results11
loaded and normalized nominal and up datasets: 
        up to Results12
loaded and normalized nominal and up datasets: 
        up to Results13
loaded and normalized nominal and up datasets: 
        up to Results14
l

In [8]:
# load normed datasets

data_dir     = '/tf/data/BachelorThesis_Data/Valentinas_Samples'

nominal_list = []
up_list      = []

# X0: up
# X1: nominal
# Keep one array per file and join them once at the end. Extending a Python
# list with the contents of each file would create one list element per event
# before np.array() copies everything a second time.
X0_chunks = []
X1_chunks = []


for i in range(1, 51):
    X0_chunks.append(np.load(f'{data_dir}/up/Results{i}/2MeventsTrain_2.305_seed{i}_Base1000_13TeV_P4_normed.npy'))
    X1_chunks.append(np.load(f'{data_dir}/nominal/Results{i}/2MeventsTrain_1.379_seed{i}_Base1000_13TeV_P4_normed.npy'))
    if not i%5: 
        print(f'loaded and added normalized nominal and up datasets: \n\
            up to Results{i}')

# concatenate along the event axis and drop the per-file arrays right away so
# both copies of a sample are not held longer than necessary
X0_nrm = np.concatenate(X0_chunks)
del X0_chunks
X1_nrm = np.concatenate(X1_chunks)
del X1_chunks

print(f'{X0_nrm.shape}')
print(f'{X1_nrm.shape}')

gc.collect()


loaded and added normalized nominal and up datasets: 
            up to Results5
loaded and added normalized nominal and up datasets: 
            up to Results10
loaded and added normalized nominal and up datasets: 
            up to Results15
loaded and added normalized nominal and up datasets: 
            up to Results20
loaded and added normalized nominal and up datasets: 
            up to Results25
loaded and added normalized nominal and up datasets: 
            up to Results30
loaded and added normalized nominal and up datasets: 
            up to Results35
loaded and added normalized nominal and up datasets: 
            up to Results40
loaded and added normalized nominal and up datasets: 
            up to Results45
loaded and added normalized nominal and up datasets: 
            up to Results50
(99989274, 2, 6)
(99053382, 2, 6)


682

In [18]:
# check datasets
'''
for part in range(2):
    for arg in range(6):
        print(f'{np.mean(X0_nrm[:, part, arg]) = :.3f}')
        print(f'{np.mean(X1_nrm[:, part, arg]) = :.3f}')
'''


np.mean(X0_nrm[:, part, arg]) = 1.997
np.mean(X1_nrm[:, part, arg]) = 1.996
np.mean(X0_nrm[:, part, arg]) = -0.000
np.mean(X1_nrm[:, part, arg]) = 0.000
np.mean(X0_nrm[:, part, arg]) = 0.000
np.mean(X1_nrm[:, part, arg]) = 0.000
np.mean(X0_nrm[:, part, arg]) = 0.705
np.mean(X1_nrm[:, part, arg]) = 0.705
np.mean(X0_nrm[:, part, arg]) = 0.100
np.mean(X1_nrm[:, part, arg]) = 0.100
np.mean(X0_nrm[:, part, arg]) = 1.379
np.mean(X1_nrm[:, part, arg]) = 1.379
np.mean(X0_nrm[:, part, arg]) = 1.998
np.mean(X1_nrm[:, part, arg]) = 1.997
np.mean(X0_nrm[:, part, arg]) = -0.000
np.mean(X1_nrm[:, part, arg]) = 0.000
np.mean(X0_nrm[:, part, arg]) = -0.000
np.mean(X1_nrm[:, part, arg]) = 0.000
np.mean(X0_nrm[:, part, arg]) = 0.705
np.mean(X1_nrm[:, part, arg]) = 0.705
np.mean(X0_nrm[:, part, arg]) = 0.200
np.mean(X1_nrm[:, part, arg]) = 0.200
np.mean(X0_nrm[:, part, arg]) = 1.379
np.mean(X1_nrm[:, part, arg]) = 1.379


In [None]:
# train 20 iterations and save the model each time using the predefined functions
# taking 40 million random samples from each dataset (containing 99 million + events) for each iteration
# (40 million is the default num_events of train_single_iteration; the iteration
# number i is formatted into the output file names as _iter_01 ... _iter_20)

for i in range(1, 21):
    train_single_iteration(X0_nrm, X1_nrm, iteration=i)
    


2024-06-12 09:09:01.866445: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-12 09:09:01.870044: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-12 09:09:01.870559: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-12 09:09:01.870769: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

Starting training


2024-06-12 09:09:02.795696: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2880000000 exceeds 10% of free system memory.
2024-06-12 09:09:03.530714: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2880000000 exceeds 10% of free system memory.


Epoch 1/1000
Epoch 1: val_loss improved from inf to 0.69323, saving model to ./saved_models/DCTR_training_iter_01.h5
Epoch 2/1000
Epoch 2: val_loss improved from 0.69323 to 0.69309, saving model to ./saved_models/DCTR_training_iter_01.h5
Epoch 3/1000
Epoch 3: val_loss improved from 0.69309 to 0.69303, saving model to ./saved_models/DCTR_training_iter_01.h5
Epoch 4/1000
Epoch 4: val_loss did not improve from 0.69303
Epoch 5/1000
Epoch 5: val_loss improved from 0.69303 to 0.69302, saving model to ./saved_models/DCTR_training_iter_01.h5
Epoch 6/1000
Epoch 6: val_loss did not improve from 0.69302
Epoch 7/1000
Epoch 7: val_loss improved from 0.69302 to 0.69302, saving model to ./saved_models/DCTR_training_iter_01.h5
Epoch 8/1000
Epoch 8: val_loss improved from 0.69302 to 0.69298, saving model to ./saved_models/DCTR_training_iter_01.h5
Epoch 9/1000
Epoch 9: val_loss improved from 0.69298 to 0.69298, saving model to ./saved_models/DCTR_training_iter_01.h5
Epoch 10/1000
Epoch 10: val_loss did 

2024-06-12 09:24:43.614654: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2880000000 exceeds 10% of free system memory.
2024-06-12 09:24:44.294126: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2880000000 exceeds 10% of free system memory.


Epoch 1/1000
Epoch 1: val_loss improved from inf to 0.69316, saving model to ./saved_models/DCTR_training_iter_02.h5
Epoch 2/1000
Epoch 2: val_loss improved from 0.69316 to 0.69309, saving model to ./saved_models/DCTR_training_iter_02.h5
Epoch 3/1000
Epoch 3: val_loss improved from 0.69309 to 0.69304, saving model to ./saved_models/DCTR_training_iter_02.h5
Epoch 4/1000
Epoch 4: val_loss did not improve from 0.69304
Epoch 5/1000
Epoch 5: val_loss improved from 0.69304 to 0.69300, saving model to ./saved_models/DCTR_training_iter_02.h5
Epoch 6/1000
Epoch 6: val_loss did not improve from 0.69300
Epoch 7/1000
Epoch 7: val_loss improved from 0.69300 to 0.69299, saving model to ./saved_models/DCTR_training_iter_02.h5
Epoch 8/1000
Epoch 8: val_loss improved from 0.69299 to 0.69298, saving model to ./saved_models/DCTR_training_iter_02.h5
Epoch 9/1000
Epoch 9: val_loss did not improve from 0.69298
Epoch 10/1000
Epoch 10: val_loss improved from 0.69298 to 0.69297, saving model to ./saved_models/

2024-06-12 09:41:57.822765: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2880000000 exceeds 10% of free system memory.


Epoch 1/1000
Epoch 1: val_loss improved from inf to 0.69314, saving model to ./saved_models/DCTR_training_iter_03.h5
Epoch 2/1000
Epoch 2: val_loss improved from 0.69314 to 0.69310, saving model to ./saved_models/DCTR_training_iter_03.h5
Epoch 3/1000
Epoch 3: val_loss improved from 0.69310 to 0.69304, saving model to ./saved_models/DCTR_training_iter_03.h5
Epoch 4/1000
Epoch 4: val_loss improved from 0.69304 to 0.69303, saving model to ./saved_models/DCTR_training_iter_03.h5
Epoch 5/1000
Epoch 5: val_loss improved from 0.69303 to 0.69302, saving model to ./saved_models/DCTR_training_iter_03.h5
Epoch 6/1000
Epoch 6: val_loss improved from 0.69302 to 0.69300, saving model to ./saved_models/DCTR_training_iter_03.h5
Epoch 7/1000
Epoch 7: val_loss did not improve from 0.69300
Epoch 8/1000
Epoch 8: val_loss improved from 0.69300 to 0.69298, saving model to ./saved_models/DCTR_training_iter_03.h5
Epoch 9/1000
Epoch 9: val_loss improved from 0.69298 to 0.69297, saving model to ./saved_models/D