# Trying to improve the model in the high p_T (top) areas that it curently struggles in

-> fit an A*exp(b*p_t) to p_t bins
-> multiply wgts by exp(b*p_T)/max(exp(b*p_T)) -> low p_t -> very low wgts ; high p_T -> weights +/- 1 
        -> Problem: most weights very very small -> large loss of statistics
            -> normalizing to mean of 1 -> some small percentage of weights get very large (>2000), but most are below 1

-> IDEA: Use 10M events with these weigths and 10M with (scaled (20%)) org weights, since we don't want to lose the low p_t accuracy but also want the network to learn these high p_T events
    For NNLO use same events with new weights, NLO use different events (since we have them)



In [1]:
# import system modules
import sys
import os
os.system('for a in /sys/bus/pci/devices/*; do echo 0 | tee -a $a/numa_node>/dev/null; done') # get rid of NUMA node warnings: https://github.com/tensorflow/tensorflow/issues/42738
import gc

# import standard numerical modules
import numpy as np
import math

# import machine learning modules
import tensorflow as tf
import keras.backend as K

gpu = tf.config.list_physical_devices('GPU') # make sure GPU usage is enabled
tf.config.experimental.set_virtual_device_configuration(gpu[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=int(7.5*1024))])
print(gpu) 

sys.path.append('../') # path th DCTR.py
import DCTR


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
from scipy.optimize import curve_fit
from scipy import stats
import matplotlib.pyplot as plt

def exponential(x, A, b):
    value = A*np.exp(np.multiply(x,b))
    return value

def get_exponential_wgts(X, part_index, arg_index, cut_off = 1000, div = 31):
    bins = np.linspace(min(X[:,part_index,arg_index]),cut_off, div)
    n, bins = np.histogram(X[:,part_index,arg_index], bins=bins)
    bin_centers = bins[:-1]+0.5*(bins[1:]-bins[:-1])
    max_n = max(n)
    where_max=np.squeeze(np.where(n == max_n))
    # print(bins[where_max])
    
    # fit exp to histogram
    p0 = (5e6, -0.015)
    popt, pcov = curve_fit(exponential, bin_centers[where_max:], n[where_max:], p0)

    # calculate the wgts as the inverse of the exponential and the max value as normalization
    # only use events within cut_off
    x_cut = [pt for pt in X[:,part_index,arg_index] if pt <= cut_off]
    
    max_exp = max(exponential(x_cut, 1, -1*popt[1]))
    
    # set wgts above cut off to 1
    wgts = []
    for i in range(len(x0_1)):
        if X[i,part_index,arg_index] >= cut_off:
            wgts.append(1)
        else:
            wgts.append(exponential(X[i,part_index,arg_index], 1, -1*popt[1])/max_exp)

    return wgts


# load data

In [44]:
# directory with pre converted lhe files as numpy arrays
data_dir = '../Data' # modify as needed


In [45]:
# Load POWHEG hvq x0 datasets
# x0_nrm for training, x0_plt and x0_plt_nrm for calculating stats used to decide which model performs best
# only contain tt-pair; every event has order: 
    # tt-pair, top, anti-top
# every particle has arguments: 
    # [pt, y, phi, mass, eta, E, PID, w, theta]
    # [0 , 1, 2  , 3   , 4  , 5, 6  , 7, 8    ]

# POWHEG hvq

# unnormalized dataset for checking which events fall into one (or more) of the categories that should get higher weights
x0_1 = []
x0_1 = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/01-02_converted_lhe.npz', i=3)[:9553938] # 9553938 num of NNLO samples
print(f'POWHEG hvq x0_1.shape:         {x0_1.shape}')

x0_2 = []
x0_2 = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/01-02_converted_lhe.npz', i=3)[int(1e7):int(1e7 + 9553938)] # 10M different samples than above
print(f'POWHEG hvq x0_2.shape:         {x0_2.shape}')

print(gc.collect()) # gabage collection


POWHEG hvq x0_1.shape:         (9553938, 3, 9)
POWHEG hvq x0_2.shape:         (9553938, 3, 9)
63331


In [46]:
# same data as above, but normalized
x0_1_nrm = []
x0_1_nrm = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/01-02_normed_converted_lhe.npz', i=3)[:9553938] 
print(f'POWHEG hvq x0_nrm_1.shape:     {x0_1_nrm.shape}')

x0_2_nrm = []
x0_2_nrm = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/01-02_normed_converted_lhe.npz', i=3)[int(1e7):int(1e7 + 9553938)] # 10M different samples than above
print(f'POWHEG hvq x0_nrm_1.shape:     {x0_2_nrm.shape}')

print(gc.collect()) # gabage collection


POWHEG hvq x0_nrm_1.shape:     (9553938, 3, 9)
POWHEG hvq x0_nrm_1.shape:     (9553938, 3, 9)
22


In [47]:
# MiNNLO x1
# training data
x1_1_nrm = []
x1_1_nrm = DCTR.load_dataset(f'{data_dir}/MiNNLO/converted_with_13TeV_NLO/normed_converted_lhe.npz', i=3)
print(f'MiNNLO x1_1_nrm.shape: {x1_1_nrm.shape}')

x1_2_nrm = x1_1_nrm.copy()
print(f'MiNNLO x1_2_nrm.shape: {x1_2_nrm.shape}')

print(gc.collect()) # gabage collection


MiNNLO x1_1_nrm.shape: (9553938, 3, 9)
MiNNLO x1_2_nrm.shape: (9553938, 3, 9)
11


In [48]:
# plotting datasets (for calculating stats during super epoch)

# POWHEG hvq
# plotting data; different from training data
x0_plt = []
x0_plt = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/03-04_converted_lhe.npz', i=3)[:9553938]
print(f'POWHEG hvq x0_plt.shape:     {x0_plt.shape}')


x0_plt_nrm = [] # nrm data for calculating rwgt in statistics phase of super epoch
x0_plt_nrm = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/03-04_normed_converted_lhe.npz', i=3)[:9553938]
print(f'POWHEG hvq x0_plt_nrm.shape: {x0_plt_nrm.shape}')


# MiNNLO
x1_plt = []
x1_plt = DCTR.load_dataset(f'{data_dir}/MiNNLO/converted_with_13TeV_NLO/converted_lhe.npz', i=3)
print(f'MiNNLO x1_plt.shape: {x1_plt.shape}')

print(gc.collect()) # gabage collection


POWHEG hvq x0_plt.shape:     (9553938, 3, 9)
POWHEG hvq x0_plt_nrm.shape: (9553938, 3, 9)
MiNNLO x1_plt.shape: (9553938, 3, 9)
33


In [49]:
# get exponential weights

x0_wgts_top = get_exponential_wgts(x0_1, 1, 0) # exp wgts for top pt
x0_wgts_anti_top = get_exponential_wgts(x0_1, 2, 0) # exp wgts for anti-top pt

x0_exp_wgts = np.add(x0_wgts_top, x0_wgts_anti_top)
x0_nrm_exp_wgts = x0_exp_wgts/np.mean(x0_exp_wgts)

# X1 MiNNLO
x1_wgts_top = get_exponential_wgts(x1_plt, 1, 0) # exp wgts for top pt
x1_wgts_anti_top = get_exponential_wgts(x1_plt, 2, 0) # exp wgts for anti-top pt

x1_exp_wgts = np.add(x1_wgts_top, x1_wgts_anti_top)
x1_nrm_exp_wgts = x1_exp_wgts/np.mean(x1_exp_wgts)

# normalized event generator weights
x0_1_nrm_wgts = x0_1[:,0,7]
x0_1_nrm_wgts /= np.mean(x0_1_nrm_wgts)

x0_2_nrm_wgts = x0_2[:,0,7]
x0_2_nrm_wgts /= np.mean(x0_2_nrm_wgts)


x1_nrm_wgts = x1_plt[:,0,7]
x1_nrm_wgts /= np.mean(x1_nrm_wgts)

print(gc.collect()) # gabage collection


0


In [50]:
# set weights for first datasets to exp_wgt*nrm_wgts
x0_1_nrm[:,0,7] = x0_1_nrm_wgts * x0_nrm_exp_wgts
x1_1_nrm[:,0,7] = x1_nrm_wgts * x1_nrm_exp_wgts

# set event gen wgts
regular_multiplier = 10 # multiplier for regular events importance | 0.2x and 1x did not lead to satisfactory results; trying 10x
x0_2_nrm[:,0,7] = regular_multiplier * x0_2_nrm_wgts
x1_2_nrm[:,0,7] = regular_multiplier * x1_nrm_wgts

# create concatenated x0 and x1 datasets with normal and exp weights
x0_nrm = np.concatenate((x0_1_nrm, x0_2_nrm))
x1_nrm = np.concatenate((x1_1_nrm, x1_2_nrm))

x0_nrm[:,0,7] /= np.mean(x0_nrm[:,0,7]) # adjust wgts so mean is 1 across entire concated datasets incorporating above multipliers
x1_nrm[:,0,7] /= np.mean(x1_nrm[:,0,7])

print(f'hvq    x0_nrm.shape: {x0_nrm.shape}')
print(f'MiNNLO x1_nrm.shape: {x1_nrm.shape}')

print(gc.collect()) # gabage collection


hvq    x0_nrm.shape: (19107876, 3, 9)
MiNNLO x1_nrm.shape: (19107876, 3, 9)
0


In [51]:
# delete eta (pseudorapidity) and Energy -> Train only with [pt, y, phi, m, PID]

# delete energy
x0_nrm = np.delete(x0_nrm, 5, -1)
x0_plt_nrm = np.delete(x0_plt_nrm, 5, -1)
x1_nrm = np.delete(x1_nrm, 5, -1)

# delete eta
x0_nrm = np.delete(x0_nrm, 4, -1)
x0_plt_nrm = np.delete(x0_plt_nrm, 4, -1)
x1_nrm = np.delete(x1_nrm, 4, -1)

print(gc.collect()) # gabage collection


0


In [52]:
# prep arrays for training
x_train, x_val, y_train, y_val, wgt_train, wgt_val = DCTR.prep_arrays(x0_nrm, x1_nrm, val=0.2)

# bring into shape for training loop
train_data = (x_train, y_train, x_val, y_val, wgt_train, wgt_val)
plt_data = (x0_plt , x0_plt_nrm, x1_plt, x1_nrm_wgts)


In [53]:
K.clear_session() # clear gpu memory

# clear temp arrays and variables from memory
del wgt_train, wgt_val, x0_1, x0_1_nrm, x0_1_nrm_wgts, x0_2, x0_2_nrm, x0_2_nrm_wgts, x0_exp_wgts, x0_nrm, x0_nrm_exp_wgts
del x0_plt, x0_plt_nrm, x0_wgts_anti_top, x0_wgts_top, x1_1_nrm, x1_2_nrm, x1_exp_wgts, x1_nrm, x1_nrm_exp_wgts
del x1_nrm_wgts, x1_plt, x1_wgts_anti_top, x1_wgts_top, x_train, x_val, y_train, y_val

print(gc.collect()) # gabage collection


65


# training loop

In [27]:
train_dir = './train_20240415' # where to save models during training


In [28]:
# start training loop
''' train_loop() necessary arguments
train_data, plt_data

default arguments:
model=None, lowest_chi2 = 1e6, train_dir = '/tf/home/gdrive/_STUDIUM_/DCTR_Paper/train',
batch_sizes=[4*8192, 8*8192, 16*8192, 32*8192], repeat=5, super_epochs=35, super_patience = 5, epochs = 8, starting_super_epoch = 1, 
input_dim=5, Phi_sizes = (100,100,128), F_sizes = (128,100,100), loss = 'mse', dropout=0.0, l2_reg=0.0, 
Phi_acts=('linear', 'gelu', 'gelu'), F_acts=('gelu', 'gelu', 'linear'), output_act='sigmoid', learning_rate=0.001

returns: best_model_list, lowest_chi2_list, lowest_loss_list
'''
best_model_list, lowest_chi2_list, _ = DCTR.train_loop(train_data, plt_data, batch_sizes=[24*8192, 32*8192, 40*8192], repeat=7, super_epochs=5,
                                                       train_dir = train_dir, epochs=15, learning_rate=0.001)

best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


starting super_epoch 1

starting training with batch_size: 196608 and 15 epochs
starting with weights from model: None
starting run 0 of super_epoch 1 with batch_size 196608


2024-04-15 13:14:39.538964: I tensorflow/stream_executor/cuda/cuda_driver.cc:739] failed to allocate 7.50G (8053063680 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-04-15 13:14:39.656081: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


reset neural network weights


2024-04-15 13:14:59.641185: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets

Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0005707376869395375.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2389 of run 0 of super_epoch 1 with batch_size 196608

starting run 1 of super_epoch 1 with batch_size 196608
reset neural network weights


2024-04-15 13:19:00.006737: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 0.0006000000284984708.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
clearing keras session and collecting garbage

 best loss 0.2375 of run 1 of super_epoch 1 with batch_size 196608

starting run 2 of super_epoch 1 with batch_size 196608
reset neural network weights


2024-04-15 13:23:18.232630: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets

Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0005707376869395375.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
clearing keras session and collecting garbage

 best loss 0.2391 of run 2 of super_epoch 1 with batch_size 196608

starting run 3 of super_epoch 1 with batch_size 196608
reset neural network weights


2024-04-15 13:27:37.776542: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets

Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0005822673439979553.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
clearing keras session and collecting garbage

 best loss 0.2366 of run 3 of super_epoch 1 with batch_size 196608

starting run 4 

2024-04-15 13:31:59.113945: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets

Epoch 14: ReduceLROnPlateau reducing learning rate to 0.0005764737026765942.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
clearing keras session and collecting garbage

 best loss 0.2369 of run 4 of super_epoch 1 with batch_size 196608

starting run 5 

2024-04-15 13:36:19.736903: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
clearing keras session and collecting garbage

 best loss 0.2388 of run 5 of super_epoch 1 with batch_size 19

2024-04-15 13:40:42.383974: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
clearing keras session and collecting garbage

 best loss 0.2389 of run 6 of super_epoch 1 with batch_size 19

2024-04-15 13:49:58.654030: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2390 of run 0 of super_epoch 1 with batch_size 26

2024-04-15 13:54:40.087699: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
clearing keras session and collecting garbage

 best loss 0.2388 of run 1 of super_epoch 1 with batch_size 262144

starting run 2 of super_epoch 1 with batch_size 262144
reset neural network weights


2024-04-15 13:59:27.418555: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
clearing keras session and collecting garbage

 best loss 0.2383 of run 2 of super_epoch 1 with batch_size 262144

starting run 3 of super_epoch 1 with batch_size 262144
reset neural network weights


2024-04-15 14:03:55.723564: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
clearing keras session and collecting garbage

 best loss 0.2386 of run 3 of super_epoch 1 with batch_size 262144

starting run 4 of super_epoch 1 with batch_size 262144
reset neural network weights


2024-04-15 14:08:17.446924: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
clearing keras session and collecting garbage

 best loss 0.2382 of run 4 of super_epoch 1 with batch_size 262144

starting run 5 of super_epoch 1 with batch_size 262144
reset neural network weights


2024-04-15 14:12:38.180454: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets

Epoch 14: ReduceLROnPlateau reducing learning rate to 0.0005764737026765942.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
clearing keras session and coll

2024-04-15 14:17:00.887887: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
clearing keras session and collecting garbage

 best loss 0.2385 of run 6 of super_epoch 1 with batch_size 262144

calculating stats for 7 models

finished 7 runs of batch_size 262144
in super epoch 1
with be

2024-04-15 14:25:48.540657: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_0/s-1_b-327680_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2387 of run 0 of super_epoch 1 with batch_size 32

2024-04-15 14:30:10.532826: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-327680_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-327680_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-327680_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-327680_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-327680_r-1.tf/assets

Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0005881192395463586.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-327680_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_1/s-1_b-327680_r-1.tf/assets
clearing keras session and collecting garbage

 best loss 0.2363 of run 1 of super_epoch 1 with batch_size 327680

starting run 2 of super_epoch 1 with batch_size 327680
reset neural network weights


2024-04-15 14:34:30.852097: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-327680_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-327680_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-327680_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-327680_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-327680_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-327680_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_2/s-1_b-327680_r-2.tf/assets
clearing keras session and collecting garbage

 best loss 0.2384 of run 2 of super_epoch 1 with batch_size 327680

starting run 3 of super_epoch 1 with batch_size 327680
reset neural network weights


2024-04-15 14:38:50.426098: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_3/s-1_b-327680_r-3.tf/assets
clearing keras session and collecting garbage

 best loss 0.2387 of run 3 of super_epoch 1 with batch_size 327680

starting run 4 of super_epoch 1 with batch_size 327680
reset neural network weights


2024-04-15 14:43:10.817630: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets

Epoch 14: ReduceLROnPlateau reducing learning rate to 0.0005764737026765942.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_4/s-1_b-327680_r-4.tf/assets
clearing keras session and collecting garbage

 best loss 0.2381 of run 4 of super_epoch 1 with batch_size 327680

starting run 5 

2024-04-15 14:47:31.062278: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-327680_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-327680_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-327680_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-327680_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-327680_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-327680_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_5/s-1_b-327680_r-5.tf/assets
clearing keras session and collecting garbage

 best loss 0.2384 of run 5 of super_epoch 1 with batch_size 327680

starting run 6 of super_epoch 1 with batch_size 327680
reset neural network weights


2024-04-15 14:51:46.579817: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_1/run_6/s-1_b-327680_r-6.tf/assets
clearing keras session and collecting garbage

 best loss 0.2389 of run 6 of super_epoch 1 with batch_size 32

In [29]:
K.clear_session() 
print(gc.collect()) # cpu gabage collection

model_list = []
chi2_list = []
# continue training loop with best model as starting point and smaller batch_ sizes and less repeats
model_list, chi2_list, _ = DCTR.train_loop(train_data, plt_data, model=best_model, lowest_chi2=lowest_chi2, batch_sizes=[16*8192, 24*8192, 32*8192], repeat=4, super_epochs=5,
                                           starting_super_epoch=6, train_dir = train_dir, epochs=15, learning_rate=0.0007)

if len(model_list) >= 1: # only if there was an improvement
    best_model_list.append(model_list)
    lowest_chi2_list.append(chi2_list)

    best_model = best_model_list[-1]
    lowest_chi2 = lowest_chi2_list[-1]


81651
starting super_epoch 6

starting training with batch_size: 131072 and 15 epochs
starting with weights from model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
starting run 0 of super_epoch 6 with batch_size 131072
loaded neural network model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets

Epoch 8: ReduceLROnPlateau reducing learning rate to 0.00019544490496627986.

Epoch 14: ReduceLROnPlateau reducing learning rate to 0.0001126688439399004.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.1867 of run 0 of super_epoch 6 with batch_size 131072

starting run 1 of super_epoch 6 with batch_size 131072
loaded neural network model: ./train_20240415/super_epoch_2

KeyboardInterrupt: 

In [41]:
# stopped block before early, and changed the starting super epoch to match the actually completed super epochs
    # stopped b/c results were not great after 7 super_epochs 
        # -> balanced high pt exp weights with regular event gen weights (begore regular weights were only 20% as important)

best_model = './train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf'
lowest_chi2 = 51.5207

K.clear_session() 
print(gc.collect()) # cpu gabage collection

model_list = []
chi2_list = []
# continue training loop with best model as starting point and smaller batch_sizes, less repeats and lower learning_rate
model_list, chi2_list, _ = DCTR.train_loop(train_data, plt_data, model=best_model, lowest_chi2=lowest_chi2, batch_sizes=[8*8192, 16*8192, 24*8192], repeat=3, super_epochs=5,
                                           starting_super_epoch=8, train_dir = train_dir, epochs=25, learning_rate=0.0005)


if len(model_list) >= 1:
    best_model_list.append(model_list)
    lowest_chi2_list.append(chi2_list)

    best_model = best_model_list[-1]
    lowest_chi2 = lowest_chi2_list[-1]



100
starting super_epoch 8

starting training with batch_size: 65536 and 25 epochs
starting with weights from model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
starting run 0 of super_epoch 8 with batch_size 65536
loaded neural network model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_8/run_0/s-8_b-65536_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0001935001986566931.

Epoch 21: ReduceLROnPlateau reducing learning rate to 0.00010505172831472009.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_8/run_0/s-8_b-65536_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2027 of run 0 of super_epoch 8 with batch_size 65536

starting run 1 of super_epoch 8 with batch_size 65536
loaded neural network model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_8/run_1/s-8_b-6

In [42]:
K.clear_session() 
print(gc.collect()) # cpu gabage collection

# finish training loop with best model as starting point and smaller batch_sizes and lower learning_rate and 10% dropout
model_list, chi2_list, loss_list = DCTR.train_loop(train_data, plt_data, model=best_model, lowest_chi2=lowest_chi2, batch_sizes=[8*8192, 16*8192], repeat=2, super_epochs=10, 
                                                   starting_super_epoch=13, train_dir = train_dir, epochs=25, learning_rate=0.0003, dropout=0.1)


if len(model_list) >= 1:
    best_model_list.append(model_list)
    lowest_chi2_list.append(chi2_list)

best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


111668
starting super_epoch 13

starting training with batch_size: 65536 and 25 epochs
starting with weights from model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
starting run 0 of super_epoch 13 with batch_size 65536
loaded neural network model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_13/run_0/s-13_b-65536_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0001935001986566931.

Epoch 21: ReduceLROnPlateau reducing learning rate to 0.00010505172831472009.
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_13/run_0/s-13_b-65536_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2022 of run 0 of super_epoch 13 with batch_size 65536

starting run 1 of super_epoch 13 with batch_size 65536
loaded neural network model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
INFO:tensorflow:Assets written to: ./train_20240415/super_epoch_13/r

KeyboardInterrupt: 

In [None]:
# training above was leading nowhere
# restarted training with '../best_model.tf' as starting point 


In [43]:
train_dir = './train_20240416' # where to save models during training
# using previous training as starting point
best_model = '../best_model.tf'
lowest_chi2 = 51.5207

K.clear_session() 
print(gc.collect()) # cpu gabage collection

# finish training loop with best model as starting point and smaller batch_sizes and lower learning_rate and 10% dropout
model_list, chi2_list, loss_list = DCTR.train_loop(train_data, plt_data, model=best_model, lowest_chi2=lowest_chi2, batch_sizes=[8*8192, 16*8192], repeat=5, super_epochs=10, 
                                                   starting_super_epoch=1, train_dir = train_dir, epochs=20, learning_rate=0.0003, dropout=0.15)


if len(model_list) >= 1:
    best_model_list.append(model_list)
    lowest_chi2_list.append(chi2_list)

best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]

# best chi2: 25.1403
    # ./train_20240416/super_epoch_1/run_2/s-1_b-65536_r-2.tf


140545
starting super_epoch 1

starting training with batch_size: 65536 and 20 epochs
starting with weights from model: ../best_model.tf
starting run 0 of super_epoch 1 with batch_size 65536
loaded neural network model: ../best_model.tf
INFO:tensorflow:Assets written to: ./train_20240416/super_epoch_1/run_0/s-1_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240416/super_epoch_1/run_0/s-1_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240416/super_epoch_1/run_0/s-1_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240416/super_epoch_1/run_0/s-1_b-65536_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 9.490818047197535e-06.

Epoch 19: ReduceLROnPlateau reducing learning rate to 5.256677286524791e-06.
INFO:tensorflow:Assets written to: ./train_20240416/super_epoch_1/run_0/s-1_b-65536_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2366 of run 0 of super_epoch 1 with batch_size 65536

sta

KeyboardInterrupt: 

In [56]:
# again with much more important regular events (non exp weights; from 0.2x to 1x and now to 10x)
    # since the above training was not working well enough for the bulk

train_dir = './train_20240416_b' # where to save models during training
# using previous training as starting point
best_model = '../best_model.tf'
lowest_chi2 = 6 # only use new models when significant progress is made | Base model has chi2 ~2; first epoch above had 25; which is not good enough to base subsequent runs on

K.clear_session() 
print(gc.collect()) # cpu gabage collection

# finish training loop with best model as starting point and smaller batch_sizes and lower learning_rate and 10% dropout
model_list, chi2_list, loss_list = DCTR.train_loop(train_data, plt_data, model=best_model, lowest_chi2=lowest_chi2, batch_sizes=[8*8192, 16*8192], repeat=8, super_epochs=5, 
                                                   starting_super_epoch=1, train_dir = train_dir, epochs=20, learning_rate=0.0002, dropout=0.33333) # high dropout | maybe it will help against overfitting high pT


if len(model_list) >= 1:
    best_model_list.append(model_list)
    lowest_chi2_list.append(chi2_list)

best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


31244
starting super_epoch 1

starting training with batch_size: 65536 and 20 epochs
starting with weights from model: ../best_model.tf
starting run 0 of super_epoch 1 with batch_size 65536
loaded neural network model: ../best_model.tf
INFO:tensorflow:Assets written to: ./train_20240416_b/super_epoch_1/run_0/s-1_b-65536_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 9.586202577338553e-06.

Epoch 17: ReduceLROnPlateau reducing learning rate to 5.3628693422069775e-06.
INFO:tensorflow:Assets written to: ./train_20240416_b/super_epoch_1/run_0/s-1_b-65536_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2374 of run 0 of super_epoch 1 with batch_size 65536

starting run 1 of super_epoch 1 with batch_size 65536
loaded neural network model: ../best_model.tf
INFO:tensorflow:Assets written to: ./train_20240416_b/super_epoch_1/run_1/s-1_b-65536_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240416_b/super_epoch_1/run_1/s-1_b-65536_r-1.tf/a

In [57]:
K.clear_session() 
print(gc.collect()) # cpu gabage collection

# finish training loop with best model as starting point and smaller batch_sizes and lower learning_rate and 10% dropout
model_list, chi2_list, loss_list = DCTR.train_loop(train_data, plt_data, model=best_model, lowest_chi2=lowest_chi2, batch_sizes=[8*8192, 16*8192], repeat=8, super_epochs=5, 
                                                   starting_super_epoch=6, train_dir = train_dir, epochs=20, learning_rate=0.0001, dropout=0.05) # low dropout


if len(model_list) >= 1:
    best_model_list.append(model_list)
    lowest_chi2_list.append(chi2_list)

best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


81651
starting super_epoch 6

starting training with batch_size: 65536 and 20 epochs
starting with weights from model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
starting run 0 of super_epoch 6 with batch_size 65536
loaded neural network model: ./train_20240415/super_epoch_2/run_4/s-2_b-262144_r-4.tf
INFO:tensorflow:Assets written to: ./train_20240416_b/super_epoch_6/run_0/s-6_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240416_b/super_epoch_6/run_0/s-6_b-65536_r-0.tf/assets

Epoch 10: ReduceLROnPlateau reducing learning rate to 0.00019544490496627986.

Epoch 18: ReduceLROnPlateau reducing learning rate to 0.00010825103963725269.
INFO:tensorflow:Assets written to: ./train_20240416_b/super_epoch_6/run_0/s-6_b-65536_r-0.tf/assets
clearing keras session and collecting garbage

 best loss 0.2325 of run 0 of super_epoch 6 with batch_size 65536

starting run 1 of super_epoch 6 with batch_size 65536
loaded neural network model: ./train_20240415/super_epoch_

KeyboardInterrupt: 