In [1]:
# import system modules
import sys
import os
# Write 0 into the numa_node entry of every PCI device to get rid of NUMA node
# warnings in my docker: https://github.com/tensorflow/tensorflow/issues/42738
# This runs before `import tensorflow` below. The exit status of the shell
# command is not checked, so a failure here goes unnoticed.
os.system('for a in /sys/bus/pci/devices/*; do echo 0 | tee -a $a/numa_node>/dev/null; done')
import gc

# import standard numerical modules
import numpy as np
import math

# import machine learning modules
import tensorflow as tf
import keras.backend as K

# make sure GPU usage is enabled: the printed list should contain at least one GPU device
gpu = tf.config.list_physical_devices('GPU')
print(gpu)

# extend the module search path so the project-local DCTR module can be imported
# (presumably DCTR.py lives in ../20240503 -- adjust if the module is moved)
sys.path.append('../20240503')
import DCTR


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# load data

In [2]:
# directory with pre-converted LHE files stored as numpy arrays (.npy);
# the path is relative to the notebook's working directory
data_dir = '../Data' # modify as needed


In [3]:
# Load POWHEG hvq x0 datasets
# x0_nrm for training, x0_plt and x0_plt_nrm for calculating stats used to decide which model performs best
# only contain tt-pair; every event has order:
    # tt-pair, top, anti-top
# every particle has arguments:
    # [pt, y, phi, mass, eta, E, PID, w, theta]
    # [0 , 1, 2  , 3   , 4  , 5, 6  , 7, 8    ]

# number of NNLO (MiNNLO) samples; every POWHEG array is truncated to this length.
# Kept in one place so the three slices below cannot drift apart.
n_nnlo_samples = 9543943

# NOTE: each array name is first rebound to an empty list before loading --
# presumably so that a previously loaded array is released before the new one
# is read when this cell is re-run (limits peak memory). Kept on purpose.

# POWHEG hvq training data
x0_nrm = []
x0_nrm = np.load(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_01.npy')[:n_nnlo_samples]
print(f'POWHEG hvq x0_nrm.shape:     {x0_nrm.shape}')

# plotting data; different from training data; for calculating stats
x0_plt = []
x0_plt = np.load(f'{data_dir}/POWHEG_hvq/showered/converted_lhe_02.npy')[:n_nnlo_samples]
print(f'POWHEG hvq x0_plt.shape:     {x0_plt.shape}')

# normalized counterpart of the plotting data (same file index 02)
x0_plt_nrm = []
x0_plt_nrm = np.load(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_02.npy')[:n_nnlo_samples]
print(f'POWHEG hvq x0_plt_nrm.shape: {x0_plt_nrm.shape}')



POWHEG hvq x0_nrm.shape:     (9543943, 3, 9)
POWHEG hvq x0_plt.shape:     (9543943, 3, 9)
POWHEG hvq x0_plt_nrm.shape: (9543943, 3, 9)


In [6]:
# MiNNLO x1
# Both arrays are loaded in full (no truncation).
# NOTE: each name is first rebound to an empty list -- presumably to release a
# previously loaded array before the new one is read when the cell is re-run.

# training data
x1_nrm = []
x1_nrm = np.load(f'{data_dir}/MiNNLO/showered/normed_lhe.npy')
print(f'MiNNLO all particles x1_nrm.shape: {x1_nrm.shape}')

# plotting data
x1_plt = []
x1_plt = np.load(f'{data_dir}/MiNNLO/showered/converted_lhe.npy')
print(f'MiNNLO all particles x1_plt.shape: {x1_plt.shape}')


MiNNLO all particles x1_nrm.shape: (9543943, 3, 9)
MiNNLO all particles x1_plt.shape: (9543943, 3, 9)


In [7]:
# get normalized event generator weights | all weights = +/-1
# The weight is read from the first entry of every event (index 0 on axis 1),
# column 7 of the last axis; .copy() makes each result independent of the
# array it was sliced from.
x0_wgt, x0_plt_wgt, x1_wgt = (
    events[:, 0, 7].copy() for events in (x0_nrm, x0_plt_nrm, x1_nrm)
)


In [8]:
# check sum of wgt for both classes
# (for weights of +/-1 the sum equals the number of positive-weight events
#  minus the number of negative-weight events)
print(f'x0 wgt sum: {np.sum(x0_wgt)}')
print(f'x1 wgt sum: {np.sum(x1_wgt)}')


x0 wgt sum: 9466735.0
x1 wgt sum: 7685031.0


In [9]:
# delete eta (pseudorapidity, column 4) and energy (column 5) from the last axis
# remaining columns: [pt, y, phi, mass, PID, w, theta]
# training is meant to use only [pt, y, phi, m, PID] -- presumably the trailing
# w and theta columns are not fed to the network (train_loop documents
# input_dim=5 as its default); confirm in DCTR.
#
# Both columns are removed in a single np.delete call per array: np.delete
# returns a new array, so one call avoids copying each large array twice.
#
# NOTE: this cell is not idempotent -- the arrays are rebound in place, so
# running it a second time removes two further columns. Reload the data first.
x0_nrm = np.delete(x0_nrm, [4, 5], axis=-1)
x0_plt_nrm = np.delete(x0_plt_nrm, [4, 5], axis=-1)
x1_nrm = np.delete(x1_nrm, [4, 5], axis=-1)


In [10]:
# prep arrays for training
# val=0.25 is presumably the validation fraction -- confirm in DCTR.prep_arrays
x_train, x_val, y_train, y_val, wgt_train, wgt_val = DCTR.prep_arrays(x0_nrm, x1_nrm, val=0.25)

# bring into shape for training loop
# NOTE: the tuple order (x_train, y_train, x_val, y_val, ...) differs from the
# order in which prep_arrays returned the arrays above (x_train, x_val, ...)
train_data = (x_train, y_train, x_val, y_val, wgt_train, wgt_val)
# inputs for the plotting/stat evaluation: x0_plt (from converted_lhe_02),
# x0_plt_nrm (from normed_lhe_02, eta/E columns removed), x1_plt and the x1 weights
plt_data = (x0_plt , x0_plt_nrm, x1_plt, x1_wgt)


2024-05-22 06:50:04.859586: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-22 06:50:05.433739: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


In [11]:
# reset the Keras backend session state before starting the training loop
K.clear_session() 
# gc.collect() returns the number of unreachable objects it found; printed for information
print(gc.collect()) # cpu garbage collection to free up memory from discarded temp arrays


500


# training loop

In [12]:
# output directory for the training loop (relative to the notebook's working directory)
train_dir = './train_20240522' # where to save models during training


In [13]:
# start training loop
# The bare string below is only a reference note listing the arguments of
# DCTR.train_loop() as written down by the author; it has no effect at runtime
# and may lag behind the actual signature in DCTR.
''' train_loop() necessary arguments
train_data, plt_data

default arguments:
model=None, lowest_chi2 = 1e6, train_dir = '/tf/home/gdrive/_STUDIUM_/DCTR_Paper/train',
batch_sizes=[4*8192, 8*8192, 16*8192, 32*8192], repeat=5, super_epochs=35, super_patience = 5, epochs = 8, starting_super_epoch = 1, 
input_dim=5, Phi_sizes = (100,100,128), F_sizes = (128,100,100), loss = 'mse', dropout=0.0, l2_reg=0.0, 
Phi_acts=('linear', 'gelu', 'gelu'), F_acts=('gelu', 'gelu', 'linear'), output_act='sigmoid', learning_rate=0.001

returns: best_model_list, lowest_chi2_list, lowest_loss_list
'''
# first stage: starts without a pre-trained model (model is left at its default);
# 5 super epochs, 5 runs per batch size (24*8192 = 196608 and 32*8192 = 262144),
# 10 epochs per run; models are saved below train_dir
best_model_list, lowest_chi2_list, lowest_loss_list = DCTR.train_loop(train_data, plt_data, batch_sizes=[24*8192, 32*8192], repeat=5, super_epochs=5, 
                                                                      train_dir = train_dir, epochs=10, learning_rate=0.001)


# the last entry of each list is taken as the current best model and its chi2;
# they seed the follow-up training stage
best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


starting super_epoch 1

starting training with batch_size: 196608 and 10 epochs
starting with weights from model: None
starting run 0 of super_epoch 1 with batch_size 196608
reset neural network weights


2024-05-22 06:50:06.018003: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5
2024-05-22 06:50:16.668161: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tenso

2024-05-22 06:51:41.825952: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tenso

2024-05-22 06:53:16.567336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tenso

2024-05-22 06:54:51.344368: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tenso

2024-05-22 06:56:26.568180: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tenso

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 196608
in super epoch 1
with best model ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf
with chi2 7.7983 and loss 0.2140
starting training with batch_size: 262144 and 10 epochs
starting with weights from model: None
starting run 0 of super_epoch 1 with batch_size 262144
reset neural network weights


2024-05-22 07:01:29.196050: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tenso

2024-05-22 07:03:04.022086: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tenso

2024-05-22 07:04:38.958116: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tenso

2024-05-22 07:06:13.919198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tenso

2024-05-22 07:07:48.317463: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6262 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tenso

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 262144
in super epoch 1
with best model ./train_20240522/super_epoch_1/run_0/s-1_b-262144_r-0.tf
with chi2 12.5142 and loss 0.2143


finished super_epoch 1 with 5 runs each with batch_sizes:[196608, 262144]
best model./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tfwith chi2 7.7983 and loss 0.2140
starting super_epoch 2

starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf
starting run 0 of super_epoch 2 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/su

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 196608
in super epoch 2
with best model ./train_20240522/super_epoch_2/run_1/s-2_b-196608_r-1.tf
with chi2 4.2765 and loss 0.2135
starting training with batch_size: 262144 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf
starting run 0 of super_epoch 2 with batch_size 262144
loaded neural network model: ./train_20240522/super_epoch_1/run_2/s-1_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_2/run_0/s-2_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/supe

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 262144
in super epoch 2
with best model ./train_20240522/super_epoch_2/run_3/s-2_b-262144_r-3.tf
with chi2 5.4171 and loss 0.2135


finished super_epoch 2 with 5 runs each with batch_sizes:[196608, 262144]
best model./train_20240522/super_epoch_2/run_1/s-2_b-196608_r-1.tfwith chi2 4.2765 and loss 0.2135
starting super_epoch 3

starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_2/run_1/s-2_b-196608_r-1.tf
starting run 0 of super_epoch 3 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_2/run_1/s-2_b-196608_r-1.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_3/run_0/s-3_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_3/run_0/s-3_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_3/run_0/s-3_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/sup

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 196608
in super epoch 3
with best model ./train_20240522/super_epoch_3/run_2/s-3_b-196608_r-2.tf
with chi2 3.6620 and loss 0.2134
starting training with batch_size: 262144 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_2/run_1/s-2_b-196608_r-1.tf
starting run 0 of super_epoch 3 with batch_size 262144
loaded neural network model: ./train_20240522/super_epoch_2/run_1/s-2_b-196608_r-1.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_3/run_0/s-3_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_3/run_0/s-3_b-262144_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0003600000170990825.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_3/run_0/s-3_b-262144_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 0.00021600000327453016.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_3/run_0/s-3_b-262144_r-0.tf/assets


  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 262144
in super epoch 3
with best model ./train_20240522/super_epoch_3/run_0/s-3_b-262144_r-0.tf
with chi2 3.7954 and loss 0.2134


finished super_epoch 3 with 5 runs each with batch_sizes:[196608, 262144]
best model./train_20240522/super_epoch_3/run_2/s-3_b-196608_r-2.tfwith chi2 3.6620 and loss 0.2134
starting super_epoch 4

starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_3/run_2/s-3_b-196608_r-2.tf
starting run 0 of super_epoch 4 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_3/run_2/s-3_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_4/run_0/s-4_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_4/run_0/s-4_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_4/run_0/s-4_b-196608_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 0

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 196608
in super epoch 4
with best model ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
with chi2 3.2325 and loss 0.2134
starting training with batch_size: 262144 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_3/run_2/s-3_b-196608_r-2.tf
starting run 0 of super_epoch 4 with batch_size 262144
loaded neural network model: ./train_20240522/super_epoch_3/run_2/s-3_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_4/run_0/s-4_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_4/run_0/s-4_b-262144_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00012960000021848827.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_4/run_0/s-4_b-262144_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 7.775999838486313e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_4/run_0/s-4_b-262144_r-0.tf/assets


  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 262144
in super epoch 4
with best model ./train_20240522/super_epoch_4/run_2/s-4_b-262144_r-2.tf
with chi2 3.7753 and loss 0.2134


finished super_epoch 4 with 5 runs each with batch_sizes:[196608, 262144]
best model./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tfwith chi2 3.2325 and loss 0.2134
starting super_epoch 5

starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
starting run 0 of super_epoch 5 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_5/run_0/s-5_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_5/run_0/s-5_b-196608_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 4.6655999904032795e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_5/run_0/s-5_

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 196608
in super epoch 5
with best model ./train_20240522/super_epoch_5/run_2/s-5_b-196608_r-2.tf
with chi2 4.0507 and loss 0.2133
starting training with batch_size: 262144 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
starting run 0 of super_epoch 5 with batch_size 262144
loaded neural network model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_5/run_0/s-5_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_5/run_0/s-5_b-262144_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 4.6655999904032795e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_5/run_0/s-5_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_5/run_0/s-5_b-262144_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 2.799360081553459e-05.


  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 5 runs of batch_size 262144
in super epoch 5
with best model ./train_20240522/super_epoch_5/run_3/s-5_b-262144_r-3.tf
with chi2 3.5505 and loss 0.2133
no improvement, lowering learnng_rate to 0.001


finished super_epoch 5 with 5 runs each with batch_sizes:[196608, 262144]
best model./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tfwith chi2 3.2325 and loss 0.2134



finished loop of 5 super_epochs
with batch_sizes:[196608, 262144]
best model./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
with chi2 3.2325 and loss 0.2134


In [14]:
# reset the Keras backend session state and free CPU memory before the next stage
K.clear_session()
print(gc.collect()) # cpu garbage collection

# continue training loop with best model as starting point, smaller batch sizes and fewer repeats
# (16*8192 = 131072 and 24*8192 = 196608; 4 runs per batch size, 4 super epochs,
#  numbering continues at super epoch 6)
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data,
    plt_data,
    model=best_model,
    lowest_chi2=lowest_chi2,
    batch_sizes=[16*8192, 24*8192],
    repeat=4,
    super_epochs=4,
    starting_super_epoch=6,
    train_dir=train_dir,
    epochs=10,
    learning_rate=0.001,
)

# Append this stage's results to the running history. All three parallel lists
# are extended together: previously the loss history was left out, so
# lowest_loss_list fell out of step with best_model_list / lowest_chi2_list.
if len(model_list) >= 1:
    best_model_list.extend(model_list)
    lowest_chi2_list.extend(chi2_list)
    lowest_loss_list.extend(loss_list)

# the last entry of each list is taken as the current best model and its chi2
# NOTE: re-running this cell appends the stage's results to the history lists again
best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


54808
starting super_epoch 6

starting training with batch_size: 131072 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
starting run 0 of super_epoch 6 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 4.6655999904032795e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 2.799360081553459e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written t

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 131072
in super epoch 6
with best model ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
with chi2 3.1725 and loss 0.2133
starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
starting run 0 of super_epoch 6 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_4/run_2/s-4_b-196608_r-2.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-196608_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 4.6655999904032795e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-196608_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 2.799360081553459e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_6/run_0/s-6_b-196608_r-0.tf/assets


  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 196608
in super epoch 6
with best model ./train_20240522/super_epoch_6/run_1/s-6_b-196608_r-1.tf
with chi2 3.9335 and loss 0.2133


finished super_epoch 6 with 4 runs each with batch_sizes:[131072, 196608]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 0.2133
starting super_epoch 7

starting training with batch_size: 131072 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 7 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_7/run_0/s-7_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_7/run_0/s-7_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_7/run_0/s-7_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/sup

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 131072
in super epoch 7
with best model ./train_20240522/super_epoch_7/run_0/s-7_b-131072_r-0.tf
with chi2 3.9223 and loss 0.2133
starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 7 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_7/run_0/s-7_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_7/run_0/s-7_b-196608_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_7/run_0/s-7_b-196608_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.007769642455969e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_7/run_0/s-7_b-196608_r-0.tf/assets


  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 196608
in super epoch 7
with best model ./train_20240522/super_epoch_7/run_2/s-7_b-196608_r-2.tf
with chi2 3.5232 and loss 0.2133
no improvement, lowering learnng_rate to 0.001


finished super_epoch 7 with 4 runs each with batch_sizes:[131072, 196608]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 0.2133
starting super_epoch 8

starting training with batch_size: 131072 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 8 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_8/run_0/s-8_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_8/run_0/s-8_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_8/run_0/s-8_b-131072_r-0.tf/assets

Epoch 5

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 131072
in super epoch 8
with best model ./train_20240522/super_epoch_8/run_1/s-8_b-131072_r-1.tf
with chi2 4.0800 and loss 0.2133
starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 8 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_8/run_0/s-8_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_8/run_0/s-8_b-196608_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_8/run_0/s-8_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_8/run_0/s-8_b-196608_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.007769642455969e-05.


  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 196608
in super epoch 8
with best model ./train_20240522/super_epoch_8/run_2/s-8_b-196608_r-2.tf
with chi2 4.2036 and loss 0.2133
no improvement, lowering learnng_rate to 0.001


finished super_epoch 8 with 4 runs each with batch_sizes:[131072, 196608]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 0.2133
starting super_epoch 9

starting training with batch_size: 131072 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 9 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_9/run_0/s-9_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_9/run_0/s-9_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_9/run_0/s-9_b-131072_r-0.tf/assets

Epoch 5

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 131072
in super epoch 9
with best model ./train_20240522/super_epoch_9/run_2/s-9_b-131072_r-2.tf
with chi2 3.9803 and loss 0.2133
starting training with batch_size: 196608 and 10 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 9 with batch_size 196608
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_9/run_0/s-9_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_9/run_0/s-9_b-196608_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_9/run_0/s-9_b-196608_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.007769642455969e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_9/run_0/s-9_b-196608_r-0.tf/assets


  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 4 runs of batch_size 196608
in super epoch 9
with best model ./train_20240522/super_epoch_9/run_0/s-9_b-196608_r-0.tf
with chi2 3.9699 and loss 0.2133
no improvement, lowering learnng_rate to 0.001


finished super_epoch 9 with 4 runs each with batch_sizes:[131072, 196608]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 0.2133



finished loop of 4 super_epochs
with batch_sizes:[131072, 196608]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
with chi2 3.1725 and loss 0.2133


In [15]:
# Drop the Keras session state left over from the previous loop and run the
# CPU garbage collector; the printed number is the return value of gc.collect().
K.clear_session()
print(gc.collect())

# Resume the training loop from the current best model, this time with smaller
# batch sizes, fewer repeats, a lower learning rate and more (regular) epochs.
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data,
    plt_data,
    model=best_model,
    lowest_chi2=lowest_chi2,
    batch_sizes=[8*8192, 16*8192],  # 65536 and 131072
    repeat=3,
    super_epochs=4,
    starting_super_epoch=10,
    train_dir=train_dir,
    epochs=25,
    learning_rate=0.0005,
)

# Only extend the running history when this loop returned at least one model.
if len(model_list) > 0:
    best_model_list.extend(model_list)
    lowest_chi2_list.extend(chi2_list)

# The newest history entry becomes the starting point for the next loop
# (presumably the lists are ordered so the last entry is the best -- TODO confirm
# against DCTR.train_loop).
best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


138512
starting super_epoch 10

starting training with batch_size: 65536 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 10 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 1.662903669057414e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets

Ep

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 65536
in super epoch 10
with best model ./train_20240522/super_epoch_10/run_2/s-10_b-65536_r-2.tf
with chi2 3.7596 and loss 0.2133
starting training with batch_size: 131072 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 10 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-131072_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 1.662903669057414e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_10/run_0/s-10_b-131072_r-0.tf/assets

Epoch 21: ReduceLROnPlateau reducing learning rate to 9.0279465439

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 131072
in super epoch 10
with best model ./train_20240522/super_epoch_10/run_0/s-10_b-131072_r-0.tf
with chi2 4.0767 and loss 0.2133
no improvement, lowering learnng_rate to 0.0005


finished super_epoch 10 with 3 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000
starting super_epoch 11

starting training with batch_size: 65536 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 11 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-65536_r-0.tf/assets



  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 65536
in super epoch 11
with best model ./train_20240522/super_epoch_11/run_0/s-11_b-65536_r-0.tf
with chi2 4.0160 and loss 0.2133
starting training with batch_size: 131072 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 11 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_11/run_0/s-11_b-131072_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learni

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 131072
in super epoch 11
with best model ./train_20240522/super_epoch_11/run_1/s-11_b-131072_r-1.tf
with chi2 4.2644 and loss 0.2133
no improvement, lowering learnng_rate to 0.0005


finished super_epoch 11 with 3 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000
starting super_epoch 12

starting training with batch_size: 65536 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 12 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_12/run_0/s-12_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_12/run_0/s-12_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_12/run_0/s-12_b-65536_r-0.tf/assets
I

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 65536
in super epoch 12
with best model ./train_20240522/super_epoch_12/run_1/s-12_b-65536_r-1.tf
with chi2 3.4675 and loss 0.2133
starting training with batch_size: 131072 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 12 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_12/run_0/s-12_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_12/run_0/s-12_b-131072_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 1.662903669057414e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_12/run_0/s-12_b-131072_r-0.tf/assets

Epoch 21: ReduceLROnPlateau reducing learning rate to 9.027946543938014e-06.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_12/run_0/s-12_b-131072_r-0

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 131072
in super epoch 12
with best model ./train_20240522/super_epoch_12/run_0/s-12_b-131072_r-0.tf
with chi2 4.2350 and loss 0.2133
no improvement, lowering learnng_rate to 0.0005


finished super_epoch 12 with 3 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000
starting super_epoch 13

starting training with batch_size: 65536 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 13 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_13/run_0/s-13_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_13/run_0/s-13_b-65536_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 1.662903669057414e-05.
INFO:tensorflow:Assets 

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 65536
in super epoch 13
with best model ./train_20240522/super_epoch_13/run_2/s-13_b-65536_r-2.tf
with chi2 3.5648 and loss 0.2133
starting training with batch_size: 131072 and 25 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 13 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_13/run_0/s-13_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_13/run_0/s-13_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_13/run_0/s-13_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_13/run_0/s-13_b-131072_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 1.662903669057414e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_13/

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 3 runs of batch_size 131072
in super epoch 13
with best model ./train_20240522/super_epoch_13/run_2/s-13_b-131072_r-2.tf
with chi2 4.2437 and loss 0.2133
no improvement, lowering learnng_rate to 0.00035


finished super_epoch 13 with 3 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000



finished loop of 4 super_epochs
with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
with chi2 3.1725 and loss 1.0000


In [16]:
# Drop the Keras session state left over from the previous loop and run the
# CPU garbage collector; the printed number is the return value of gc.collect().
K.clear_session()
print(gc.collect())

# Resume the training loop from the current best model, now with a small dropout.
# NOTE(review): the previous loop's log ends at super_epoch 13, so starting at 15
# leaves index 14 unused -- confirm this gap is intentional.
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data,
    plt_data,
    model=best_model,
    lowest_chi2=lowest_chi2,
    batch_sizes=[8*8192, 16*8192],  # 65536 and 131072
    repeat=6,
    super_epochs=4,
    starting_super_epoch=15,
    train_dir=train_dir,
    epochs=20,
    learning_rate=0.0002,
    dropout=0.1,
)

# Only extend the running history when this loop returned at least one model.
if len(model_list) > 0:
    best_model_list.extend(model_list)
    lowest_chi2_list.extend(chi2_list)

# The newest history entry is what the following cells report as the best model
# (presumably the lists are ordered so the last entry is the best -- TODO confirm
# against DCTR.train_loop).
best_model = best_model_list[-1]
lowest_chi2 = lowest_chi2_list[-1]


111568
starting super_epoch 15

starting training with batch_size: 65536 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 15 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-65536_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-65536_r-0.tf/assets

Ep

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 65536
in super epoch 15
with best model ./train_20240522/super_epoch_15/run_3/s-15_b-65536_r-3.tf
with chi2 3.8040 and loss 0.2133
starting training with batch_size: 131072 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 15 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-131072_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/run_0/s-15_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_15/

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 131072
in super epoch 15
with best model ./train_20240522/super_epoch_15/run_1/s-15_b-131072_r-1.tf
with chi2 3.9705 and loss 0.2133
no improvement, lowering learnng_rate to 0.0002


finished super_epoch 15 with 6 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000
starting super_epoch 16

starting training with batch_size: 65536 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 16 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/run_0/s-16_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/run_0/s-16_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/run_0/s-16_b-65536_r-0.tf/assets



  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 65536
in super epoch 16
with best model ./train_20240522/super_epoch_16/run_3/s-16_b-65536_r-3.tf
with chi2 3.5293 and loss 0.2133
starting training with batch_size: 131072 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 16 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/run_0/s-16_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/run_0/s-16_b-131072_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/run_0/s-16_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/run_0/s-16_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_16/

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 131072
in super epoch 16
with best model ./train_20240522/super_epoch_16/run_1/s-16_b-131072_r-1.tf
with chi2 4.1044 and loss 0.2133
no improvement, lowering learnng_rate to 0.0002


finished super_epoch 16 with 6 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000
starting super_epoch 17

starting training with batch_size: 65536 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 17 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_17/run_0/s-17_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_17/run_0/s-17_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_17/run_0/s-17_b-65536_r-0.tf/assets
I

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 65536
in super epoch 17
with best model ./train_20240522/super_epoch_17/run_4/s-17_b-65536_r-4.tf
with chi2 4.1899 and loss 0.2133
starting training with batch_size: 131072 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 17 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_17/run_0/s-17_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_17/run_0/s-17_b-131072_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_17/run_0/s-17_b-131072_r-0.tf/assets

Epoch 17: ReduceLROnPlateau reducing learning rate to 9.396383029525169e-06.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_17/run_0/s-17_b-131072_r-0

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 131072
in super epoch 17
with best model ./train_20240522/super_epoch_17/run_5/s-17_b-131072_r-5.tf
with chi2 3.6856 and loss 0.2133
no improvement, lowering learnng_rate to 0.0002


finished super_epoch 17 with 6 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000
starting super_epoch 18

starting training with batch_size: 65536 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 18 with batch_size 65536
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_18/run_0/s-18_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_18/run_0/s-18_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_18/run_0/s-18_b-65536_r-0.tf/assets
I

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 65536
in super epoch 18
with best model ./train_20240522/super_epoch_18/run_5/s-18_b-65536_r-5.tf
with chi2 4.2101 and loss 0.2133
starting training with batch_size: 131072 and 20 epochs
starting with weights from model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
starting run 0 of super_epoch 18 with batch_size 131072
loaded neural network model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_18/run_0/s-18_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_18/run_0/s-18_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_18/run_0/s-18_b-131072_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets written to: ./train_20240522/super_epoch_18/run_0/s-18_b-131072_r-0.tf/assets

Epoch 17: ReduceLROnPlateau reducing learning rate to 9.3963830295

  chi2 = np.nansum(np.power(n_list[0] - n, 2)/(np.power(uncert, 2) + np.power(uncert_list[0], 2)))



finished 6 runs of batch_size 131072
in super epoch 18
with best model ./train_20240522/super_epoch_18/run_4/s-18_b-131072_r-4.tf
with chi2 4.1582 and loss 0.2133
no improvement, lowering learnng_rate to 0.00014


finished super_epoch 18 with 6 runs each with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tfwith chi2 3.1725 and loss 1.0000



finished loop of 4 super_epochs
with batch_sizes:[65536, 131072]
best model./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
with chi2 3.1725 and loss 1.0000


In [17]:
# Report the model path and chi2 value currently held in best_model / lowest_chi2.
print(
    f'best model: {best_model}',
    f'with mean red chi2 of: {lowest_chi2}',
    sep='\n',
)


best model: ./train_20240522/super_epoch_6/run_0/s-6_b-131072_r-0.tf
with mean red chi2 of: 3.172525910477412
