In [1]:
# import system modules
import sys
import os
# Workaround for TensorFlow's NUMA-node warnings: the shell loop writes 0 into the
# numa_node file of every entry below /sys/bus/pci/devices.
# NOTE(review): presumably requires write access to /sys (root) - confirm. The exit status
# returned by os.system is discarded here, so a failed write goes unnoticed.
os.system('for a in /sys/bus/pci/devices/*; do echo 0 | tee -a $a/numa_node>/dev/null; done') # get rid of NUMA node warnings: https://github.com/tensorflow/tensorflow/issues/42738
# os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
# print(os.getenv('TF_GPU_ALLOCATOR'))
import gc

# import standard numerical modules
import numpy as np
import math

# from numba import cuda # for memory management
# device = cuda.get_current_device() # for clearing memory
# print(device)

# import machine learning modules
import tensorflow as tf
import keras.backend as K

# make sure GPU usage is enabled and cap TensorFlow at int(7.5*1024) = 7680 MB on the first GPU
gpu = tf.config.list_physical_devices('GPU')
if gpu:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpu[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=int(7.5*1024))],
        )
    except RuntimeError as err:
        # TensorFlow refuses to reconfigure virtual devices once the GPU has been initialised,
        # e.g. when this cell is re-run in a live kernel; keep the existing configuration
        print(f'could not change the GPU memory limit: {err}')
else:
    # previously gpu[0] raised IndexError here, hiding the actual problem
    print('no GPU visible to TensorFlow - check driver / CUDA setup')
print(gpu)

# import DCTR
# path_to_DCTR = './' # uncomment and modify if needed
import DCTR


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# load data

In [2]:
# location of the LHE files that were already converted to numpy arrays (adjust to your setup)
data_dir = "./Data"


In [3]:
# POWHEG hvq (x0) datasets
#   x0_nrm              : used for training
#   x0_plt, x0_plt_nrm  : used for calculating the stats that decide which model performs best
# The files only contain the tt-pair; within every event the order is
#     tt-pair, top, anti-top
# and every particle carries the arguments
#     index | 0   1  2    3     4    5  6    7  8
#     field | pt  y  phi  mass  eta  E  PID  w  theta

# training data: keep at most the first 10 million events
x0_nrm = []  # rebinding before the load presumably releases a previously loaded array - TODO confirm
x0_nrm = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/01-02_normed_converted_lhe.npz', i=3)
x0_nrm = x0_nrm[:10_000_000]
print(f'POWHEG hvq x0_nrm.shape:     {x0_nrm.shape}')

# plotting data (different files than the training data): keep at most the first 20 million events
x0_plt = []
x0_plt = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/03-04_converted_lhe.npz', i=3)
x0_plt = x0_plt[:20_000_000]
print(f'POWHEG hvq x0_plt.shape:     {x0_plt.shape}')

# normalised version of the plotting data, same event cut
x0_plt_nrm = []
x0_plt_nrm = DCTR.load_dataset(f'{data_dir}/POWHEG_hvq/13TeV/03-04_normed_converted_lhe.npz', i=3)
x0_plt_nrm = x0_plt_nrm[:20_000_000]
print(f'POWHEG hvq x0_plt_nrm.shape: {x0_plt_nrm.shape}')


POWHEG hvq x0_nrm.shape:     (10000000, 3, 9)
POWHEG hvq x0_plt.shape:     (20000000, 3, 9)
POWHEG hvq x0_plt_nrm.shape: (20000000, 3, 9)


In [4]:
# MiNNLO (x1) datasets; both files are loaded in full (no event cut)

# normalised sample used for training
x1_nrm = []  # rebinding before the load presumably releases a previously loaded array - TODO confirm
x1_nrm = DCTR.load_dataset(
    f'{data_dir}/MiNNLO/converted_with_13TeV_NLO/normed_converted_lhe.npz',
    i=3,
)
print(f'MiNNLO all particles x1_nrm.shape: {x1_nrm.shape}')

# sample used for plotting
x1_plt = []
x1_plt = DCTR.load_dataset(
    f'{data_dir}/MiNNLO/converted_with_13TeV_NLO/converted_lhe.npz',
    i=3,
)
print(f'MiNNLO all particles x1_plt.shape: {x1_plt.shape}')


MiNNLO all particles x1_nrm.shape: (9553938, 3, 9)
MiNNLO all particles x1_plt.shape: (9553938, 3, 9)


In [5]:
# Event-generator weights: column 7 of entry 0 of every event, copied out of the
# normalised arrays and divided in place by their own mean so that the mean becomes 1.

# POWHEG hvq training sample
x0_wgt = x0_nrm[:, 0, 7].copy()
np.divide(x0_wgt, np.mean(x0_wgt), out=x0_wgt)

# POWHEG hvq plotting sample
x0_plt_wgt = x0_plt_nrm[:, 0, 7].copy()
np.divide(x0_plt_wgt, np.mean(x0_plt_wgt), out=x0_plt_wgt)

# MiNNLO sample
x1_wgt = x1_nrm[:, 0, 7].copy()
np.divide(x1_wgt, np.mean(x1_wgt), out=x1_wgt)


In [6]:
# Delete eta (pseudorapidity, column 4) and energy (column 5) from the normalised arrays.
# Of the nine loaded columns [pt, y, phi, mass, eta, E, PID, w, theta] this leaves
# [pt, y, phi, mass, PID, w, theta].
# The width check keeps the cell safe to re-run: without it a second execution would
# silently delete PID and w from the already reduced arrays.
def _drop_eta_and_energy(events):
    """Return `events` without columns 4 and 5 of the last axis.

    Arrays whose last axis does not have the original nine columns are returned
    unchanged (they were already reduced by an earlier run of this cell).
    """
    if events.shape[-1] != 9:
        return events
    # one call removes both columns, so each array is copied only once
    return np.delete(events, [4, 5], axis=-1)


# rebind one array at a time so each old array can be freed before the next copy is made
x0_nrm = _drop_eta_and_energy(x0_nrm)
x0_plt_nrm = _drop_eta_and_energy(x0_plt_nrm)
x1_nrm = _drop_eta_and_energy(x1_nrm)


In [7]:
# build the training / validation arrays from the two normalised samples (val=0.25)
prepared = DCTR.prep_arrays(x0_nrm, x1_nrm, val=0.25)
x_train, x_val, y_train, y_val, wgt_train, wgt_val = prepared

# tuples in the order the training loop is called with below
train_data = (
    x_train, y_train,
    x_val, y_val,
    wgt_train, wgt_val,
)
plt_data = (
    x0_plt, x0_plt_nrm,
    x1_plt, x1_wgt,
)


2024-03-30 10:54:07.728984: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-30 10:54:08.341299: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


# training loop

In [8]:
# reset the Keras backend session, then run the Python garbage collector on the host
K.clear_session()
n_unreachable = gc.collect()  # gc.collect() returns the number of unreachable objects found
print(n_unreachable)


44


In [9]:
# directory in which the models are saved during training
train_dir = "./train_20240330"


In [10]:
# start training loop
''' train_loop() necessary arguments
train_data, plt_data

default arguments:
model=None, lowest_chi2 = 1e6, train_dir = '/tf/home/gdrive/_STUDIUM_/DCTR_Paper/train',
batch_sizes=[4*8192, 8*8192, 16*8192, 32*8192], repeat=5, super_epochs=35, super_patience = 5, epochs = 8, starting_super_epoch = 1, 
input_dim=5, Phi_sizes = (100,100,128), F_sizes = (128,100,100), loss = 'mse', dropout=0.0, l2_reg=0.0, 
Phi_acts=('linear', 'gelu', 'gelu'), F_acts=('gelu', 'gelu', 'linear'), output_act='sigmoid', learning_rate=0.001

returns: best_model_list, lowest_chi2_list, lowest_loss_list
'''
# first stage: train from scratch (no starting model) with large batch sizes
best_model_list, lowest_chi2_list, lowest_loss_list = DCTR.train_loop(
    train_data, plt_data,
    batch_sizes=[24*8192, 32*8192], repeat=7, super_epochs=5,
    train_dir=train_dir, epochs=10, learning_rate=0.001,
)

# The last list entry is used as the starting point of the next stage.
# If nothing was recorded, fall back to the documented defaults of train_loop
# (model=None, lowest_chi2=1e6) instead of failing with IndexError.
best_model = best_model_list[-1] if best_model_list else None
lowest_chi2 = lowest_chi2_list[-1] if lowest_chi2_list else 1e6


starting super_epoch 1

starting training with batch_size: 196608 and 10 epochs
starting with weights from model: None
starting run 0 of super_epoch 1 with batch_size 196608


2024-03-30 10:54:10.716787: I tensorflow/stream_executor/cuda/cuda_driver.cc:739] failed to allocate 7.50G (8053063680 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-03-30 10:54:10.820257: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


reset neural network weights


2024-03-30 10:54:21.488051: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-196608_r-0.tf/assets
INFO:tenso

2024-03-30 10:55:47.843463: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-196608_r-1.tf/assets
INFO:tenso

2024-03-30 10:57:23.421258: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-196608_r-2.tf/assets
INFO:tenso

2024-03-30 10:58:59.131891: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-196608_r-3.tf/assets
clearing k

2024-03-30 11:00:34.087186: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-196608_r-4.tf/assets
INFO:tenso

2024-03-30 11:02:10.091806: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-196608_r-5.tf/assets
INFO:tenso

2024-03-30 11:03:45.934950: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-196608_r-6.tf/assets
INFO:tenso

2024-03-30 11:14:00.453314: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_0/s-1_b-262144_r-0.tf/assets
INFO:tenso

2024-03-30 11:15:36.046692: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_1/s-1_b-262144_r-1.tf/assets
INFO:tenso

2024-03-30 11:17:11.664934: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_2/s-1_b-262144_r-2.tf/assets
INFO:tenso

2024-03-30 11:18:47.483200: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_3/s-1_b-262144_r-3.tf/assets
INFO:tenso

2024-03-30 11:20:23.310036: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_4/s-1_b-262144_r-4.tf/assets
INFO:tenso

2024-03-30 11:21:58.867245: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_5/s-1_b-262144_r-5.tf/assets
INFO:tenso

2024-03-30 11:23:33.989966: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7680 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:09:00.0, compute capability: 7.5


INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_1/run_6/s-1_b-262144_r-6.tf/assets
INFO:tenso

In [11]:
K.clear_session()
print(gc.collect()) # cpu garbage collection

# continue training loop with best model as starting point and smaller batch_sizes and less repeats
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data, plt_data,
    model=best_model, lowest_chi2=lowest_chi2,
    batch_sizes=[16*8192, 24*8192], repeat=4, super_epochs=4,
    starting_super_epoch=6, train_dir=train_dir, epochs=10, learning_rate=0.001,
)

# Merge the new results element-wise. extend() (not append()) keeps the lists flat, so that
# [-1] is a single model / chi2 value rather than a nested list, and an empty result
# (no improvement in this stage) simply leaves the previous best in place.
best_model_list.extend(model_list)
lowest_chi2_list.extend(chi2_list)
lowest_loss_list.extend(loss_list)

# fall back to the documented train_loop defaults when nothing has been recorded at all
best_model = best_model_list[-1] if best_model_list else None
lowest_chi2 = lowest_chi2_list[-1] if lowest_chi2_list else 1e6


108498
starting super_epoch 6

starting training with batch_size: 131072 and 10 epochs
starting with weights from model: ./train_20240330/super_epoch_4/run_0/s-4_b-262144_r-0.tf
starting run 0 of super_epoch 6 with batch_size 131072
loaded neural network model: ./train_20240330/super_epoch_4/run_0/s-4_b-262144_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 7.775999838486313e-05.
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_6/run_0/s-6_b-131072_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 4.6655999904032795e-05.
INFO:tensorflow:Assets written 

In [12]:
K.clear_session()
print(gc.collect()) # cpu garbage collection

# continue training loop with best model as starting point and smaller batch_sizes, less repeats and lower learning_rate
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data, plt_data,
    model=best_model, lowest_chi2=lowest_chi2,
    batch_sizes=[8*8192, 16*8192], repeat=3, super_epochs=4,
    starting_super_epoch=10, train_dir=train_dir, epochs=10, learning_rate=0.0005,
)

# A stage without improvement returns empty lists. Replacing the running lists with them made
# [-1] raise IndexError, and append() would nest the returned list as one element.
# extend() keeps the lists flat and leaves the previous best in place when nothing was returned.
best_model_list.extend(model_list)
lowest_chi2_list.extend(chi2_list)
lowest_loss_list.extend(loss_list)

# fall back to the documented train_loop defaults when nothing has been recorded at all
best_model = best_model_list[-1] if best_model_list else None
lowest_chi2 = lowest_chi2_list[-1] if lowest_chi2_list else 1e6


139204
starting super_epoch 10

starting training with batch_size: 65536 and 10 epochs
starting with weights from model: ./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf
starting run 0 of super_epoch 10 with batch_size 65536
loaded neural network model: ./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 2.799360081553459e-05.
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_10/run_0/s-10_b-65536_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.6796160707599483e-05.
INFO:tensorflow:Assets wri

IndexError: list index out of range

In [13]:
# manual best model as starting point, in case the previous cell errored out
# best_model='./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf'

K.clear_session()  # a single call is enough; the original cell called it twice
print(gc.collect()) # cpu garbage collection

# continue training loop with best model as starting point: batch_sizes from 8192 up to 16*8192,
# 4 repeats, learning_rate=0.0006 and more epochs (25) per super_epoch
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data, plt_data,
    model=best_model, lowest_chi2=lowest_chi2,
    batch_sizes=[8192, 4*8192, 8*8192, 16*8192], repeat=4, super_epochs=5,
    starting_super_epoch=14, train_dir=train_dir, epochs=25, learning_rate=0.0006,
)

# A stage without improvement returns empty lists. Replacing the running lists with them made
# [-1] raise IndexError, and append() would nest the returned list as one element.
# extend() keeps the lists flat and leaves the previous best in place when nothing was returned.
best_model_list.extend(model_list)
lowest_chi2_list.extend(chi2_list)
lowest_loss_list.extend(loss_list)

# fall back to the documented train_loop defaults when nothing has been recorded at all
best_model = best_model_list[-1] if best_model_list else None
lowest_chi2 = lowest_chi2_list[-1] if lowest_chi2_list else 1e6

112398
starting super_epoch 14

starting training with batch_size: 8192 and 25 epochs
starting with weights from model: ./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf
starting run 0 of super_epoch 14 with batch_size 8192
loaded neural network model: ./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_14/run_0/s-14_b-8192_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_14/run_0/s-14_b-8192_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_14/run_0/s-14_b-8192_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_14/run_0/s-14_b-8192_r-0.tf/assets

Epoch 11: ReduceLROnPlateau reducing learning rate to 2.7715059695765375e-05.
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_14/run_0/s-14_b-8192_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_14/run_0/s-14_b-8192_r-0.tf/assets
INFO:tenso

IndexError: list index out of range

In [14]:
# manual best model as starting point, in case the previous cell errored out
# best_model='./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf'

K.clear_session()
print(gc.collect()) # cpu garbage collection

# continue training loop with best model as starting point and smaller batch_sizes and lower learning_rate and 15% dropout
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data, plt_data,
    model=best_model, lowest_chi2=lowest_chi2,
    batch_sizes=[4*8192, 8*8192], repeat=7, super_epochs=3,
    starting_super_epoch=19, train_dir=train_dir, epochs=15, learning_rate=0.0002, dropout=0.15,
)

# Merge the new results element-wise. extend() (not append()) keeps the lists flat, so that
# [-1] is a single model / chi2 value rather than a nested list, and an empty result
# (no improvement in this stage) simply leaves the previous best in place.
best_model_list.extend(model_list)
lowest_chi2_list.extend(chi2_list)
lowest_loss_list.extend(loss_list)

# fall back to the documented train_loop defaults when nothing has been recorded at all
best_model = best_model_list[-1] if best_model_list else None
lowest_chi2 = lowest_chi2_list[-1] if lowest_chi2_list else 1e6


139941
starting super_epoch 19

starting training with batch_size: 32768 and 15 epochs
starting with weights from model: ./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf
starting run 0 of super_epoch 19 with batch_size 32768
loaded neural network model: ./train_20240330/super_epoch_9/run_0/s-9_b-196608_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_19/run_0/s-19_b-32768_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_19/run_0/s-19_b-32768_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_19/run_0/s-19_b-32768_r-0.tf/assets

Epoch 7: ReduceLROnPlateau reducing learning rate to 2.799360081553459e-05.
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_19/run_0/s-19_b-32768_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_19/run_0/s-19_b-32768_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_19/run_0/s-19_b-32768_r-0.tf/assets

Epo

In [15]:

K.clear_session()
print(gc.collect()) # cpu garbage collection

# finish training loop with best model as starting point: batch_sizes from 8192 up to 16*8192,
# learning_rate=0.0002 and 10% dropout
model_list, chi2_list, loss_list = DCTR.train_loop(
    train_data, plt_data,
    model=best_model, lowest_chi2=lowest_chi2,
    batch_sizes=[8192, 4*8192, 8*8192, 16*8192], repeat=5, super_epochs=10,
    starting_super_epoch=22, train_dir=train_dir, epochs=12, learning_rate=0.0002, dropout=0.1,
)

# A stage without improvement returns empty lists. Replacing the running lists with them made
# [-1] raise IndexError, and append() would nest the returned list as one element.
# extend() keeps the lists flat and leaves the previous best in place when nothing was returned.
best_model_list.extend(model_list)
lowest_chi2_list.extend(chi2_list)
lowest_loss_list.extend(loss_list)

# fall back to the documented train_loop defaults when nothing has been recorded at all
best_model = best_model_list[-1] if best_model_list else None
lowest_chi2 = lowest_chi2_list[-1] if lowest_chi2_list else 1e6


81653
starting super_epoch 22

starting training with batch_size: 8192 and 12 epochs
starting with weights from model: ./train_20240330/super_epoch_21/run_0/s-21_b-32768_r-0.tf
starting run 0 of super_epoch 22 with batch_size 8192
loaded neural network model: ./train_20240330/super_epoch_21/run_0/s-21_b-32768_r-0.tf
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_22/run_0/s-22_b-8192_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_22/run_0/s-22_b-8192_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_22/run_0/s-22_b-8192_r-0.tf/assets
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_22/run_0/s-22_b-8192_r-0.tf/assets

Epoch 5: ReduceLROnPlateau reducing learning rate to 9.586202577338553e-06.
INFO:tensorflow:Assets written to: ./train_20240330/super_epoch_22/run_0/s-22_b-8192_r-0.tf/assets

Epoch 9: ReduceLROnPlateau reducing learning rate to 5.751721437263768e-06.
INFO:tensorflow:Assets written to

IndexError: list index out of range