# Pretraining `num_rows` vs transferred performance
Pretrain fastsim weights for 1M, 2M, 4M, 8M, ... rows

Then transfer and do fixed fullsim transfer training

In [1]:
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from tqdm import tqdm
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import initializers

import wandb
from wandb.keras import WandbCallback

# https://gitlab.cern.ch/atlas/ATLAS-top-tagging-open-data/-/blob/master/preprocessing.py
import preprocessing

# Fix the NumPy and TensorFlow global seeds.
np.random.seed(8)
tf.random.set_seed(8)

# Turn on memory growth for every visible GPU. Iterating (instead of indexing
# element 0) keeps this cell from raising IndexError when no GPU is visible,
# e.g. when CUDA_VISIBLE_DEVICES points at a device that does not exist.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2023-05-18 02:16:18.419644: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-05-18 02:16:18.444614: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:41:00.0 name: NVIDIA A40 computeCapability: 8.6
coreClock: 1.74GHz coreCount: 84 deviceMemorySize: 44.56GiB deviceMemoryBandwidth: 648.29GiB/s
2023-05-18 02:16:18.445006: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2023-05-18 02:16:18.447146: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2023-05-18 02:16:18.449358: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2023-05-18 02:16:18.449766: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2023-0

In [2]:
# Shards train_1.h5 ... train_14.h5 are excluded from this study; whatever else
# is in the directory is used for pretraining.
# (use range(7, 15) instead to keep train_0.h5 ... train_6.h5)
excluded_file_names = {f"train_{i}.h5" for i in range(1, 15)}

# Filtering (rather than list.remove) does not raise when one of the excluded
# shards is absent, and sorting makes the file order -- and therefore the row
# order of the arrays built from it -- independent of the arbitrary order
# returned by os.listdir.
train_file_names = sorted(
    name
    for name in os.listdir("/global/ml4hep/spss/mfong/transfer_learning/delphes_train")
    if name not in excluded_file_names
)
train_file_names
# f = h5py.File('/clusterfs/ml4hep/mfong/transfer_learning/delphes_train.h5', 'r')
# f2 = h5py.File('/clusterfs/ml4hep/mfong/transfer_learning/delphes_test.h5', 'r')

['train_0.h5']

In [3]:
# Count the rows of every training file and the total width of the raw
# constituent features, without loading the data itself.
feature_keys = ['fjet_clus_eta', 'fjet_clus_phi', 'fjet_clus_pt', 'fjet_clus_E']
num_samples_per_file = []
num_features = 0
for train_file_name in train_file_names:
    # Open each file in a `with` block so the handle is closed again.
    with h5py.File("/global/ml4hep/spss/mfong/transfer_learning/delphes_train/" + train_file_name, 'r') as h5_file:
        num_samples_per_file.append(h5_file["fjet_clus_eta"].shape[0])
        # Recomputed per file; the value kept is the one from the last file.
        num_features = sum(h5_file[k].shape[1] for k in feature_keys)
num_samples = sum(num_samples_per_file)
# x = np.empty((num_samples, num_features))

In [4]:
# Build the feature matrix `x` by preprocessing every training file.
# With a single file, `x` is simply that file's preprocessed array (no extra
# copy). With several files, each file's rows are written into its own slice of
# one preallocated array so that `x` lines up row-for-row with the labels.
x = None
current_row = 0
for train_file_name, current_num_samples in tqdm(zip(train_file_names, num_samples_per_file)):
    # NOTE: `f` is intentionally left open here; the following cell inspects it.
    f = h5py.File("/global/ml4hep/spss/mfong/transfer_learning/delphes_train/" + train_file_name, 'r')

    # preprocess
    data_dict = {k:v for k, v in f.items() if k in feature_keys}
    x_file = preprocessing.constituent(data_dict, 200)
    # Flatten the two trailing axes into one feature axis per row.
    x_file = x_file.reshape(x_file.shape[0], x_file.shape[1]*x_file.shape[2])
    # x_file = preprocessing.high_level(x_file)

    if len(train_file_names) == 1:
        x = x_file
    else:
        if x is None:
            # The preprocessed width is only known after the first file.
            x = np.empty((num_samples, x_file.shape[1]), dtype=x_file.dtype)
        x[current_row:current_row+current_num_samples] = x_file
    current_row += current_num_samples
    # Drop the per-file name so it does not keep an extra array alive.
    del x_file

  log_pt = np.log(pt)
  log_energy = np.log(energy)
  lognorm_pt = np.log(pt / sum_pt[:,np.newaxis])
  lognorm_energy = np.log(energy / sum_energy[:,np.newaxis])
1it [04:31, 271.89s/it]


In [5]:
# Constituent-level datasets used as model inputs.
feature_keys = ['fjet_clus_eta', 'fjet_clus_phi', 'fjet_clus_pt', 'fjet_clus_E']

# List every dataset stored in the most recently opened file with its shape.
for dataset_name, dataset in f.items():
    print(dataset_name, dataset.shape)

fjet_clus_E (5000000, 200)
fjet_clus_eta (5000000, 200)
fjet_clus_phi (5000000, 200)
fjet_clus_pt (5000000, 200)
fjet_eta (5000000,)
fjet_m (5000000,)
fjet_phi (5000000,)
fjet_pt (5000000,)
labels (5000000,)
training_weights (5000000,)


In [6]:
# Gather the labels of every training file into one preallocated array, using
# the same file order and per-file row counts as the other loading cells.
y = np.zeros((num_samples))
current_row = 0
for train_file_name, current_num_samples in tqdm(zip(train_file_names, num_samples_per_file)):
    # `with` closes each file once its labels have been copied into `y`.
    with h5py.File("/global/ml4hep/spss/mfong/transfer_learning/delphes_train/" + train_file_name, 'r') as labels_file:
        y[current_row:current_row+current_num_samples] = labels_file["labels"][:]
    current_row += current_num_samples

1it [00:00,  1.16it/s]


In [7]:
# Sanity check: display the shape of the flattened, preprocessed feature matrix.
x.shape

(5000000, 1400)

In [8]:
# Sanity check: display the shape of the label array (one entry per row).
y.shape

(5000000,)

In [9]:
# Positional split: the last NUM_TEST_ROWS rows are held out as test data and
# everything before them is training data.
NUM_TEST_ROWS = 2000000       # save 2M rows for test data

num_samples = len(y)
if len(x) != num_samples:
    # Features and labels must be aligned row-for-row before slicing.
    raise ValueError(f"x has {len(x)} rows but y has {num_samples} labels")
if num_samples <= NUM_TEST_ROWS:
    # A zero or negative split index would silently produce a wrong split.
    raise ValueError(
        f"need more than {NUM_TEST_ROWS} rows to hold out a test set, got {num_samples}"
    )

# num_train_samples = int(0.8 * num_samples)
num_train_samples = num_samples - NUM_TEST_ROWS
x_train = x[:num_train_samples]
y_train = y[:num_train_samples]

x_test = x[num_train_samples:]
y_test = y[num_train_samples:]

In [10]:
# scaler = StandardScaler()
# scaler.fit(x_train[:2000000])   # only use first 2M otherwise takes too long

# x_train = scaler.transform(x_train, copy=False)
# x_test = scaler.transform(x_test, copy=False)

In [11]:
# Set the WANDB_NOTEBOOK_NAME environment variable before any wandb run starts.
# NOTE(review): presumably wandb uses this to record the notebook's file name
# with each run -- confirm against the wandb documentation.
os.environ["WANDB_NOTEBOOK_NAME"] = "pretrain_MLP.ipynb"

In [None]:
# Pretrain one small MLP per entry of NUM_PRETRAIN_ROWS_LIST on the first
# `num_pretrain_rows` training rows, log each run to wandb, and save the
# accuracy/loss curves and the trained weights.
NUM_PRETRAIN_ROWS_LIST = [1000000, 2000000, 4000000]#, 8000000, 16000000, 32000000]
# config = wandb.config
# config.batch_size = 256
config = {
    "batch_size": 256,
    "epochs": 800,
}

# Create the output folders before training so a long run cannot fail at the
# very end because a directory is missing.
os.makedirs("output", exist_ok=True)
os.makedirs("models", exist_ok=True)

for num_pretrain_rows in NUM_PRETRAIN_ROWS_LIST:
    # Slicing past the end of x_train silently returns fewer rows, which would
    # produce a run whose name claims more data than it was trained on.
    if num_pretrain_rows > len(x_train):
        print(
            f"Skipping num_pretrain_rows={num_pretrain_rows}: "
            f"only {len(x_train)} training rows are available"
        )
        continue

    # config.num_pretrain_rows = num_pretrain_rows
    config["num_pretrain_rows"] = num_pretrain_rows
    rows_in_millions = int(num_pretrain_rows / 1000000)
    run_name = f"preprocess_fastsim_MLP_{rows_in_millions}M_rows"
    plot_title = f"Fastsim MLP ({rows_in_millions}M Rows)"
    run = wandb.init(project="preprocess_pretrain_MLP", name=run_name, config=config, reinit=True)

    # A fresh model per run: two hidden ReLU layers and a sigmoid output.
    model = Sequential()
    model.add(Dense(64, input_shape=(x_train.shape[1],), activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(
        x_train[:config["num_pretrain_rows"]],
        y_train[:config["num_pretrain_rows"]],
        epochs=config["epochs"],
        batch_size=config["batch_size"],
        shuffle=True,
        validation_data=(x_test, y_test),
        callbacks=[wandb.keras.WandbCallback()]
    )

    # Accuracy curves (training vs. validation).
    plt.figure()
    plt.plot(history.history["accuracy"], label="acc")
    plt.plot(history.history["val_accuracy"], label="val_acc")
    plt.title(plot_title)
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend()
    plt.savefig(f"output/{run_name}_acc.png")

    # Loss curves (training vs. validation).
    plt.figure()
    plt.plot(history.history["loss"], label="loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    plt.title(plot_title)
    plt.ylabel("Loss")
    plt.xlabel("Epoch")
    plt.legend()
    plt.savefig(f"output/{run_name}_loss.png")

    model.save_weights(f"models/{run_name}.h5")
    wandb.finish()

Epoch 1/800


2023-05-18 02:32:46.253687: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10




2023-05-18 02:34:48.746270: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 11200000000 exceeds 10% of free system memory.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


2023-05-18 02:35:06.467060: W tensorflow/python/util/util.cc:329] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /global/home/users/mfong/git/transfer-learning/wandb/run-20230518_023235-n2sc88on/files/model-best/assets


[34m[1mwandb[0m: Adding directory to artifact (/global/home/users/mfong/git/transfer-learning/wandb/run-20230518_023235-n2sc88on/files/model-best)... Done. 0.0s


Epoch 2/800

2023-05-18 02:35:16.967475: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 11200000000 exceeds 10% of free system memory.


Epoch 3/800