In [1]:
# === PARETO FRONT STUDIES NOTEBOOK #1 ===
# Author: Ana Sofia Calle Muñoz

# This notebook allows you to perform a Pareto analysis for a specific scheduler over a given number of trials and epochs.
# Losses used: NLL and a sum of standard deviations sigma regularizer.
# !!! Throughout the notebooks, look for comments including "==" at the beginning, as they need to be modified on your end.

# == Before running:
# Download the 4 notebooks and schedulers.py file & upload to your cluster.
# Make sure you have the latest OptimizedDataGenerator_v2 python file.
# Upload the model you want to work with. This nb is set to work with the Conv2D Max model & 16x16 sensor size.
# Modify the train & validation TFrecords folder paths. This nb works with dataset_3sr filtered with labels.
# Modify the "intermediate_dir" path, that is where your results will save.
# Modify the sample hyperparameter ranges as you like. They are in the run_trials function.

# == Run:
# Go to the last block and look for the run_trials call. Set a scheduler, experiment name, number of trials and epochs. That's it!

# Any questions you have, you can reach out on the FastML slack as Ana Sofia Calle, or callea@purdue.edu

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.optimizers import Adam
import keras
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import Sequence
from keras.layers import Conv2D, MaxPooling2D
from qkeras import *

from keras.utils import Sequence
from keras.callbacks import CSVLogger
from keras.callbacks import EarlyStopping, Callback, LambdaCallback

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import matplotlib.pyplot as plt

import copy

from tensorflow.keras.metrics import Mean

import os
import random
import csv

from OptimizedDataGenerator_v2 import OptimizedDataGenerator
from schedulers import *
import pickle
from models_16x16.models import *

# Truncated value of pi; not referenced by any of the cells shown in this notebook.
pi = 3.14159265359

# Module-level clip bounds. custom_loss declares its own minval/maxval defaults
# with the same values, so by default it does not read these globals.
maxval=1e9
minval=1e-9

2025-07-24 01:06:08.543890: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-24 01:06:08.543952: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-24 01:06:08.545535: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-24 01:06:08.553650: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# local files such as schedulers.py are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# == TFRecords paths (edit these for your own cluster)
# I recommend using contained datasets for better results.

# Directories handed to OptimizedDataGenerator. Note that validation reads from the ".../test" folder.
tfrecords_dir_train = "/home/callea/TFrecords_3src_filtered/train"
tfrecords_dir_validation = "/home/callea/TFrecords_3src_filtered/test"

In [None]:
# Custom loss: NLL and a sum of standard deviations sigma regularizer (you can modify the regularizer).

# Weight applied to the sigma regularizer inside custom_loss. It is a non-trainable
# tf.Variable (not a Python float) so its value can be changed with .assign()
# between epochs; it is also the object passed to the scheduler as reg_weight_var.
current_reg_weight = tf.Variable(0.0, trainable=False, dtype=tf.float32, name='reg_weight')

def custom_loss(y, p_base, minval=1e-9, maxval=1e9, scale = 512):
    """Summed NLL of a 4-D Gaussian plus a weighted sum-of-sigmas regularizer.

    The prediction is read column-wise: columns 0, 2, 4, 6 are the means,
    columns 1, 3, 5, 7 are the diagonal of a lower-triangular scale matrix and
    columns 8..13 are its six below-diagonal entries (row by row).

    Args:
        y: Targets evaluated under the predicted distribution
            (assumed shape (batch, 4) -- TODO confirm against the data generator).
        p_base: Model output with at least 14 columns per sample.
        minval: Offset added to the clamped diagonal and lower clip bound for
            the likelihood (and the epsilon inside the sqrt).
        maxval: Upper clip bound for the likelihood.
        scale: Not used by this function; kept so existing callers keep working.

    Returns:
        Scalar tensor: sum over the batch of ``NLL + reg_weight * sum(stds)``,
        where ``reg_weight`` is the current value of ``current_reg_weight``.

    Side effects:
        Calls ``track_loss_values`` with the per-sample NLL and regularizer.
    """
    mu = p_base[:, 0:8:2]

    # Diagonal of the scale matrix: clamp at zero, then shift by minval so it stays strictly positive.
    Mdia = minval + tf.math.maximum(p_base[:, 1:8:2], 0.0)
    Mcov = p_base[:, 8:]

    # Placeholder for the entries above the diagonal.
    zeros = tf.zeros_like(Mdia[:, 0])

    # Assemble the (batch, 4, 4) lower-triangular matrix row by row. Stacking on
    # axis=1 gives the same tensor as stacking on axis 0 and transposing, and
    # matches how EpochValidationSaver builds it.
    row1 = tf.stack([Mdia[:, 0], zeros, zeros, zeros], axis=1)
    row2 = tf.stack([Mcov[:, 0], Mdia[:, 1], zeros, zeros], axis=1)
    row3 = tf.stack([Mcov[:, 1], Mcov[:, 2], Mdia[:, 2], zeros], axis=1)
    row4 = tf.stack([Mcov[:, 3], Mcov[:, 4], Mcov[:, 5], Mdia[:, 3]], axis=1)
    scale_tril = tf.stack([row1, row2, row3, row4], axis=1)

    dist = tfp.distributions.MultivariateNormalTriL(loc=mu, scale_tril=scale_tril)

    # Clip the likelihood before the log so the NLL is bounded by -log(minval) and -log(maxval).
    likelihood = tf.clip_by_value(dist.prob(y), minval, maxval)
    NLL = -1 * tf.math.log(likelihood)

    # Per-dimension standard deviations: sqrt of the diagonal of L @ L^T.
    cov_matrix = tf.matmul(scale_tril, tf.transpose(scale_tril, [0, 2, 1]))
    variances = tf.linalg.diag_part(cov_matrix)
    stds = tf.sqrt(variances + minval)

    sigma_regularizer_1 = tf.reduce_sum(stds, axis=1)

    track_loss_values(NLL, sigma_regularizer_1)

    # current_reg_weight is read on every call, so scheduler updates take effect immediately.
    total_loss = NLL + (sigma_regularizer_1 * current_reg_weight)

    return tf.keras.backend.sum(total_loss)

In [None]:
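# Callback that recomputes the per-sample NLL and regularizer means on the validation data each epoch, rescales val_loss accordingly, and saves the (NLL, reg) point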
class EpochValidationSaver(tf.keras.callbacks.Callback):
    """Recompute validation NLL / regularizer means each epoch and record them.

    At the end of every epoch the callback runs the model over ``val_data``,
    feeds the per-sample NLL and sum-of-sigmas regularizer to the shared loss
    trackers, overwrites ``logs['val_loss']`` with the per-sample value
    ``nll_mean + reg_weight * reg_mean`` and appends ``(nll_mean, reg_mean)``
    to ``intermediate_points``.

    Args:
        val_data: Iterable yielding ``(x, y)`` validation batches.
        reg_weight: Object exposing ``.numpy()`` (the regularizer-weight
            variable) read when the rescaled loss is computed.

    Attributes (documented here, set in ``__init__``):
        intermediate_points: list of ``(nll_mean, reg_mean)`` tuples, one per epoch.
        keras_style_val_loss: the ``val_loss`` Keras reported for the latest
            epoch before it was overwritten; ``None`` until an epoch finishes.
    """

    def __init__(self, val_data, reg_weight):
        super().__init__()
        self.val_data = val_data
        self.reg_weight = reg_weight
        self.intermediate_points = []
        # Defined up front so callers can read it even if training ends before any epoch completes.
        self.keras_style_val_loss = None

    def on_epoch_begin(self, epoch, logs=None):
        """Clear the shared loss trackers at the start of each epoch."""
        reset_loss_trackers()

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on ``val_data``, rewrite ``val_loss`` per sample and store the point."""
        logs = {} if logs is None else logs

        # custom_loss also calls track_loss_values, so by now the trackers hold
        # this epoch's training (and Keras validation) batches. Clear them so
        # the means below describe the validation pass of this callback only.
        reset_loss_trackers()

        for x_val, y_val in self.val_data:
            y_pred = self.model(x_val, training=False)

            mu = y_pred[:, 0:8:2]
            Mdia = 1e-9 + tf.math.maximum(y_pred[:, 1:8:2], 0.0)
            Mcov = y_pred[:, 8:]

            # Build the (batch, 4, 4) lower-triangular scale matrix row by row.
            zeros = tf.zeros_like(Mdia[:, 0])
            row1 = tf.stack([Mdia[:, 0], zeros, zeros, zeros], axis=1)
            row2 = tf.stack([Mcov[:, 0], Mdia[:, 1], zeros, zeros], axis=1)
            row3 = tf.stack([Mcov[:, 1], Mcov[:, 2], Mdia[:, 2], zeros], axis=1)
            row4 = tf.stack([Mcov[:, 3], Mcov[:, 4], Mcov[:, 5], Mdia[:, 3]], axis=1)
            scale_tril = tf.stack([row1, row2, row3, row4], axis=1)

            dist = tfp.distributions.MultivariateNormalTriL(loc=mu, scale_tril=scale_tril)
            likelihood = tf.clip_by_value(dist.prob(y_val), 1e-9, 1e9)

            NLL = -tf.math.log(likelihood)
            cov_matrix = tf.matmul(scale_tril, tf.transpose(scale_tril, [0, 2, 1]))
            stds = tf.sqrt(tf.linalg.diag_part(cov_matrix) + 1e-9)
            sigma_regularizer_1 = tf.reduce_sum(stds, axis=1)

            track_loss_values(NLL, sigma_regularizer_1)

        # Per-sample means accumulated batch by batch by the trackers.
        metrics = get_loss_metrics()
        nll_mean = metrics['nll_mean']
        reg_mean = metrics['reg_mean']
        reg_weight_value = float(self.reg_weight.numpy())

        # Keep Keras' own (batch-summed) value before replacing it.
        self.keras_style_val_loss = logs.get("val_loss")

        # Per-sample loss. Multiplying both means by the sample count and then
        # dividing by it cancels out, so no second pass over val_data is needed
        # and an empty validation set cannot cause a division by zero.
        logs['val_loss'] = nll_mean + reg_weight_value * reg_mean

        print(f"[Epoch {epoch+1}] NLL_mean={nll_mean:.6f}, REG_mean={reg_mean:.6f}")
        self.intermediate_points.append((nll_mean, reg_mean))

def get_epoch_callback(validation_generator, reg_weight):
    """Create a single EpochValidationSaver and return it twice.

    Both tuple positions refer to the same object, so the caller can unpack one
    name to pass to Keras as a callback and another to read the saved points.
    """
    epoch_saver = EpochValidationSaver(
        val_data=validation_generator,
        reg_weight=reg_weight,
    )
    return (epoch_saver, epoch_saver)

# Callback that early-stops trials whose validation loss is not improving
class EarlyStopNoImprovement(tf.keras.callbacks.Callback):
    """Stop a trial whose ``val_loss`` stays above a threshold without improving.

    Epochs whose ``val_loss`` is at or below ``threshold`` are ignored entirely
    (neither the best loss nor the patience counter changes). Above the
    threshold, an epoch counts as an improvement only if it beats the best loss
    seen so far by more than ``min_delta``; otherwise the wait counter grows.
    Once the counter reaches ``patience`` training is stopped.

    Args:
        patience: Number of non-improving epochs tolerated before stopping.
        min_delta: Minimum decrease of ``val_loss`` that counts as improvement.
        threshold: Early stopping is only considered while ``val_loss`` is above this value.
        min_val_loss_to_keep: If the best tracked loss is above this value when
            stopping, the trial is flagged via ``bad_trial_due_to_high_loss``.
    """

    def __init__(self, patience=2, min_delta=0.1, threshold=19.0, min_val_loss_to_keep=10.0):
        super().__init__()
        self.patience = patience
        self.min_delta = min_delta
        self.threshold = threshold
        self.min_val_loss_to_keep = min_val_loss_to_keep
        self.best_loss = float('inf')
        self.wait = 0
        self.early_stop_triggered = False
        self.bad_trial_due_to_high_loss = False

    def on_train_begin(self, logs=None):
        """Reset the running state so an instance reused for another fit starts clean."""
        self.best_loss = float('inf')
        self.wait = 0
        self.early_stop_triggered = False
        self.bad_trial_due_to_high_loss = False

    def on_epoch_end(self, epoch, logs=None):
        """Update the patience counter from ``logs['val_loss']`` and stop if exhausted."""
        # logs defaults to None in the Keras callback signature; treat that like "no val_loss".
        val_loss = (logs or {}).get("val_loss")
        if val_loss is None:
            return

        if val_loss <= self.threshold:
            print(f"val_loss={val_loss:.4f} is below the threshold ({self.threshold}), early stopping is not applied.")
            return

        if val_loss < self.best_loss - self.min_delta:
            print(f"Improvement detected: val_loss decreased from {self.best_loss:.4f} to {val_loss:.4f}")
            self.best_loss = val_loss
            self.wait = 0
        else:
            self.wait += 1
            print(f"No significant improvement for {self.wait} epochs (val_loss={val_loss:.4f})")

        if self.wait >= self.patience:
            print(f"Early stopping at epoch {epoch+1} due to stagnation (val_loss={val_loss:.4f})")
            self.early_stop_triggered = True
            # best_loss is only ever updated with values above `threshold`, so this
            # flags trials that stagnated without getting under min_val_loss_to_keep.
            if self.best_loss > self.min_val_loss_to_keep:
                self.bad_trial_due_to_high_loss = True
            self.model.stop_training = True

In [None]:
# Function to choose the scheduler you want to study
# You can add more in the schedulers.py file and add an extra elif here

def get_scheduler(scheduler_type, reg_weight_var, **kwargs):
    """Instantiate the scheduler class that matches ``scheduler_type``.

    Every extra keyword argument is forwarded to the scheduler constructor,
    followed by ``reg_weight_var``. Supported names are "cosine", "linear",
    "adaptive" and "sigmoid".

    Raises:
        ValueError: if ``scheduler_type`` is none of the supported names.
    """
    # Same keyword order as before: caller kwargs first, reg_weight_var last.
    ctor_args = {**kwargs, "reg_weight_var": reg_weight_var}

    # Guard clauses: each class name is only looked up when its branch is taken.
    if scheduler_type == "cosine":
        return CosineScheduler(**ctor_args)
    if scheduler_type == "linear":
        return LinearScheduler(**ctor_args)
    if scheduler_type == "adaptive":
        return AdaptiveScheduler(**ctor_args)
    if scheduler_type == "sigmoid":
        return SigmoidScheduler(**ctor_args)

    raise ValueError(f"Unknown scheduler type: {scheduler_type}")

In [None]:
# == Folder path where your results will be saved!!
# Root output directory: objective_manual joins it with the experiment name to
# build each experiment's sub-folder. It is created here if it does not exist.
intermediate_dir = "/home/callea/smart-pixels-ml/intermediate_logs"
os.makedirs(intermediate_dir, exist_ok=True)

def objective_manual(trial_id, lambda_init, lambda_final, stop_threshold, experiment_name, scheduler_type, scheduler_kwargs, epochs):
    """Train one trial, persist its results and report its final (NLL, reg) point.

    A fresh model is built and trained with ``custom_loss`` while the chosen
    scheduler drives ``current_reg_weight``. Afterwards the per-epoch
    (nll_mean, reg_mean) points are pickled to
    ``<intermediate_dir>/<experiment_name>/trial_<id>_intermediate.pkl`` and one
    row describing the trial is appended to ``info.csv`` in the same folder.
    Trials flagged by the early-stopping callback, or whose final NLL exceeds
    15.0, are marked invalid and their pickle file is removed.

    Args:
        trial_id: Identifier used in the log messages, file name and CSV row.
        lambda_init: Accepted for interface compatibility; not read here
            (the starting weight is taken from ``scheduler_kwargs['start']``).
        lambda_final: Accepted for interface compatibility; not read here.
        stop_threshold: Accepted for interface compatibility; not read here.
        experiment_name: Sub-folder of ``intermediate_dir`` for this experiment.
        scheduler_type: Name understood by ``get_scheduler``.
        scheduler_kwargs: Dict that must contain ``'start'``; the keys relevant
            to the scheduler are forwarded to it, and the whole dict is written
            to the CSV row. ``'max_epochs'``, if present, overrides ``epochs``.
        epochs: Number of training epochs when ``'max_epochs'`` is absent.

    Returns:
        ``(final_nll, final_reg)`` for a valid trial, otherwise ``(None, None)``.
    """
    reset_loss_trackers()
    current_reg_weight.assign(scheduler_kwargs['start'])

    # Create and compile model
    input_shape = (16, 16, 2)
    model = CreateModel(input_shape, n_filters=5, pool_size=3)
    model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=1e-3), loss=custom_loss)

    # Data generators
    training_generator = OptimizedDataGenerator(
        load_from_tfrecords_dir=tfrecords_dir_train,
        shuffle=True, seed=13, quantize=True
    )
    validation_generator = OptimizedDataGenerator(
        load_from_tfrecords_dir=tfrecords_dir_validation,
        shuffle=False, seed=13, quantize=True
    )

    # Callbacks (val_callback and saver are the same object)
    val_callback, saver = get_epoch_callback(validation_generator, reg_weight=current_reg_weight)

    # Only forward the keys the chosen scheduler is given; the rest of
    # scheduler_kwargs (e.g. stop_threshold) is kept for the CSV row only.
    if scheduler_type == "adaptive":
        valid_scheduler_keys = {"start", "step", "patience"}
    else:
        valid_scheduler_keys = {"start", "end", "max_epochs", "step", "patience", "sharpness"}

    filtered_kwargs = {k: v for k, v in scheduler_kwargs.items() if k in valid_scheduler_keys}

    scheduler = get_scheduler(
        scheduler_type=scheduler_type,
        reg_weight_var=current_reg_weight,
        **filtered_kwargs
    )

    bad_loss = EarlyStopNoImprovement(patience=2, min_delta=0.1)

    # Training. Callback order matters: the scheduler runs first, then the
    # validation saver rewrites logs['val_loss'], then early stopping reads it.
    model.fit(
        training_generator,
        validation_data=validation_generator,
        epochs=scheduler_kwargs.get("max_epochs", epochs),
        callbacks=[scheduler, val_callback, bad_loss],
        verbose=1
    )

    # Intermediate points file path
    # Throughout the training of a trial, every nll x reg validation loss point is saved. These are called intermediate points.
    exp_dir = os.path.join(intermediate_dir, experiment_name)
    os.makedirs(exp_dir, exist_ok=True)
    inter_path = os.path.join(exp_dir, f"trial_{trial_id}_intermediate.pkl")

    # The saver only sets this attribute at the end of an epoch, so fall back to
    # None instead of raising AttributeError when no epoch completed.
    keras_val_loss = getattr(val_callback, "keras_style_val_loss", None)

    # Trial info structure
    trial_info = {
        "trial_id": trial_id,
        "scheduler": scheduler_type,
        **scheduler_kwargs,
        "valid_trial": False,
        "final_nll": None,
        "final_reg": None,
        "final_val_loss": None,
        "keras_val_loss": keras_val_loss
    }

    if getattr(bad_loss, "bad_trial_due_to_high_loss", False):
        print(f"🗑️ Trial {trial_id} discarded due to high loss")
        # Remove a leftover file from an earlier run with the same experiment/trial id.
        if os.path.exists(inter_path):
            os.remove(inter_path)
    else:
        with open(inter_path, "wb") as f:
            pickle.dump(saver.intermediate_points, f)

        # Tracker state left by the last epoch.
        metrics = get_loss_metrics()
        nll, reg = float(metrics["nll_mean"]), float(metrics["reg_mean"])
        final_lambda = float(current_reg_weight.numpy())

        MAX_NLL_TO_KEEP = 15.0
        if nll > MAX_NLL_TO_KEEP:
            print(f"🗑️ Trial {trial_id} discarded due to high NLL: {nll:.2f}")
            os.remove(inter_path)
            trial_info.update({"final_nll": nll, "final_reg": reg})
        else:
            val_loss_final = nll + final_lambda * reg
            print(f"✅ Trial {trial_id} succeeded")
            trial_info.update({
                "valid_trial": True,
                "final_nll": nll,
                "final_reg": reg,
                "final_val_loss": val_loss_final
            })

    # Save csv
    # Every trial gets a row; discarded trials keep valid_trial=False and have no intermediate-points file.
    csv_path = os.path.join(exp_dir, "info.csv")
    write_header = not os.path.exists(csv_path)

    with open(csv_path, mode="a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(trial_info.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(trial_info)

    return (trial_info["final_nll"], trial_info["final_reg"]) if trial_info["valid_trial"] else (None, None)

In [None]:
# Trial runner 
def run_trials(scheduler_type, experiment_name, num_trials, epochs, max_attempts=None):
    """Run randomly-configured trials until ``num_trials`` of them succeed.

    For every attempt a random scheduler configuration is sampled and passed to
    ``objective_manual``. An attempt counts as successful when the returned NLL
    is not ``None``. Trial ids increase by one per attempt, successful or not.

    Args:
        scheduler_type: Scheduler name forwarded to ``objective_manual``.
            "adaptive" additionally samples ``step`` and ``patience``;
            "sigmoid" additionally samples ``sharpness``.
        experiment_name: Name of the experiment (output sub-folder).
        num_trials: Number of successful trials to collect.
        epochs: Epoch budget per trial (also stored as ``max_epochs`` in the config).
        max_attempts: Optional upper bound on the total number of attempts.
            ``None`` (default) keeps the original behavior of retrying without
            limit, which never terminates if every trial is discarded.

    Returns:
        None.
    """
    successful_trials = 0
    trial_id = 0

    while successful_trials < num_trials:
        # Safety valve: give up instead of looping forever on a configuration that never succeeds.
        if max_attempts is not None and trial_id >= max_attempts:
            print(f"Stopping after {trial_id} attempts: {successful_trials}/{num_trials} trials succeeded.")
            break

        # ==Sample hyperparameters (randomly selected)
        # Modify the ranges you want so you can explore your scheduler's behavior.
        # If you add a new scheduler and it has new parameters, be sure to introduce them here

        start_val = random.uniform(0.01, 3.0)
        config = {
            "start": start_val,
            # The lower bound keeps "end" strictly above "start".
            "end": random.uniform(start_val + 0.01, 8.0),
            "stop_threshold": random.uniform(-40000.0, -20000.0),
            "max_epochs": epochs
        }
        if scheduler_type == "adaptive":
            config["step"] = random.uniform(0.01, 0.2)
            config["patience"] = random.randint(3, 7)
        elif scheduler_type == "sigmoid":
            config["sharpness"] = random.randint(2, 15)

        print(f"\n🔁 Training Trial {trial_id} using scheduler={scheduler_type}")

        lambda_init = config["start"]
        lambda_final = config.get("end", None)
        stop_threshold = config["stop_threshold"]

        nll, reg = objective_manual(
            trial_id=trial_id,
            lambda_init=lambda_init,
            lambda_final=lambda_final,
            stop_threshold=stop_threshold,
            experiment_name=experiment_name,
            scheduler_type=scheduler_type,
            scheduler_kwargs=config,
            epochs=epochs
        )

        if nll is not None:
            successful_trials += 1

        trial_id += 1

# == Modify this block for each study you make
# If you want very good pull/truth plots I recommend setting a lot of epochs ~ 500
# Trials keep being launched until `num_trials` of them are accepted, and each
# one trains for up to `epochs` epochs, so this cell can run for a long time.
run_trials(
    scheduler_type="cosine",
    experiment_name="general_test",
    num_trials=3, 
    epochs=500,
)