In [None]:
# Change directory to the root so that relative path loads work correctly
import os

try:
    # Remember the directory the notebook started in, so that re-running this
    # cell does not climb one directory further on every execution.
    _notebook_directory = globals().get("_notebook_directory") or os.getcwd()
    os.chdir(os.path.join(_notebook_directory, ".."))
    print(os.getcwd())
except OSError as error:
    # Best effort only: keep going from the current directory, but say so
    # instead of silently swallowing every possible exception.
    print(f"Could not change to the parent directory: {error}")

In [None]:
import glob
from datetime import datetime

import numpy as np
import torch
from torch import nn

from experiments.A_proof_of_constraint.experiment_definition import dictionary_product
from experiments.A_proof_of_constraint.main import (
    build_model_and_optimizer,
    run_experiment,
)
from experiments.A_proof_of_constraint.model import Dense, ParameterizedDense
from experiments.A_proof_of_constraint.reductions import Huber_Reduction, Lp_Reduction

In [None]:
# Saving utilities
# Name of this experiment; used as the file-name prefix and the checkpoint
# sub-directory.
experiment_name = "timing"
save_directory = f"results/checkpoints/{experiment_name}/"


def get_savefile(configuration):
    """Build the checkpoint file name for one experiment configuration.

    The name combines the module-level ``experiment_name`` with the
    configuration's "method", its "batch_size", and the first entry of its
    "model_size" sequence.

    :param configuration: mapping with "method", "model_size" and
        "batch_size" entries
    :returns: file name ending in ".pth"
    """
    method, model_size, batch_size = (
        configuration["method"],
        configuration["model_size"][0],
        configuration["batch_size"],
    )
    base_name = experiment_name
    return f"{base_name}_{method}_batchsize{batch_size}_modelsize{model_size}.pth"

In [None]:
# Settings shared by every run; each revision of the experiment overrides a
# few of these entries.
base_configuration = dict(
    training_parameterizations=dict(
        amplitudes=[1.0],
        frequencies=[1.0],
        phases=[0.0],
        # Placeholder: filled in per run by fix_configuration
        num_points=None,
        sampling="uniform",
    ),
    testing_parameterizations=dict(
        amplitudes=[1.0],
        frequencies=[1.0],
        phases=[0.0],
        num_points=1,
        sampling="uniform",
    ),
    batch_size=1000,
    architecture=Dense,
    model_size=[50, 50, 50, 50, 50],
    method="constrained",
    learning_rate=1e-3,
    ground_approximation=None,
    reduction=Huber_Reduction(6),
    model_act=nn.Tanh(),
)


def fix_configuration(configuration, iterations_per_epoch=100):
    """Size the training set so one epoch takes a fixed number of batches.

    Sets ``training_parameterizations["num_points"]`` to
    ``batch_size * iterations_per_epoch``.

    The nested "training_parameterizations" dict is replaced by an updated
    copy instead of being mutated in place. A configuration made with a
    shallow ``dict.copy()`` shares that nested dict with its source, so an
    in-place write would leak into the source and every sibling copy.

    :param configuration: mapping with "batch_size" and
        "training_parameterizations" entries; updated and returned
    :param iterations_per_epoch: number of batches wanted per epoch
    :returns: the same ``configuration`` object
    """
    training = dict(configuration["training_parameterizations"])
    training["num_points"] = configuration["batch_size"] * iterations_per_epoch
    configuration["training_parameterizations"] = training
    return configuration


# Number of epochs handed to run_experiment for every configuration.
num_epochs: int = 1
# Passed to run_experiment as save_interval
# (presumably the number of epochs between checkpoints; confirm in main.py).
save_interval: int = 1

In [None]:
# Definition of experiment
# Every entry overrides part of base_configuration for one timed run.
configuration_revisions = [
    # sub_experiment: batch_size (model_size held at [100])
    *dictionary_product(
        method=["unconstrained", "soft-constrained", "reduction", "constrained"],
        batch_size=[10, 50, 100, 500, 1000],
        model_size=[[100]],
    ),
    # sub_experiment: model_size (batch_size held at 100; [100] is left out
    # because the batch_size sub-experiment already covers that case)
    *dictionary_product(
        method=["unconstrained", "soft-constrained", "reduction", "constrained"],
        batch_size=[100],
        model_size=[[10], [50], [500], [1000]],
    ),
]

In [None]:
# Warning: the experiment will take a few hours to run on a standard processor
# Switch for the expensive part of the notebook: when False, the cells below
# only load and plot checkpoints that already exist on disk.
rerun_experiment = False

if rerun_experiment:
    # Start from a clean slate so old checkpoints are not mixed with new ones
    files = glob.glob(f"{save_directory}/*.pth")
    for stale_checkpoint in files:
        os.remove(stale_checkpoint)

In [None]:
# Run experiment


def dont_print(*args, **kwargs):
    """Silent stand-in for ``print``, used as the experiment's ``log`` callback.

    Accepts and ignores any positional arguments, and also any keyword
    arguments (such as ``end=`` or ``flush=``), so it can replace ``print``
    without raising ``TypeError``.

    :returns: None
    """
    return None


if rerun_experiment:
    final_checkpoints = []
    for revision in configuration_revisions:
        # Overlay this revision on a shallow copy of the shared settings
        configuration = dict(base_configuration)
        configuration.update(revision)
        configuration = fix_configuration(configuration)

        savefile = get_savefile(configuration)
        checkpoint_save_file_base = os.path.splitext(savefile)[0]
        print(f"Running proof of constraint with savefile {savefile}")
        # Name recorded for the last checkpoint of this run (assumes
        # run_experiment appends the zero-padded epoch number; confirm in main.py)
        final_checkpoints.append(f"{checkpoint_save_file_base}_{num_epochs:05d}.pth")
        final_result = run_experiment(
            num_epochs,
            # we only care about training time, so skip both evaluations
            evaluate_training=False,
            evaluate_testing=False,
            log=dont_print,
            save_interval=save_interval,
            save_directory=save_directory,
            save_file=checkpoint_save_file_base,
            **configuration,
        )
        print(f"Completed run with savefile {savefile}")
    print("")
    print(f"Checkpoints were saved to {final_checkpoints}")
    print(f"Corresponding revisions {configuration_revisions}")

In [None]:
# visualize experiment
from experiments.A_proof_of_constraint.visualize import plot_time_experiment


def get_model_name(checkpoint):
    """Return the label used for a checkpoint in the plots.

    The label is the string form of the "method" entry of the checkpoint's
    stored "configuration".
    """
    configuration = checkpoint["configuration"]
    method = configuration["method"]
    return f"{method}"

In [None]:
# Load files
files = sorted(glob.glob(f"{save_directory}/*.pth"))
# NOTE(review): torch.load unpickles the file contents, so only load
# checkpoints that this notebook (or another trusted source) produced.
checkpoints = [torch.load(checkpoint_file) for checkpoint_file in files]
model_names = [get_model_name(checkpoint) for checkpoint in checkpoints]
print(model_names)
# Make sure directory to save exists. The path is relative to the project
# root (the working directory set in the first cell), like save_directory,
# rather than a user-specific absolute path.
plot_directory = f"results/{experiment_name}/"
os.makedirs(plot_directory, exist_ok=True)

In [None]:
# Sort the checkpoints for each sub-experiment
def _group_experiment_checkpoints(checkpoints, model_names, keep, sort_key, mask_out):
    """Select, sort and group checkpoints by model name.

    :param checkpoints: sequence of checkpoint dicts, each with a
        "configuration" entry
    :param model_names: names parallel to ``checkpoints``
    :param keep: predicate on a configuration; only matching checkpoints
        take part
    :param sort_key: maps a configuration to the value to sort by (ascending)
    :param mask_out: names to leave out of the result (None means none)
    :returns: ``(grouped_checkpoints, grouped_labels)``, where entry ``i`` of
        the first list is the array of sorted checkpoints whose name is
        ``grouped_labels[i]``
    """
    excluded = () if mask_out is None else mask_out
    configurations = [checkpoint["configuration"] for checkpoint in checkpoints]
    # Explicit dtypes keep the indexing below valid for empty input, where
    # numpy would otherwise infer float arrays that cannot be used as masks.
    mask = np.array([keep(configuration) for configuration in configurations], dtype=bool)
    order = np.argsort(
        [
            sort_key(configuration)
            for configuration, kept in zip(configurations, mask)
            if kept
        ],
        kind="stable",
    )
    exp_checkpoints = np.array(checkpoints, dtype=object)[mask][order]
    exp_model_names = np.array(model_names, dtype=str)[mask][order]
    # np.unique returns the names sorted; reverse them so that
    # "unconstrained" comes first
    grouped_labels = [
        label for label in np.unique(exp_model_names)[::-1] if label not in excluded
    ]
    grouped_checkpoints = [
        exp_checkpoints[exp_model_names == label] for label in grouped_labels
    ]
    return grouped_checkpoints, grouped_labels


def get_batch_size_experiment(checkpoints, model_names, mask_out=(), model_size=100):
    """Group the batch-size sub-experiment by model name.

    Keeps the checkpoints whose first "model_size" entry equals
    ``model_size`` and sorts them by "batch_size".

    :param checkpoints: sequence of checkpoint dicts
    :param model_names: names parallel to ``checkpoints``
    :param mask_out: names to leave out of the result
    :param model_size: model size shared by this sub-experiment
    :returns: ``(grouped_checkpoints, grouped_labels)``
    """
    return _group_experiment_checkpoints(
        checkpoints,
        model_names,
        keep=lambda configuration: configuration["model_size"][0] == model_size,
        sort_key=lambda configuration: configuration["batch_size"],
        mask_out=mask_out,
    )


def get_model_size_experiment(checkpoints, model_names, mask_out=(), batch_size=100):
    """Group the model-size sub-experiment by model name.

    Keeps the checkpoints whose "batch_size" equals ``batch_size`` and sorts
    them by their first "model_size" entry.

    :param checkpoints: sequence of checkpoint dicts
    :param model_names: names parallel to ``checkpoints``
    :param mask_out: names to leave out of the result
    :param batch_size: batch size shared by this sub-experiment
    :returns: ``(grouped_checkpoints, grouped_labels)``
    """
    return _group_experiment_checkpoints(
        checkpoints,
        model_names,
        keep=lambda configuration: configuration["batch_size"] == batch_size,
        sort_key=lambda configuration: configuration["model_size"][0],
        mask_out=mask_out,
    )

In [None]:
def get_model_size(checkpoint):
    """Count the trainable parameters of a checkpoint's model.

    The model is rebuilt from the checkpoint's "configuration" and its saved
    "model_state_dict" is loaded. The sizes (``numel``) of all parameters
    with ``requires_grad`` set are then added up.
    """
    model, _optimizer = build_model_and_optimizer(checkpoint["configuration"])
    model.load_state_dict(checkpoint["model_state_dict"])
    trainable_parameters = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_parameters += parameter.numel()
    return trainable_parameters

In [None]:
# Actually plot
def _plot_time_scaling(grouped_checkpoints, labels, xvalue_of, savefile, title, xlabel):
    """Plot one timing figure with plot_time_experiment.

    :param grouped_checkpoints: one sequence of checkpoints per plotted curve
    :param labels: curve labels, parallel to ``grouped_checkpoints``
    :param xvalue_of: maps a checkpoint to its x value
    :param savefile: name handed to plot_time_experiment as ``savefile``
    :param title: figure title
    :param xlabel: label of the x axis
    :returns: whatever plot_time_experiment returns
    """
    return plot_time_experiment(
        [
            [checkpoint["monitors"][0] for checkpoint in group]
            for group in grouped_checkpoints
        ],
        labels,
        # The x values are read off the first group only, so every group is
        # assumed to cover the same settings in the same order.
        xvalues=[xvalue_of(checkpoint) for checkpoint in grouped_checkpoints[0]],
        savefile=savefile,
        title=title,
        ylabel="Seconds per iteration",
        xlabel=xlabel,
        confidence_interval=95.0,
    )


# (save-file suffix, methods left out of the figure): "all" shows every
# method, "small" repeats the figure without the "constrained" method.
_plot_variants = (("all", []), ("small", ["constrained"]))

for variant, masked_methods in _plot_variants:
    batch_size_experiment_checkpoints, batch_size_experiment_model_names = get_batch_size_experiment(
        checkpoints, model_names, mask_out=masked_methods
    )
    fig = _plot_time_scaling(
        batch_size_experiment_checkpoints,
        batch_size_experiment_model_names,
        xvalue_of=lambda checkpoint: checkpoint["configuration"]["batch_size"],
        savefile=f"batch_size_dependence_{variant}",
        title="Training time scaling with batch size",
        xlabel="Batch size",
    )

for variant, masked_methods in _plot_variants:
    model_size_experiment_checkpoints, model_size_experiment_model_names = get_model_size_experiment(
        checkpoints, model_names, mask_out=masked_methods
    )
    fig = _plot_time_scaling(
        model_size_experiment_checkpoints,
        model_size_experiment_model_names,
        xvalue_of=get_model_size,
        savefile=f"model_size_dependence_{variant}",
        title="Training time scaling with model size",
        xlabel="Number of trainable parameters",
    )
print("done!")