<a href="https://colab.research.google.com/github/federicovilla55/optML_mini_project/blob/main/Overparametrized_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [1]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Collecting pyarrow (from openml)
  Downloading pyarrow-19.0.1-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.22.0-cp37-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
Downloading minio-7.2.15-p

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
import time
from getpass import getpass
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42

###GIT

In [3]:
!git config --global user.email "caspar.amery@gmail.com"
!git config --global user.name "casparamery"

In [4]:
token = getpass("Enter your GitHub token: ")
repo_url = f"https://{token}@github.com/federicovilla55/optML_mini_project.git"

!git clone {repo_url}
%cd optML_mini_project
!git status

Enter your GitHub token: ··········
Cloning into 'optML_mini_project'...
remote: Enumerating objects: 289, done.[K
remote: Counting objects: 100% (289/289), done.[K
remote: Compressing objects: 100% (199/199), done.[K
remote: Total 289 (delta 100), reused 270 (delta 86), pack-reused 0 (from 0)[K
Receiving objects: 100% (289/289), 7.02 MiB | 25.15 MiB/s, done.
Resolving deltas: 100% (100/100), done.
/content/optML_mini_project
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [5]:
# To create a branch:

#!git checkout -b branch_name

# Switch to an existing branch:

!git checkout main

#List all branches:

#!git branch

#Push a new branch to GitHub (if needed):

#!git push -u origin branch_name

Already on 'main'
Your branch is up to date with 'origin/main'.


# Dataset Creation

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------
# OVERPARAMETERIZED DATASETS WITH FEATURE BOUNDS [-3, 3]
#
# Features X are sampled uniformly in [-3, 3], ensuring all inputs
# lie within the specified bounds before any polynomial transformations.
# ---------------------------------------------------------------------

def create_linear_dataset(n_samples=100, n_features=110, noise=0.0, random_state=None): #Currently noise set to 0
    """
    Overparameterized linear regression dataset:
      - X sampled U(-3, 3)
      - y = X @ w_true + noise
    """
    rng = np.random.RandomState(random_state)
    X = rng.uniform(low=-3, high=3, size=(n_samples, n_features))
    w_true = rng.randn(n_features)
    y = X.dot(w_true) + noise * rng.randn(n_samples)
    return X, y

def create_poly_varied_dataset(n_samples=100, n_features=110,
                               max_degree=4, noise=0.0, random_state=None): #Currently noise set to 0
    """
    Overparameterized nonlinear regression dataset:
      - X sampled U(-3, 3)
      - Each feature i raised to its own degree_i ∈ [1, max_degree]
      - y = sum_i w_true[i] * (X[:,i] ** degree_i) + noise
    """
    rng = np.random.RandomState(random_state)
    X = rng.uniform(low=-3, high=3, size=(n_samples, n_features))
    w_true = rng.randn(n_features)
    degrees = rng.randint(1, max_degree + 1, size=n_features)
    X_pow = np.zeros_like(X)
    for i, d in enumerate(degrees):
        X_pow[:, i] = X[:, i] ** d
    y = X_pow.dot(w_true) + noise * rng.randn(n_samples)
    return X, y, degrees

def split_data(X, y, val_size=0.2, test_size=0.2, random_state=None):
    """
    Splits data into train (60%), validation (20%), and test (20%) sets.
    """
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    val_rel = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_rel, random_state=random_state)
    return X_train, y_train, X_val, y_val, X_test, y_test

# ------------------------------
# Example: generate & split both
# ------------------------------
# Linear dataset
X_lin, y_lin = create_linear_dataset(n_samples=100, n_features=110, noise=0.0,random_state=42)
lin_splits = split_data(X_lin, y_lin, val_size=0.01, test_size=0.2, random_state=42) # Overparametrized cases always converge to 0 loss so no need for hyperparameter search!
print("Linear shapes:", [arr.shape for arr in lin_splits])

# Polynomial-varied dataset
X_poly, y_poly, degrees = create_poly_varied_dataset(
    n_samples=100, n_features=110, max_degree=4, noise=0.0, random_state=42)
poly_splits = split_data(X_poly, y_poly, val_size=0.01, test_size=0.2, random_state=42)  # Overparametrized cases always converge to 0 loss so no need for hyperparameter search!
print("Poly-varied shapes:", [arr.shape for arr in poly_splits])
print("Sample feature degrees:", degrees[:10])


Linear shapes: [(79, 110), (79,), (1, 110), (1,), (20, 110), (20,)]
Poly-varied shapes: [(79, 110), (79,), (1, 110), (1,), (20, 110), (20,)]
Sample feature degrees: [1 3 4 1 2 1 4 3 2 3]


# **Linear Case Experiments:**

  Gradient Descent convergence

Gradient descent on a convex quadratic converges to the global minimizer (here a zero‐loss solution) as long as you pick a step‐size
$$
\eta < \frac{2}{\lambda_{\max}(X^T X)},
$$
where $\lambda_{\max}(X^T X)$ is the largest eigenvalue of $(X^T X)$.
In the limit of small enough
𝜂
η and enough iterations (full‐batch GD), you will drive the training loss to (arbitrarily close to) zero.

  Training + evaluating all the different kind of regularization

  - Baseline
  - Dropout
  - Weight Decay/L2
  - Gradient Noise Injection
  - ....

In [18]:
from numpy.linalg import svd

X_tr_lin, y_tr_lin, X_val_lin, y_val_lin, X_te_lin, y_te_lin = lin_splits
X_comb = np.vstack([X_tr_lin, X_val_lin])
y_comb = np.concatenate([y_tr_lin, y_val_lin])
n, d = X_comb.shape
rng = np.random.RandomState(42)
scale = 5   # avoids huge outliers
# Amount of initializations
init_ws = rng.uniform(-scale, scale, size=(1, d))
np.save('linear_init_weights.npy', init_ws)

# 3) Compute 95% of max stable step size η₉₅
_, S_comb, _ = svd(X_comb, full_matrices=False)
eta_max = 2.0 / (S_comb[0]**2)
eta_95  = 0.95 * eta_max

###Base line

In [32]:
import numpy as np
from numpy.linalg import svd
def train_linear_gd(X, y, eta, w_init, b_init=0.0,
                    max_epochs=20000, tol=1e-8, patience=100):
    """
    Full-batch GD on y = Xw + b with early stopping.
    w_init: numpy array of shape (d,)
    """
    n, d = X.shape
    w = w_init.copy()
    b = b_init
    best_loss = np.inf
    no_imp = 0
    for epoch in range(max_epochs):
        preds = X.dot(w) + b
        err   = preds - y
        loss  = np.mean(err**2)
        if epoch % 5000 ==0:
            print(epoch)
            print("=")
            print(loss)
        # Early stop check
        if loss + tol < best_loss:
            best_loss, no_imp = loss, 0
        else:
            no_imp += 1
        if best_loss < tol or no_imp >= patience:
            break
        # Gradients
        grad_w = (2.0 / n) * X.T.dot(err)
        grad_b = (2.0 / n) * np.sum(err)
        # Update
        w -= eta * grad_w
        b -= eta * grad_b
    return w, b

def mse_loss(X, y, w, b):
    """Mean squared error."""
    return np.mean((X.dot(w) + b - y)**2)


# 6) Loop over all 100 initializations
test_losses = []
for idx, w0 in enumerate(init_ws, start=1):
    # train from this initialization
    w_star, b_star = train_linear_gd(X_comb, y_comb, eta_95, w_init=w0, b_init=0.0)
    # evaluate on test set
    test_mse = mse_loss(X_te_lin, y_te_lin, w_star, b_star)
    test_losses.append(test_mse)
    print(f"[Init {idx:03d}/100] → Test MSE = {test_mse:.6f}")

# 7) Average test loss
avg_mse = np.mean(test_losses)
print(f"\nAverage Test MSE over 100 inits: {avg_mse:.6e}")

# init_ws is stored in 'linear_init_weights.npy' for later reuse

0
=
2739.812087470125
5000
=
0.006645278810807602
10000
=
4.4369234540546295e-05
15000
=
6.49119147388956e-07
[Init 001/100] → Test MSE = 2319.923831

Average Test MSE over 100 inits: 2.319924e+03


### ASGD case

In [79]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.multiprocessing as mp

def asgd_worker(rank, model, optimizer, shared, train_dataset,
                batch_size, max_epochs, max_no_improve, epsilon, device):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    for epoch in range(max_epochs):
        if shared.stop:
            break
        # one epoch of Hogwild SGD updates
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(Xb)         # preds.shape = [batch, 1]
            loss = nn.MSELoss()(preds, yb)  # yb.shape = [batch, 1]
            loss.backward()
            optimizer.step()
        # rank 0 worker does early‐stop check
        if rank == 0:
            total_loss = 0.0
            count = 0
            for Xall, yall in DataLoader(train_dataset, batch_size=batch_size):
                Xall, yall = Xall.to(device), yall.to(device)
                with torch.no_grad():
                    pred = model(Xall)
                    total_loss += nn.MSELoss(reduction='sum')(pred, yall).item()
                count += yall.size(0)
            train_mse = total_loss / count
            print(f"[Epoch {epoch+1}] Train MSE = {train_mse:.6e}")
            if train_mse + epsilon < shared.best_loss:
                shared.best_loss = train_mse
                shared.epochs_no_improve = 0
            else:
                shared.epochs_no_improve += 1
            if shared.best_loss < epsilon or shared.epochs_no_improve >= max_no_improve:
                shared.stop = True
    print(f"Worker {rank} exiting")


def train_asgd_linear(w0,lin_splits, workers=1, max_epochs=100000,
                      max_no_improve=1000, epsilon=1e-6, device='cpu'):
    # unpack & combine train+val, and UNIFY target shapes to [N,1]
    X_tr, y_tr, X_val, y_val, X_te, y_te = lin_splits
    X_comb = np.vstack([X_tr, X_val])
    y_comb = np.concatenate([y_tr, y_val])[:, None]  # now shape (N,1)
    train_ds = TensorDataset(torch.from_numpy(X_comb).float(),
                             torch.from_numpy(y_comb).float())
    test_ds  = TensorDataset(torch.from_numpy(X_te).float(),
                             torch.from_numpy(y_te[:, None]).float())

    # compute eta_95
    U, S, _ = np.linalg.svd(X_comb, full_matrices=False)
    eta_95 = 0.95 * (2.0 / (S[0]**2))

    # model & optimizer
    input_dim = X_comb.shape[1]
    model = nn.Linear(input_dim, 1, bias=True)
    model.share_memory()
    optimizer = optim.SGD(model.parameters(), lr=eta_95)

    # shared state
    manager = mp.Manager()
    shared = manager.Namespace()
    shared.best_loss = float('inf')
    shared.epochs_no_improve = 0
    shared.stop = False

    mp.set_start_method('fork', force=True)

    batch_size = X_comb.shape[0] # Full batch
    processes = []
    for rank in range(workers):
        p = mp.Process(
            target=asgd_worker,
            args=(
                rank, model, optimizer, shared,
                train_ds, batch_size,
                max_epochs, max_no_improve, epsilon, device
            )
        )
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # test evaluation
    model.eval()
    total_te = 0.0
    with torch.no_grad():
        for Xb, yb in DataLoader(test_ds, batch_size=batch_size):
            preds = model(Xb.to(device))
            total_te += nn.MSELoss(reduction='sum')(preds, yb.to(device)).item()
    test_mse = total_te / len(test_ds)
    print(f"\nASGD Test MSE = {test_mse:.6e}")
    return model, test_mse


workers = 10
test_losses = []
for idx, w0 in enumerate(init_ws, start=1):
    trained_asgd, test_mse = train_asgd_linear(w0,lin_splits, workers)
    test_losses.append(test_mse)
    print(f"[Init {idx:03d}/100] → Test MSE = {test_mse:.6e}")

# 7) Average test loss
avg_mse = np.mean(test_losses)
print(f"\nAverage Test MSE over 100 inits: {avg_mse:.6e}")


[Epoch 1] Train MSE = 6.161611e+02
[Epoch 2] Train MSE = 2.806573e+02
[Epoch 3] Train MSE = 1.898026e+02
[Epoch 4] Train MSE = 8.408646e+01
[Epoch 5] Train MSE = 2.402593e+01
[Epoch 6] Train MSE = 1.411208e+01
[Epoch 7] Train MSE = 6.994892e+00
[Epoch 8] Train MSE = 4.270392e+00
[Epoch 9] Train MSE = 2.833372e+00
[Epoch 10] Train MSE = 1.907298e+00
[Epoch 11] Train MSE = 1.282774e+00
[Epoch 12] Train MSE = 9.800470e-01
[Epoch 13] Train MSE = 7.313037e-01
[Epoch 14] Train MSE = 5.415344e-01
[Epoch 15] Train MSE = 3.803157e-01
[Epoch 16] Train MSE = 2.975742e-01
[Epoch 17] Train MSE = 2.258846e-01
[Epoch 18] Train MSE = 1.780057e-01
[Epoch 19] Train MSE = 1.427797e-01
[Epoch 20] Train MSE = 1.152652e-01
[Epoch 21] Train MSE = 8.474092e-02
[Epoch 22] Train MSE = 6.770584e-02
[Epoch 23] Train MSE = 5.234907e-02
[Epoch 24] Train MSE = 4.034205e-02
[Epoch 25] Train MSE = 3.192861e-02
[Epoch 26] Train MSE = 2.625798e-02
[Epoch 27] Train MSE = 2.196052e-02
[Epoch 28] Train MSE = 1.788813e-02
[

# **Non-Linear Case Experiments:**
  Training + evaluating all the different kind of regularization

  - Baseline
  - Dropout
  - Weight Decay/L2
  - Gradient Noise Injection
  - ....

###Functions

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time
from tqdm import tqdm

# Helper: select activation
def get_activation_fn(name: str):
    name = name.lower()
    if name == 'relu':
        return nn.ReLU()
    elif name == 'sigmoid':
        return nn.Sigmoid()
    elif name == 'tanh':
        return nn.Tanh()
    elif name == 'linear':
        return nn.Identity()
    else:
        raise ValueError("Unsupported activation. Choose from 'relu','sigmoid','tanh','linear'.")

# Linear regression model
class LinearModel(nn.Module):
    def __init__(self, input_dim, bias=True):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1, bias=bias)
    def forward(self, x):
        return self.linear(x).squeeze(-1)  # shape: (batch,)

# MLP for nonlinear cases
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_layers=1, hidden_units=64, activation='relu', bias=True):
        """
        input_dim: number of features
        hidden_layers: number of hidden layers
        hidden_units: units per hidden layer
        activation: nonlinearity name
        bias: whether to use bias terms
        """
        super().__init__()
        act_fn = get_activation_fn(activation)
        layers = []
        in_dim = input_dim
        for _ in range(hidden_layers):
            layers.append(nn.Linear(in_dim, hidden_units, bias=bias))
            layers.append(act_fn)
            in_dim = hidden_units
        layers.append(nn.Linear(in_dim, 1, bias=bias))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(-1)

# Training loop for regression (MSE)
def train_model(model, optimizer, data_loader, epoch, num_epochs, grad_noise_std=0.0, device='cpu'):
    model.train()
    criterion = nn.MSELoss()
    running_loss = 0.0
    for X_batch, y_batch in tqdm(data_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        # optional gradient noise
        if grad_noise_std > 0:
            with torch.no_grad():
                for p in model.parameters():
                    if p.grad is not None:
                        p.grad.add_(torch.randn_like(p.grad) * grad_noise_std)
        optimizer.step()
        running_loss += loss.item() * X_batch.size(0)
    return running_loss / len(data_loader.dataset)

# Evaluation loop (MSE)
def evaluate_model(model, data_loader, device='cpu'):
    model.eval()
    criterion = nn.MSELoss(reduction='sum')
    total_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            preds = model(X_batch)
            total_loss += criterion(preds, y_batch).item()
    return total_loss / len(data_loader.dataset)

# Run experiment wrapper
def run_experiment(name, train_data, val_data, test_data,
                   model_ctor, model_kwargs,
                   learning_rate=1e-3, weight_decay=0.0,
                   batch_size=64, epochs=50, grad_noise_std=0.0,
                   device='cpu'):
    print(f"\n=== Experiment: {name} ===")
    # DataLoaders
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_data,   batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(test_data,  batch_size=batch_size, shuffle=False)

    model = model_ctor(**model_kwargs).to(device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    train_losses = []
    val_losses = []
    start = time.time()
    for epoch in range(epochs):
        train_loss = train_model(model, optimizer, train_loader, epoch, epochs, grad_noise_std, device)
        val_loss = evaluate_model(model, val_loader, device)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"Epoch {epoch+1}/{epochs} - Train MSE: {train_loss:.4f}, Val MSE: {val_loss:.4f}")
    duration = time.time() - start
    print(f"Training finished in {duration:.1f}s")

    test_loss = evaluate_model(model, test_loader, device)
    print(f"Test MSE: {test_loss:.4f}")
    return model, train_losses, val_losses, test_loss

# ASGD (Hogwild!) worker
def train_worker(rank, shared_model, optimizer, dataset, epochs, batch_size, grad_noise_std, device):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    shared_model.to(device)
    for epoch in range(epochs):
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            loss = nn.MSELoss()(shared_model(X_batch), y_batch)
            loss.backward()
            if grad_noise_std > 0:
                with torch.no_grad():
                    for p in shared_model.parameters():
                        if p.grad is not None:
                            p.grad.add_(torch.randn_like(p.grad) * grad_noise_std)
            optimizer.step()
        print(f"Worker {rank} completed epoch {epoch+1}")

def run_asgd(model_ctor, model_kwargs, train_data, learning_rate=1e-3,
             batch_size=64, workers=4, epochs=10, grad_noise_std=0.0, device='cpu'):
    model = model_ctor(**model_kwargs)
    model.share_memory()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    import torch.multiprocessing as mp
    processes = []
    epochs_per_worker = epochs
    for rank in range(workers):
        p = mp.Process(target=train_worker,
                       args=(rank, model, optimizer, train_data, epochs_per_worker, batch_size, grad_noise_std, device))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    return model



## **Hyperparameter tuning for baseline**

Functions have to be changed !

Using Keras for this because its interface is easier for this

In [None]:
!pip install keras_core
!pip install keras-tuner --upgrade

Collecting keras_core
  Downloading keras_core-0.1.7-py3-none-any.whl.metadata (4.3 kB)
Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras_core
Successfully installed keras_core-0.1.7
Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras_core as keras
#########################################
import tensorflow as tf
#from tensorflow.keras.optimizers import Adadelta
import keras
#from tensorflow.keras import layers
#from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import regularizers
#import tensorflow as tf
#from tensorflow import keras
#from tensorflow.keras import layers
from tensorflow.keras.models import clone_model
from tensorflow.keras.models import load_model

In [80]:
X_train_poly, y_train_poly, X_val_poly, y_val_poly, X_test_poly, y_test_poly = poly_splits

In [None]:
# Define the model as a function for Keras Tuner for regression
def build_model(hp):
    net = keras.Sequential()

    # Input layer
    net.add(keras.layers.Input(shape=(X_train_poly.shape[1],)))

    # Define the activation function to be used for all layers
    activation_function = hp.Choice('activation_function', ['relu', 'tanh', 'sigmoid'])

    # No regularization in base line
    #regularization = hp.Float('regularization', min_value=0.0, max_value=0.1, step=0.005)

    # Tune the number of hidden layers
    for i in range(hp.Int('hidden_layers', 2, 5)):
        # Tune the number of units per layer
        units = hp.Int(f'units_in_layer{i}', min_value=32, max_value=256, step=32)
        net.add(keras.layers.Dense(units=units, activation= activation_function, kernel_regularizer=regularizers.l2(0)))

    # Output layer
    net.add(keras.layers.Dense(units=1, activation= activation_function))

    net.compile(
        optimizer=keras.optimizers.SGD(learning_rate=hp.Float('learning_rate', 1e-3, 1e-1, sampling='log')),
        loss='binary_crossentropy',
        metrics=['accuracy']
        )
    return net

# Define the tuner
tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',  # Optimize for validation loss, not accuracy in regression
    max_trials=100,         # Number of hyperparameter combinations to try
    executions_per_trial=1, # Number of times to train each configuration
    directory='/content/optML_mini_project',
    project_name='Hyperparam Search Baseline Model For Non Linear Overparametrized'
)

# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',       # Monitor validation loss for early stopping
    patience=5,               # Number of epochs with no improvement before stopping
    restore_best_weights=True # Restore model weights from the epoch with the best validation loss
)


Reloading Tuner from /content/optML_mini_project/Hyperparam Search Baseline Model For Linear Overparametrized/tuner0.json


**Only run this cell if want training/search again => THIS CAN TAKE A LONG TIME**

In [None]:
# Run tuner search with custom loss history and early stopping
for trial_id in range(tuner.oracle.max_trials):
    tuner.search(
        X_train_poly, y_train_poly,
        validation_data=(X_val_poly, y_val_poly),
        epochs=50,
        batch_size=20,
        callbacks=[early_stopping]
        )

Trial 250 Complete [00h 00m 05s]
val_loss: 1600.9865112304688

Best val_loss So Far: 1409.7673950195312
Total elapsed time: 00h 21m 31s


In [None]:
# Print the summary of the search space
tuner.search_space_summary()

# Print the results of the search
tuner.results_summary()                       #WILL RETURN WRONG LAYER SIZES!!!! STARTS OF RIGHT, BUT FILLED WITH ADDITIONAL BS => check amount of layers it says there is

Search space summary
Default search space size: 1
learning_rate (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 0.1, 'step': None, 'sampling': 'log'}
Results summary
Results in /content/optML_mini_project/Hyperparam Search Baseline Model For Linear Overparametrized
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 045 summary
Hyperparameters:
learning_rate: 0.010881800788534846
Score: 1409.7673950195312

Trial 028 summary
Hyperparameters:
learning_rate: 0.016459331291952827
Score: 1433.366943359375

Trial 024 summary
Hyperparameters:
learning_rate: 0.01411254804624754
Score: 1434.1825561523438

Trial 217 summary
Hyperparameters:
learning_rate: 0.013233340167603112
Score: 1439.9340209960938

Trial 092 summary
Hyperparameters:
learning_rate: 0.015256589990246058
Score: 1443.3583984375

Trial 179 summary
Hyperparameters:
learning_rate: 0.001595935236110003
Score: 1444.2477416992188

Trial 242 summary
Hyperparameters:
learning_rate: 0.0

Save results in GIT

In [None]:
%cd /content/optML_mini_project
!git add --a
!git commit -m "Hyperparameter search for linear baseline model"
!git push

c:\content\optML_mini_project


fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


In [None]:
# Retrieve the best model from the tuner
top_models = tuner.get_best_models(num_models=10)
best_model = tuner.get_best_models(num_models=1)[0]

# Display summaries of the top models
for i, model in enumerate(top_models, start=1):
    print(f"\nModel {i} Summary:")
    model.summary()
# Display the optimizer for each model
for i, model in enumerate(top_models, start=1):
    optimizer_config = model.optimizer.get_config()  # Get optimizer configuration
    print(f"\nModel {i} Optimizer:")
    for key, value in optimizer_config.items():
        print(f"  {key}: {value}")



Model 1 Summary:



Model 2 Summary:



Model 3 Summary:



Model 4 Summary:



Model 5 Summary:



Model 6 Summary:



Model 7 Summary:



Model 8 Summary:



Model 9 Summary:



Model 10 Summary:



Model 1 Optimizer:
  name: SGD
  learning_rate: 0.010881801135838032
  weight_decay: None
  clipnorm: None
  global_clipnorm: None
  clipvalue: None
  use_ema: False
  ema_momentum: 0.99
  ema_overwrite_frequency: None
  loss_scale_factor: None
  gradient_accumulation_steps: None
  momentum: 0.0
  nesterov: False

Model 2 Optimizer:
  name: SGD
  learning_rate: 0.016459330916404724
  weight_decay: None
  clipnorm: None
  global_clipnorm: None
  clipvalue: None
  use_ema: False
  ema_momentum: 0.99
  ema_overwrite_frequency: None
  loss_scale_factor: None
  gradient_accumulation_steps: None
  momentum: 0.0
  nesterov: False

Model 3 Optimizer:
  name: SGD
  learning_rate: 0.014112547971308231
  weight_decay: None
  clipnorm: None
  global_clipnorm: None
  clipvalue: None
  use_ema: False
  ema_momentum: 0.99
  ema_overwrite_frequency: None
  loss_scale_factor: None
  gradient_accumulation_steps: None
  momentum: 0.0
  nesterov: False

Model 4 Optimizer:
  name: SGD
  learning_rate: 0.0

If need to save the best model