In [None]:
## L2-regularized linear regression via stochastic gradient descent

In [2]:
# Considering the function designed in question 4
import numpy as np

def ridge_regression_sgd(X_train, y_train, X_val, y_val, alpha_vals, lr_vals, batch_sizes, epochs):
    """
    Trains an L2-regularized linear regression model using SGD.

    Parameters:
    - X_train: Training features (n_samples, n_features)
    - y_train: Training labels (n_samples,)
    - X_val: Validation features (n_samples, n_features)
    - y_val: Validation labels (n_samples,)
    - alpha_vals: List of regularization strengths to try
    - lr_vals: List of learning rates to try
    - batch_sizes: List of batch sizes to try
    - epochs: Number of epochs to run for each combination

    Returns:
    - Best weights and bias based on validation loss
    """

    best_w, best_b = None, None
    best_mse = float("inf")
    
    for alpha in alpha_vals:
        for lr in lr_vals:
            for batch_size in batch_sizes:
                # Initialize weights and bias
                w = np.zeros(X_train.shape[1])
                b = 0
                
                for epoch in range(epochs):
                    # Shuffle data
                    indices = np.arange(X_train.shape[0])
                    np.random.shuffle(indices)
                    X_train, y_train = X_train[indices], y_train[indices]
                    
                    # Mini-batch training
                    for i in range(0, X_train.shape[0], batch_size):
                        X_batch = X_train[i:i+batch_size]
                        y_batch = y_train[i:i+batch_size]

                        # Compute predictions
                        preds = X_batch.dot(w) + b
                        errors = preds - y_batch

                        # Compute gradients
                        grad_w = (2 / batch_size) * (X_batch.T.dot(errors)) + 2 * alpha * w
                        grad_b = (2 / batch_size) * np.sum(errors)

                        # Update parameters
                        w -= lr * grad_w
                        b -= lr * grad_b

                # Compute validation loss (unregularized MSE)
                val_preds = X_val.dot(w) + b
                val_mse = np.mean((y_val - val_preds) ** 2)

                # Update best parameters if current config is better
                if val_mse < best_mse:
                    best_mse = val_mse
                    best_w, best_b = w, b

    return best_w, best_b, best_mse

In [16]:
# Load dataset 
X_train = np.load("data/age_regression_Xtr.npy")
X_train = X_train.reshape(X_train.shape[0], -1)  # reshape to 2D
y_train = np.load("data/age_regression_ytr.npy")

In [18]:
# Split train and validation sets (80/20)
split_idx = int(0.8 * X_train.shape[0])
X_val, y_val = X_train[split_idx:], y_train[split_idx:]
X_train, y_train = X_train[:split_idx], y_train[:split_idx]

In [19]:
# Define hyperparameter values
alpha_vals = [0.01, 0.1, 1, 10]
lr = [0.001, 0.01, 0.1]
batch = [8, 16, 32]
epochs = 100

In [None]:
# Train model
best_w, best_b, best_mse = ridge_regression_sgd(X_train, y_train, X_val, y_val, alpha_vals, lr, batch, epochs)

# Print best results
print("Best weights:", best_w)
print("Best bias:", best_b)
print("Best validation MSE:", best_mse)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  w -= lr * grad_w
  b -= lr * grad_b


Best weights: [ 1.13468049  0.75873297  0.4128094  ... -0.25656862 -0.18807008
 -0.2141979 ]
Best bias: 36.5783102349734
Best validation MSE: 173.18378906888458
