In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Synthetic regression problem: 100 samples x 5 features, a known linear signal plus Gaussian noise
np.random.seed(42)
X = np.random.rand(100, 5)
true_coefficients = np.array([1.5, -2.0, 1.0, 0.5, -1.0])
# The noise is drawn after X so the global random stream is consumed in the same order as before
noise = np.random.normal(loc=0, scale=0.1, size=100)
y = X @ true_coefficients + noise

# Hold out 40% of the samples, then halve that hold-out into validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Candidate regularization strengths: 100 log-spaced values from 1e-4 to 1e4
lambdas = np.logspace(start=-4, stop=4, num=100)
# Filled in with one validation MSE per candidate lambda
validation_mse = list()

# Ridge regression analytical solution
def ridge_regression(X, y, lam):
    """Return the closed-form ridge regression coefficients.

    Solves the regularized normal equations
    ``(X.T @ X + lam * I) @ beta = X.T @ y`` for ``beta``. No intercept
    term is added; every coefficient is penalized equally.

    Parameters
    ----------
    X : ndarray
        Design matrix; its second axis is taken as the feature axis.
    y : ndarray
        Targets, with one entry (or row) per row of ``X``.
    lam : float
        Regularization strength multiplying the identity matrix.

    Returns
    -------
    ndarray
        Coefficients with one entry (or row) per feature.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized system matrix is singular.
    """
    n_features = X.shape[1]
    regularized_gram = X.T @ X + lam * np.eye(n_features)
    # Solve the linear system directly rather than forming an explicit
    # inverse: it is cheaper and numerically more accurate, which matters
    # most when lam is tiny and the system is poorly conditioned.
    return np.linalg.solve(regularized_gram, X.T @ y)

# Score each candidate lambda: fit on the training split, measure MSE on the validation split
for lam in lambdas:
    beta = ridge_regression(X_train, y_train, lam)
    y_val_pred = np.matmul(X_validation, beta)
    mse = mean_squared_error(y_validation, y_val_pred)
    validation_mse.append(mse)

# Keep the lambda whose validation error is lowest (the first one wins on ties)
best_index = int(np.argmin(validation_mse))
optimal_lambda = lambdas[best_index]
print(f"Optimal lambda: {optimal_lambda}")

# Refit with the selected lambda on the training and validation samples together
X_combined = np.concatenate([X_train, X_validation], axis=0)
y_combined = np.concatenate([y_train, y_validation])
final_beta = ridge_regression(X_combined, y_combined, optimal_lambda)

# Report the error on the held-out test split, which played no part in fitting or tuning
y_test_pred = np.matmul(X_test, final_beta)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f"Test MSE: {test_mse}")


Optimal lambda: 0.09770099572992257
Test MSE: 0.007625739346510332


In [8]:
# Sanity check: show the dimensions of the feature matrix
print(np.shape(X))

(100, 5)
