# Week 04 — Regularization & Validation

This notebook covers techniques to control overfitting and select models robustly. You'll:
- Understand and implement L1/L2 regularization
- Master cross-validation strategies
- Apply time-series aware validation techniques

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, ElasticNet
from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Seed NumPy's legacy global RNG so that draws made through np.random.* later
# in the notebook repeat on a fresh Restart & Run All.
SEED = 42
np.random.seed(SEED)
print("Libraries imported!")

## 1. Ridge vs Lasso Regularization

Compare L2 (Ridge) and L1 (Lasso) regularization on synthetic data.

In [None]:
# Generate a synthetic regression problem: 50 features, only 10 of which are
# informative. NOTE: with the default effective_rank=None, make_regression
# draws well-conditioned (independent) Gaussian features, so the columns are
# NOT correlated with each other.
n_samples = 100
n_features = 50
X, y = make_regression(n_samples=n_samples, n_features=n_features,
                       n_informative=10, noise=10, random_state=42)

# Standardize features so the penalty acts on every coefficient at the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Ridge and Lasso with the same regularization strength
alpha = 1.0
ridge = Ridge(alpha=alpha)
lasso = Lasso(alpha=alpha)

ridge.fit(X_scaled, y)
lasso.fit(X_scaled, y)

# Coefficients whose magnitude is at or below this tolerance are counted as zero.
# Computed once and reused by the titles and the printed summary.
coef_tol = 1e-3
ridge_nonzero = np.sum(np.abs(ridge.coef_) > coef_tol)
lasso_nonzero = np.sum(np.abs(lasso.coef_) > coef_tol)

# Compare coefficients: one stem plot per model plus a coefficient-vs-coefficient scatter
fig, (ax_ridge, ax_lasso, ax_both) = plt.subplots(1, 3, figsize=(14, 5))

ax_ridge.stem(ridge.coef_, linefmt='b-', markerfmt='bo', basefmt=' ')
ax_ridge.set_title(f'Ridge Coefficients (L2)\nNon-zero: {ridge_nonzero}')

ax_lasso.stem(lasso.coef_, linefmt='r-', markerfmt='ro', basefmt=' ')
ax_lasso.set_title(f'Lasso Coefficients (L1)\nNon-zero: {lasso_nonzero}')

for ax in (ax_ridge, ax_lasso):
    ax.set_xlabel('Feature Index')
    ax.set_ylabel('Coefficient Value')
    ax.grid(alpha=0.3)

ax_both.scatter(ridge.coef_, lasso.coef_, alpha=0.6)
ax_both.set_xlabel('Ridge Coefficients')
ax_both.set_ylabel('Lasso Coefficients')
ax_both.set_title('Ridge vs Lasso')
ax_both.grid(alpha=0.3)
# Zero reference lines make the coefficients Lasso set to zero easy to spot
ax_both.axhline(0, color='k', linestyle='--', linewidth=0.5)
ax_both.axvline(0, color='k', linestyle='--', linewidth=0.5)

fig.tight_layout()
plt.show()

print(f"Ridge: {ridge_nonzero} non-zero coefficients")
print(f"Lasso: {lasso_nonzero} non-zero coefficients (feature selection!)")

## 2. Cross-Validation for Hyperparameter Selection

Use k-fold cross-validation to select the regularization strength (alpha).

In [None]:
# Candidate regularization strengths: 50 values log-spaced from 1e-4 to 1e2
alphas = np.logspace(-4, 2, 50)

# Built-in CV estimators. When an explicit cv is given, RidgeCV ranks alphas
# by R^2 unless a scorer is supplied, so request negative MSE here: that makes
# the selected alpha consistent with LassoCV's MSE criterion and with the MSE
# validation curves plotted below.
# NOTE: X_scaled was standardized on the full dataset before any splitting, so
# the folds share scaling statistics; wrap the scaler and model in a Pipeline
# if leak-free estimates are required.
ridge_cv = RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error')
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000)

ridge_cv.fit(X_scaled, y)
lasso_cv.fit(X_scaled, y)

print(f"Ridge optimal alpha: {ridge_cv.alpha_:.4f}")
print(f"Lasso optimal alpha: {lasso_cv.alpha_:.4f}")


def mean_cv_mse(estimator):
    """Return the mean 5-fold cross-validated MSE of `estimator` on (X_scaled, y)."""
    fold_scores = cross_val_score(estimator, X_scaled, y, cv=5,
                                  scoring='neg_mean_squared_error')
    # The scorer returns negated MSE (higher is better); flip the sign back.
    return -fold_scores.mean()


# Manual CV to visualize validation curves. Comprehension-local names are used
# so the fitted `ridge` / `lasso` models and `alpha` from earlier cells are not
# overwritten.
ridge_scores = [mean_cv_mse(Ridge(alpha=a)) for a in alphas]
lasso_scores = [mean_cv_mse(Lasso(alpha=a, max_iter=10000)) for a in alphas]

# Plot validation curves, one panel per model, with the CV-selected alpha marked
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    ('Ridge', 'b', ridge_scores, ridge_cv.alpha_),
    ('Lasso', 'r', lasso_scores, lasso_cv.alpha_),
]
for ax, (model_name, color, cv_errors, best_alpha) in zip(axes, panels):
    ax.semilogx(alphas, cv_errors, f'{color}-', linewidth=2, label=model_name)
    ax.axvline(best_alpha, color=color, linestyle='--', label=f'Optimal: {best_alpha:.4f}')
    ax.set_xlabel('Alpha (regularization strength)')
    ax.set_ylabel('CV Mean Squared Error')
    ax.set_title(f'{model_name} Validation Curve')
    ax.legend()
    ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

## 3. Time-Series Cross-Validation

Implement walk-forward validation for time-series data.

In [None]:
# Generate synthetic time-series data: linear trend + sinusoid of period 2 + Gaussian noise
n_samples_ts = 200
time = np.linspace(0, 10, n_samples_ts)
trend = 0.5 * time
seasonal = 2 * np.sin(2 * np.pi * time / 2)
# Draw the noise from a dedicated, locally seeded generator (same seed value as
# the notebook-wide seed) so that re-running this cell reproduces the same
# series no matter what other cells have drawn from the global np.random state.
rng = np.random.RandomState(42)
noise = rng.randn(n_samples_ts) * 0.5
y_ts = trend + seasonal + noise

# Create lag features
def create_lag_features(y, n_lags=5):
    """Build a lagged design matrix from a 1-D series.

    Row t of the returned matrix holds the n_lags values that immediately
    precede the target y[n_lags + t], most recent first: column 0 is lag 1,
    column 1 is lag 2, and so on.

    Parameters
    ----------
    y : sequence
        The series to lag.
    n_lags : int, default 5
        Number of past values used as features.

    Returns
    -------
    X : ndarray of shape (len(y) - n_lags, n_lags)
        Lag features.
    target : same type as y
        y[n_lags:], aligned row-for-row with X.
    """
    n_rows = len(y) - n_lags
    X = np.zeros((n_rows, n_lags))
    for lag in range(1, n_lags + 1):
        # The window for lag k starts k positions before the first target
        start = n_lags - lag
        X[:, lag - 1] = y[start:start + n_rows]
    return X, y[n_lags:]

# Turn the series into a supervised problem: 5 lagged values predict the next one
X_ts, y_ts_target = create_lag_features(y_ts, n_lags=5)

# Splitter whose test indices always come after the training indices
tscv = TimeSeriesSplit(n_splits=5)

print("Time Series Cross-Validation Splits:")
for split_no, (fit_idx, eval_idx) in enumerate(tscv.split(X_ts), start=1):
    print(f"Split {split_no}: Train size={len(fit_idx)}, Test size={len(eval_idx)}")

# Score a Ridge model on every split; the scorer returns negated MSE,
# so the sign is flipped back when reporting.
model = Ridge(alpha=1.0)
scores = cross_val_score(
    model,
    X_ts,
    y_ts_target,
    cv=tscv,
    scoring='neg_mean_squared_error',
)
print(f"\nTime-series CV MSE scores: {-scores}")
print(f"Mean CV MSE: {-scores.mean():.4f} +/- {scores.std():.4f}")

# Draw one horizontal strip per split: blue marks training indices, red marks test indices
fig, axes = plt.subplots(tscv.n_splits, 1, figsize=(12, 6), squeeze=False)
axes = axes.ravel()
last_row = tscv.n_splits - 1

for row, (ax, (train_idx, test_idx)) in enumerate(zip(axes, tscv.split(X_ts))):
    is_first = row == 0
    ax.plot(train_idx, np.ones(len(train_idx)) * row, 'b-', linewidth=10,
            label='Train' if is_first else '')
    ax.plot(test_idx, np.ones(len(test_idx)) * row, 'r-', linewidth=10,
            label='Test' if is_first else '')
    ax.set_ylabel(f'Split {row+1}')
    ax.set_yticks([])
    if is_first:
        # A single legend on the top strip is enough for the whole figure
        ax.legend(loc='upper right')
    if row < last_row:
        # Only the bottom strip keeps its x tick labels
        ax.set_xticks([])

axes[-1].set_xlabel('Sample Index')
fig.suptitle('Time Series Cross-Validation Splits (Walk-Forward)', y=1.02)
fig.tight_layout()
plt.show()

## 4. Elastic Net: Combining L1 and L2

Explore Elastic Net which combines Ridge and Lasso regularization.

In [None]:
# Elastic Net: combines L1 and L2 penalties.
# scikit-learn minimizes
#   1 / (2 * n_samples) * ||y - Xw||^2_2
#   + alpha * l1_ratio * ||w||_1
#   + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2

l1_ratios = [0.1, 0.5, 0.9]  # 0 = pure L2 penalty (Ridge-like), 1 = pure L1 (Lasso)
alpha = 0.1

# One panel per l1_ratio; the grid is sized from the list so that adding or
# removing ratios does not require touching the plotting code.
fig, axes = plt.subplots(1, len(l1_ratios), figsize=(15, 4), squeeze=False)
for ax, l1_ratio in zip(axes.ravel(), l1_ratios):
    elastic = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10000)
    elastic.fit(X_scaled, y)

    # Same 1e-3 magnitude threshold for "non-zero" as the Ridge/Lasso comparison
    n_nonzero = np.sum(np.abs(elastic.coef_) > 1e-3)

    ax.stem(elastic.coef_, linefmt='g-', markerfmt='go', basefmt=' ')
    ax.set_title(f'Elastic Net (l1_ratio={l1_ratio})\nNon-zero: {n_nonzero}')
    ax.set_xlabel('Feature Index')
    ax.set_ylabel('Coefficient Value')
    ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

## Exercises for Further Practice

1. **Implement Coordinate Descent**: Implement Lasso using the coordinate descent algorithm
2. **Regularization Path**: Plot the full regularization path showing how coefficients change with alpha
3. **Early Stopping**: Implement early stopping as a form of regularization
4. **Dropout Simulation**: Simulate dropout on linear models
5. **Real Financial Data**: Apply time-series CV to stock price prediction

## Deliverables Checklist

- [ ] Ridge vs Lasso comparison with feature selection demonstration
- [ ] Cross-validation experiments with validation curves
- [ ] Time-series CV implementation and visualization
- [ ] Notebook with clear recommendations for choosing regularization methods

## Recommended Resources

- Hastie, Tibshirani, Friedman: "Elements of Statistical Learning" (Regularization chapter)
- scikit-learn documentation: linear models and cross-validation
- "An Introduction to Statistical Learning" (ISLR) Chapter on Ridge/Lasso