In [None]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import compose, datasets, linear_model, metrics, model_selection
from sklearn import preprocessing, pipeline

# Regularized Linear Models

In [None]:
# Load the diabetes dataset as pandas objects, keeping the unscaled feature values.
features, targets = datasets.load_diabetes(scaled=False, as_frame=True, return_X_y=True)

## Train-test split

In [None]:
# Seeded generator; also passed as `random_state` to estimators later in the notebook.
prng = np.random.RandomState(42)

# Hold out 10% of the rows as the test set.
train_features, test_features, train_targets, test_targets = model_selection.train_test_split(
    features, targets, test_size=0.1, random_state=prng
)

## Feature Preprocessing

In [None]:
# Encode the "sex" column as uint8 indicator columns, dropping the first category.
transformer_1 = compose.make_column_transformer(
    (preprocessing.OneHotEncoder(drop="first", sparse_output=False, dtype=np.uint8), ["sex"]),
    verbose_feature_names_out=False,
    verbose=True,
    remainder="drop",
)

# Standardize the listed numeric columns.
# NOTE(review): any column not listed here (e.g. "s6", if present in the data)
# is discarded by remainder="drop" -- confirm that this is intended.
transformer_2 = compose.make_column_transformer(
    (preprocessing.StandardScaler(), ["age", "bmi", "bp", "s1", "s2", "s3", "s4", "s5"]),
    verbose_feature_names_out=False,
    verbose=True,
    remainder="drop",
)

# Concatenate the outputs of both transformers; results come back as a DataFrame.
features_preprocessor = (
    pipeline.make_union(transformer_1, transformer_2, n_jobs=-1, verbose=True)
    .set_output(transform="pandas")
)

In [None]:
# Bare expression: the notebook renders the composed preprocessor as this cell's output.
features_preprocessor

## Target Preprocessing

In [None]:
# Log transform for the targets, with exp as its inverse
# (used through TransformedTargetRegressor in the model cells).
target_preprocessor = preprocessing.FunctionTransformer(np.log, inverse_func=np.exp)

In [None]:
# Bare expression: the notebook renders the target transformer as this cell's output.
target_preprocessor

## Feature Engineering

In [None]:
# Degree-2 polynomial expansion (squares and pairwise products) without a bias column;
# output is returned as a DataFrame.
feature_engineering = (
    preprocessing.PolynomialFeatures(2, interaction_only=False, include_bias=False)
    .set_output(transform="pandas")
)

In [None]:
# Bare expression: the notebook renders the polynomial-features step as this cell's output.
feature_engineering

## Model training

### Using ElasticNet

In [None]:
# ElasticNet wrapped so that it is fitted on targets transformed by `target_preprocessor`.
_regressor = compose.TransformedTargetRegressor(
    regressor=linear_model.ElasticNet(
        alpha=1e-3,
        l1_ratio=0.5,
        fit_intercept=True,
        max_iter=4096,
        random_state=prng,
    ),
    transformer=target_preprocessor,
)

# Full pipeline: feature preprocessing -> polynomial expansion -> regressor.
elastic_net_pipeline = pipeline.make_pipeline(features_preprocessor, feature_engineering, _regressor)

In [None]:
# Fit on the training split; assigning to `_` keeps the fitted pipeline out of the cell output.
_ = elastic_net_pipeline.fit(X=train_features, y=train_targets)

In [None]:
# In-sample error of the ElasticNet pipeline.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every release.
train_predictions = elastic_net_pipeline.predict(train_features)
train_rmse = np.sqrt(metrics.mean_squared_error(train_targets, train_predictions))
print(f"Training rmse: {train_rmse}")

### Using SGDRegressor

In [None]:
# IPython `?` syntax (not plain Python): shows the documentation for SGDRegressor.
linear_model.SGDRegressor?

In [None]:
# SGD-trained linear model with an elastic-net penalty, fitted on targets
# transformed by `target_preprocessor`.
_regressor = compose.TransformedTargetRegressor(
    linear_model.SGDRegressor(
        penalty="elasticnet",
        alpha=1e-3,
        l1_ratio=0.5,
        fit_intercept=True,
        # SGD shuffles the training data between epochs by default; seeding it
        # (as the ElasticNet cells do) keeps fits, CV scores and grid-search
        # rankings reproducible from run to run.
        random_state=prng,
    ),
    transformer=target_preprocessor
)

# Full pipeline: feature preprocessing -> polynomial expansion -> regressor.
sgd_regressor_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    _regressor
)

In [None]:
# Fit on the training split; assigning to `_` keeps the fitted pipeline out of the cell output.
_ = sgd_regressor_pipeline.fit(X=train_features, y=train_targets)

In [None]:
# In-sample error of the SGDRegressor pipeline.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every release.
train_predictions = sgd_regressor_pipeline.predict(train_features)
train_rmse = np.sqrt(metrics.mean_squared_error(train_targets, train_predictions))
print(f"Training rmse: {train_rmse}")

## Model assessment

In [None]:
# 5-fold cross-validated negative MSE of the ElasticNet pipeline on the training split.
cv_neg_mses = model_selection.cross_val_score(
    estimator=elastic_net_pipeline,
    X=train_features,
    y=train_targets,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Scores are negated MSEs: flip the sign of the fold average, then take the square root.
cv_rmse = np.sqrt(-cv_neg_mses.mean())
print(f"ElasticNet CV rmse: {cv_rmse}")

In [None]:
# 5-fold cross-validated negative MSE of the SGDRegressor pipeline on the training split.
cv_neg_mses = model_selection.cross_val_score(
    estimator=sgd_regressor_pipeline,
    X=train_features,
    y=train_targets,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Scores are negated MSEs: flip the sign of the fold average, then take the square root.
cv_rmse = np.sqrt(-cv_neg_mses.mean())
print(f"SGDRegressor CV rmse: {cv_rmse}")

### Exercise:

Is our current model under-fitting or over-fitting? How can you tell? What can you do to fix the problem?

## Tuning model performance

### Using ElasticNetCV

[ElasticNetCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html) is an example of a [cross-validation estimator](https://scikit-learn.org/stable/glossary.html#term-cross-validation-estimator). Cross-validation estimators are named `EstimatorCV` and tend to be roughly equivalent to `GridSearchCV(Estimator(), ...)`. The advantage of using a cross-validation estimator over the canonical estimator class combined with grid search is that it can take advantage of warm-starting by reusing results computed in earlier steps of the cross-validation process.

When `fit` is called, the best values of `l1_ratio` and `alpha` are first found through cross-validation; the model is then refit on the entire training set using those values.

In [None]:
# IPython `?` syntax (not plain Python): shows the documentation for ElasticNetCV.
linear_model.ElasticNetCV?

In [None]:
# ElasticNetCV searches the given `alphas` x `l1_ratio` grid with 5-fold CV;
# it is wrapped so that it is fitted on targets transformed by `target_preprocessor`.
regressor_cv = compose.TransformedTargetRegressor(
    regressor=linear_model.ElasticNetCV(
        l1_ratio=np.logspace(-1, 0, 10),
        alphas=np.logspace(-4, 0, 10),
        eps=1e-3,
        cv=5,
        fit_intercept=True,
        max_iter=8192,
        selection="random",
        random_state=prng,
        n_jobs=-1,
        verbose=0,
    ),
    transformer=target_preprocessor,
)

# Full pipeline: feature preprocessing -> polynomial expansion -> tuned regressor.
tuned_elastic_net_pipeline = pipeline.make_pipeline(
    features_preprocessor, feature_engineering, regressor_cv, verbose=True
)

In [None]:
# Bare expression: the notebook renders the pipeline as this cell's output.
tuned_elastic_net_pipeline

In [None]:
# Fit on the training split; assigning to `_` keeps the fitted pipeline out of the cell output.
_ = tuned_elastic_net_pipeline.fit(X=train_features, y=train_targets)

In [None]:
# Regularization strength (alpha) chosen by ElasticNetCV during fitting.
tuned_elastic_net_pipeline["transformedtargetregressor"].regressor_.alpha_

In [None]:
# L1/L2 mixing ratio chosen by ElasticNetCV during fitting.
tuned_elastic_net_pipeline["transformedtargetregressor"].regressor_.l1_ratio_

### Using GridSearchCV and SGDRegressor

In [None]:
# Exhaustive 5-fold grid search over the SGD regressor's penalty strength and
# L1/L2 mix; the best candidate is refit on all the data passed to `fit`.
tuned_sgd_regressor_pipeline = model_selection.GridSearchCV(
    estimator=sgd_regressor_pipeline,
    param_grid={
        "transformedtargetregressor__regressor__alpha": np.logspace(-4, 0, 10),
        "transformedtargetregressor__regressor__l1_ratio": np.logspace(-1, 0, 10),
    },
    scoring="neg_mean_squared_error",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Bare expression: the notebook renders the grid-search object as this cell's output.
tuned_sgd_regressor_pipeline

In [None]:
# Run the grid search on the training split; assigning to `_` keeps the result out of the cell output.
_ = tuned_sgd_regressor_pipeline.fit(X=train_features, y=train_targets)

In [None]:
# Display the parameter combination selected by the grid search.
tuned_sgd_regressor_pipeline.best_params_

## Assessing performance of the tuned model

In [None]:
# In-sample error of the tuned ElasticNetCV pipeline.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every release.
train_predictions = tuned_elastic_net_pipeline.predict(train_features)
train_rmse = np.sqrt(metrics.mean_squared_error(train_targets, train_predictions))
print(f"ElasticNetCV training rmse: {train_rmse}")

In [None]:
# Held-out error of the tuned ElasticNetCV pipeline.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every release.
test_predictions = tuned_elastic_net_pipeline.predict(test_features)
test_rmse = np.sqrt(metrics.mean_squared_error(test_targets, test_predictions))
print(f"ElasticNetCV testing rmse: {test_rmse}")

In [None]:
# In-sample error of the grid-searched SGDRegressor pipeline.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every release.
train_predictions = tuned_sgd_regressor_pipeline.predict(train_features)
train_rmse = np.sqrt(metrics.mean_squared_error(train_targets, train_predictions))
print(f"GridSearchCV + SGDRegressor training rmse: {train_rmse}")

In [None]:
# Held-out error of the grid-searched SGDRegressor pipeline.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and later removed,
# whereas this form works on every release.
test_predictions = tuned_sgd_regressor_pipeline.predict(test_features)
test_rmse = np.sqrt(metrics.mean_squared_error(test_targets, test_predictions))
print(f"GridSearchCV + SGDRegressor Testing rmse: {test_rmse}")

### Exercise

Which of the two models should you prefer? Why?