In [None]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import compose, datasets, linear_model, metrics, model_selection
from sklearn import preprocessing, pipeline

# Preprocessing with Scikit-Learn

In [None]:
# Load the diabetes data as pandas objects with the raw (unscaled) feature values.
features, targets = datasets.load_diabetes(return_X_y=True, as_frame=True, scaled=False)

In [None]:
# Peek at the first few rows of the feature table.
features.head()

In [None]:
# Column dtypes and non-null counts for the feature table.
features.info()

In [None]:
# Summary statistics for the regression target.
targets.describe()

## Train-test split

In [None]:
# Seeded generator: re-running this cell always yields the same split.
prng = np.random.RandomState(42)

# Hold out 10% of the rows for testing; the remaining 90% are used for training.
train_features, test_features, train_targets, test_targets = model_selection.train_test_split(
    features,
    targets,
    test_size=0.1,
    random_state=prng,
)

## Feature Preprocessing

In [None]:
# One-hot encode "sex", dropping the first category so no redundant dummy
# column is produced. Every other column is dropped by this transformer.
transformer_0 = compose.make_column_transformer(
    (
        preprocessing.OneHotEncoder(
            drop="first",
            dtype=np.uint8,
            sparse_output=False,
        ),
        ["sex"]
    ),
    remainder="drop",
    verbose=True,
    verbose_feature_names_out=False
)


# Standardise the numeric columns. "s6" was previously missing from this list;
# because both transformers use remainder="drop", it was silently discarded
# and never reached the model.
transformer_1 = compose.make_column_transformer(
    (
        preprocessing.StandardScaler(),
        ["age", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"]
    ),
    remainder="drop",
    verbose=True,
    verbose_feature_names_out=False
)

# Concatenate the outputs of both transformers column-wise and return a
# pandas DataFrame instead of a bare NumPy array.
features_preprocessor = pipeline.make_union(
    transformer_0,
    transformer_1,
    verbose=True,
    n_jobs=-1
).set_output(transform="pandas")

In [None]:
# Display the preprocessor so its structure can be inspected.
features_preprocessor

In [None]:
# Sanity check: fit the preprocessor on the training features and show the transformed result.
features_preprocessor.fit_transform(train_features)

In [None]:
# Model the target on a log scale; np.exp is supplied as the inverse so
# values can be mapped back to the original scale.
target_preprocessor = preprocessing.FunctionTransformer(np.log, inverse_func=np.exp)

In [None]:
# Display the target transformer.
target_preprocessor

In [None]:
# Sanity check: apply the log transform to the training targets and show the result.
target_preprocessor.fit_transform(train_targets)

## Model training

### Using LinearRegression

In [None]:
# Ordinary least squares wrapped so that the target passes through
# `target_preprocessor` (log / exp) around the regressor.
_regressor = compose.TransformedTargetRegressor(
    transformer=target_preprocessor,
    regressor=linear_model.LinearRegression(),
)

# Full model: feature preprocessing followed by the wrapped regressor.
linear_regression_pipeline = pipeline.make_pipeline(features_preprocessor, _regressor)

In [None]:
%%timeit
# Timing run only: %%timeit repeats the fit to measure how long it takes.
_ = linear_regression_pipeline.fit(train_features, train_targets)

In [None]:
# Fit the pipeline on the training split; assigning to `_` keeps the fitted estimator from being displayed.
_ = linear_regression_pipeline.fit(train_features, train_targets)

In [None]:
# Training-set RMSE for the linear-regression pipeline.
# The square root is taken explicitly instead of passing `squared=False`,
# which is deprecated in scikit-learn 1.4 and removed in 1.6.
train_predictions = linear_regression_pipeline.predict(train_features)
train_rmse = np.sqrt(
    metrics.mean_squared_error(train_targets, train_predictions)
)
print(f"Training rmse: {train_rmse}")

In [None]:
# Held-out RMSE for the linear-regression pipeline.
# The square root is taken explicitly instead of passing `squared=False`,
# which is deprecated in scikit-learn 1.4 and removed in 1.6.
test_predictions = linear_regression_pipeline.predict(test_features)
test_rmse = np.sqrt(
    metrics.mean_squared_error(test_targets, test_predictions)
)
print(f"Testing rmse: {test_rmse}")

### Using SGDRegressor

In [None]:
# SGD-trained linear model wrapped so that the target passes through
# `target_preprocessor` (log / exp) around the regressor.
# SGDRegressor shuffles the training data between epochs, so it is given a
# fixed integer seed: without one, the fitted model and the reported RMSE
# change on every run. An integer is used rather than a shared RandomState
# instance so that every fit (including each %%timeit repetition) starts
# from the same state.
_regressor = compose.TransformedTargetRegressor(
    regressor=linear_model.SGDRegressor(random_state=42),
    transformer=target_preprocessor
)

# Full model: feature preprocessing followed by the wrapped regressor.
sgd_regressor_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    _regressor
)

In [None]:
# Display the assembled SGD pipeline.
sgd_regressor_pipeline

In [None]:
%%timeit
# Timing run only: %%timeit repeats the fit to measure how long it takes.
_ = sgd_regressor_pipeline.fit(train_features, train_targets)

In [None]:
# Fit the pipeline on the training split; assigning to `_` keeps the fitted estimator from being displayed.
_ = sgd_regressor_pipeline.fit(train_features, train_targets)

In [None]:
# Training-set RMSE for the SGD pipeline.
# The square root is taken explicitly instead of passing `squared=False`,
# which is deprecated in scikit-learn 1.4 and removed in 1.6.
train_predictions = sgd_regressor_pipeline.predict(train_features)
train_rmse = np.sqrt(
    metrics.mean_squared_error(train_targets, train_predictions)
)
print(f"Training rmse: {train_rmse}")

In [None]:
# Held-out RMSE for the SGD pipeline.
# The square root is taken explicitly instead of passing `squared=False`,
# which is deprecated in scikit-learn 1.4 and removed in 1.6.
test_predictions = sgd_regressor_pipeline.predict(test_features)
test_rmse = np.sqrt(
    metrics.mean_squared_error(test_targets, test_predictions)
)
print(f"Testing rmse: {test_rmse}")

### Exercise

Compare the training loss and the testing loss. Is the model underfitting or overfitting? How can you tell?