In [None]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import compose, datasets, linear_model, metrics, model_selection
from sklearn import preprocessing, pipeline

# Linear Regression with Scikit-Learn

In [None]:
# Load the diabetes regression dataset as pandas objects: `features` holds the
# predictor columns and `targets` the response; scaled=True asks for the
# pre-scaled version of the feature columns.
features, targets = datasets.load_diabetes(return_X_y=True, as_frame=True, scaled=True)

In [None]:
# Preview the first rows of the feature table.
features.head()

In [None]:
# Print a per-column summary of the feature table (dtypes and non-null counts).
features.info()

In [None]:
# Summary statistics of the regression target.
targets.describe()

## Train-test split

In [None]:
# Fixed-seed generator, created in the same cell as the split so that
# re-running the cell always reproduces the same shuffle.
prng = np.random.RandomState(42)

# Hold out 10% of the rows for testing; the rest is used for training.
train_features, test_features, train_targets, test_targets = model_selection.train_test_split(
    features, targets, test_size=0.1, random_state=prng
)

## Model training

### Using LinearRegression

In [None]:
# Linear regression estimator with default settings; it is fitted in a later cell.
linear_regression = linear_model.LinearRegression()

In [None]:
# Show the estimator's representation as the cell output.
linear_regression

In [None]:
%%timeit
# Benchmark: time repeated fits of the estimator on the training split.
_ = linear_regression.fit(train_features, train_targets)

In [None]:
# Fit on the training split. The trailing semicolon hides the estimator repr
# without rebinding `_`, which IPython reserves for the previous cell's output.
linear_regression.fit(train_features, train_targets);

In [None]:
# Root-mean-squared error of the fitted linear model on its own training data.
train_predictions = linear_regression.predict(train_features)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every release.
train_rmse = np.sqrt(
    metrics.mean_squared_error(train_targets, train_predictions)
)
print(f"Training rmse: {train_rmse}")

In [None]:
# Root-mean-squared error of the fitted linear model on the held-out test split.
test_predictions = linear_regression.predict(test_features)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every release.
test_rmse = np.sqrt(
    metrics.mean_squared_error(test_targets, test_predictions)
)
print(f"Testing rmse: {test_rmse}")

### Using SGDRegressor

In [None]:
# SGD shuffles the training data on every epoch, so without a seed the fitted
# coefficients (and the RMSE reported below) change from run to run. An
# integer seed makes every call to `fit` start from the same random state,
# including the repeated fits performed by the timing cell.
sgd_regressor = linear_model.SGDRegressor(random_state=42)

In [None]:
%%timeit
# Benchmark: time repeated fits of the SGD estimator on the training split.
_ = sgd_regressor.fit(train_features, train_targets)

In [None]:
# Fit on the training split. The trailing semicolon hides the estimator repr
# without rebinding `_`, which IPython reserves for the previous cell's output.
sgd_regressor.fit(train_features, train_targets);

In [None]:
# Root-mean-squared error of the fitted SGD model on its own training data.
train_predictions = sgd_regressor.predict(train_features)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every release.
train_rmse = np.sqrt(
    metrics.mean_squared_error(train_targets, train_predictions)
)
print(f"Training rmse: {train_rmse}")

In [None]:
# Root-mean-squared error of the fitted SGD model on the held-out test split.
test_predictions = sgd_regressor.predict(test_features)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every release.
test_rmse = np.sqrt(
    metrics.mean_squared_error(test_targets, test_predictions)
)
print(f"Testing rmse: {test_rmse}")

### Exercise

For each model, compare the training loss (RMSE) with the testing loss (RMSE). Is the model underfitting or overfitting? How can you tell?