# Model Complexity

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.gridspec as gridspec
from ipywidgets import interactive

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.tree import plot_tree
from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.model_selection import validation_curve

from sklearn.metrics import mean_squared_error

In [None]:
# Seeded generator that the cells below pass as `random_state` to the
# train/test splits and to the decision trees.
rng = np.random.RandomState(seed=2)

## Read in datasets and split them

In [None]:
import os

# On Google Colab the datasets are read from Google Drive, which must be mounted first;
# anywhere else they are expected in a "datasets" folder next to this notebook.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    colab_mount_point = '/content/drive'
    drive.mount(colab_mount_point)
    # Built from the mount point (instead of a relative "./drive/...") so that the path
    # does not depend on the current working directory.
    # You may need to change this, depending on where your notebooks are on Google Drive
    base_dir = os.path.join(colab_mount_point, "My Drive", "Colab Notebooks")
else:
    base_dir = "."
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
# Load the synthetic dataset from the datasets folder.
synthetic_csv_path = os.path.join(dataset_dir, "synthetic.csv")
df_synthetic = pd.read_csv(synthetic_csv_path)

In [None]:
# Double brackets keep X two-dimensional (one column); y is a one-dimensional array.
X_synthetic = df_synthetic[["feature"]].to_numpy()
y_synthetic = df_synthetic["target"].to_numpy()

In [None]:
# Hold out 20% of the synthetic examples as a test set.
synthetic_split = train_test_split(
    X_synthetic, y_synthetic, test_size=0.2, random_state=rng
)
X_train_synthetic, X_test_synthetic, y_train_synthetic, y_test_synthetic = synthetic_split

In [None]:
# Load the CS1109 dataset from the datasets folder.
cs1109_csv_path = os.path.join(dataset_dir, "cs1109.csv")
df_cs1109 = pd.read_csv(cs1109_csv_path)

In [None]:
# Encode the outcome column as integer class labels; the fitted encoder is kept
# so that the integer labels can be mapped back to the original outcomes.
label_encoder = LabelEncoder()
y_cs1109 = label_encoder.fit_transform(df_cs1109["outcome"])

# X stays a DataFrame (no .values) because the plotting cells below refer to
# its columns by name.
features = ["lect", "lab"]
X_cs1109 = df_cs1109.loc[:, features]

In [None]:
# Hold out 20% as a test set, stratifying on the outcome column.
cs1109_split = train_test_split(
    X_cs1109,
    y_cs1109,
    test_size=0.2,
    stratify=df_cs1109["outcome"],
    random_state=rng,
)
X_train_cs1109, X_test_cs1109, y_train_cs1109, y_test_cs1109 = cs1109_split

In [None]:
# Load the housing dataset from the datasets folder.
housing_csv_path = os.path.join(dataset_dir, "housing.csv")
df_housing = pd.read_csv(housing_csv_path)

In [None]:
# Note: this rebinds `features`, which previously held the CS1109 column names.
features = ["BasementArea", "GroundFloorArea", "Bedrooms", "Condition"]

# Plain NumPy arrays: the selected feature columns and the sale price target.
y_housing = df_housing["SalePrice"].to_numpy()
X_housing = df_housing[features].to_numpy()

In [None]:
# Hold out 20% of the housing examples as a test set.
housing_split = train_test_split(
    X_housing, y_housing, test_size=0.2, random_state=rng
)
X_train_housing, X_test_housing, y_train_housing, y_test_housing = housing_split

## Polynomial Regression

We've seen this before. But here is a reminder.

In [None]:
def _polynomial_regression(degree):
    """Return an unfitted pipeline: polynomial features up to `degree`, then linear regression."""
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree, include_bias=False)),
        ("predictor", LinearRegression())
    ])


# Four models of increasing complexity, from a straight line to a degree-100 polynomial.
linear_model = LinearRegression()
quadratic_model = _polynomial_regression(2)
cubic_model = _polynomial_regression(3)
degree_100_model = _polynomial_regression(100)

In [None]:
# Fit on the training split only: the held-out test examples must not influence
# these models, and the plots below draw each fitted curve over the training points.
linear_model.fit(X_train_synthetic, y_train_synthetic)

quadratic_model.fit(X_train_synthetic, y_train_synthetic)

cubic_model.fit(X_train_synthetic, y_train_synthetic)

degree_100_model.fit(X_train_synthetic, y_train_synthetic)

## Underfitting and Overfitting

In [None]:
def plot_scatter_and_line(xs_scatter, ys_scatter, xs_line, ys_line):
    """Draw a scatter plot and a green line plot on the same, newly created axes.

    xs_scatter, ys_scatter: coordinates of the scattered points.
    xs_line, ys_line: coordinates of the points that the line passes through.

    The figure is created but not explicitly shown; nothing is returned.
    """
    _, axes = plt.subplots()
    sns.scatterplot(ax=axes, x=xs_scatter, y=ys_scatter)
    sns.lineplot(ax=axes, x=xs_line, y=ys_line, color='g')

In [None]:
# 50 evenly spaced x-values in [0, 1] at which the fitted curves are evaluated.
xs_line = np.linspace(start=0, stop=1, num=50)

In [None]:
# Training points with the predictions of the linear model drawn as a line.
plot_scatter_and_line(
    xs_scatter=X_train_synthetic.flatten(),
    ys_scatter=y_train_synthetic,
    xs_line=xs_line,
    ys_line=linear_model.predict(xs_line.reshape(-1, 1)),
)

In [None]:
# Training points with the predictions of the quadratic model drawn as a line.
plot_scatter_and_line(
    xs_scatter=X_train_synthetic.flatten(),
    ys_scatter=y_train_synthetic,
    xs_line=xs_line,
    ys_line=quadratic_model.predict(xs_line.reshape(-1, 1)),
)

In [None]:
# Training points with the predictions of the cubic model drawn as a line.
plot_scatter_and_line(
    xs_scatter=X_train_synthetic.flatten(),
    ys_scatter=y_train_synthetic,
    xs_line=xs_line,
    ys_line=cubic_model.predict(xs_line.reshape(-1, 1)),
)

In [None]:
# Training points with the predictions of the degree-100 model drawn as a line.
plot_scatter_and_line(
    xs_scatter=X_train_synthetic.flatten(),
    ys_scatter=y_train_synthetic,
    xs_line=xs_line,
    ys_line=degree_100_model.predict(xs_line.reshape(-1, 1)),
)

In [None]:
# Five unfitted trees that differ only in max_depth (None = no depth limit);
# all of them are given the same `rng` object as random_state.
(
    decision_tree_depth_1,
    decision_tree_depth_2,
    decision_tree_depth_3,
    decision_tree_depth_4,
    decision_tree_no_max_depth,
) = (
    DecisionTreeClassifier(max_depth=depth_limit, random_state=rng)
    for depth_limit in (1, 2, 3, 4, None)
)

In [None]:
# Fit the depth-limited trees in order of increasing depth on the CS1109 training split.
for depth_limited_tree in (
    decision_tree_depth_1,
    decision_tree_depth_2,
    decision_tree_depth_3,
    decision_tree_depth_4,
):
    depth_limited_tree.fit(X_train_cs1109, y_train_cs1109)

# Fitted last and left as the final expression of the cell.
decision_tree_no_max_depth.fit(X_train_cs1109, y_train_cs1109)

In [None]:
def plot_scatter_and_boundaries(decision_tree):
    """Show the prediction regions of `decision_tree` with the CS1109 training points on top.

    decision_tree: estimator handed to DecisionBoundaryDisplay.from_estimator, which
        evaluates its `predict` method over the range of X_train_cs1109.

    Reads the notebook-level X_train_cs1109 and y_train_cs1109; displays the figure
    and returns nothing.
    """
    _, axes = plt.subplots()
    DecisionBoundaryDisplay.from_estimator(
        decision_tree,
        X_train_cs1109,
        response_method="predict",
        cmap=plt.cm.RdBu,
        xlabel="lect",
        ylabel="lab",
        ax=axes,
    )
    # Colour and marker style both encode the class label.
    sns.scatterplot(
        data=X_train_cs1109,
        x="lect",
        y="lab",
        hue=y_train_cs1109,
        style=y_train_cs1109,
        ax=axes,
    )
    plt.show()

In [None]:
# Prediction regions of the tree limited to depth 1.
plot_scatter_and_boundaries(decision_tree=decision_tree_depth_1)

In [None]:
# Prediction regions of the tree limited to depth 2.
plot_scatter_and_boundaries(decision_tree=decision_tree_depth_2)

In [None]:
# Prediction regions of the tree limited to depth 3.
plot_scatter_and_boundaries(decision_tree=decision_tree_depth_3)

In [None]:
# Prediction regions of the tree limited to depth 4.
plot_scatter_and_boundaries(decision_tree=decision_tree_depth_4)

In [None]:
# Prediction regions of the tree grown without a depth limit.
plot_scatter_and_boundaries(decision_tree=decision_tree_no_max_depth)

## Validation curves - plotting validation error against model complexity

In [None]:
def plot_validation_curve(x_range, train_errors, val_errors, invert_axis=False):
    """Plot training error (red) and validation error (gold) against a complexity axis.

    x_range: x-coordinates, one per hyperparameter value.
    train_errors, val_errors: y-coordinates of the two curves, aligned with x_range.
    invert_axis: when True, the x-axis is inverted.

    A new figure is created; showing it is left to the caller. Nothing is returned.
    """
    plt.figure()
    axes = plt.axes()
    axes.set_xlabel("complexity")
    if invert_axis:
        axes.invert_xaxis()
    axes.set_ylabel("error")
    axes.plot(x_range, train_errors, label="training error", color="red")
    axes.plot(x_range, val_errors, label="validation error", color="gold")
    axes.legend()

In [None]:
# Polynomial degrees 1..29 are the complexity axis.
degrees = np.arange(1, 30)

poly_model = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("predictor", LinearRegression())
])

# One row of scores per degree, one column per cross-validation fold.
cv_train_errors, cv_val_errors = validation_curve(
    poly_model,
    X_train_synthetic,
    y_train_synthetic,
    param_name="poly__degree",
    param_range=degrees,
    cv=10,
    scoring="neg_mean_absolute_error",
)

# The scorer returns negated MAE; abs() turns the scores into errors before
# they are averaged over the folds.
train_errors = np.abs(cv_train_errors).mean(axis=1)
val_errors = np.abs(cv_val_errors).mean(axis=1)

plot_validation_curve(x_range=degrees, train_errors=train_errors, val_errors=val_errors)
plt.show()

In [None]:
# Tree depths 1..4 are the complexity axis.
depths = np.arange(1, 5)

# Seeded like the other decision trees in this notebook so that the curve is
# reproducible when the notebook is re-run from the top.
decision_tree = DecisionTreeClassifier(random_state=rng)

# One row of accuracies per depth, one column per cross-validation fold.
cv_train_errors, cv_val_errors = validation_curve(
    decision_tree,
    X_train_cs1109,
    y_train_cs1109,
    param_name="max_depth",
    param_range=depths,
    cv=10,
    scoring="accuracy",
)

# Turn accuracies into error rates (1 - accuracy) before averaging over the folds.
train_errors = np.mean(np.abs(1 - cv_train_errors), axis=1)
val_errors = np.mean(np.abs(1 - cv_val_errors), axis=1)

plot_validation_curve(x_range=depths, train_errors=train_errors, val_errors=val_errors)
plt.show()

In [None]:
# Numbers of neighbours 1..49.
k_vals = np.arange(1, 50)

knn = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", KNeighborsRegressor())
])

# One row of scores per value of k, one column per cross-validation fold.
cv_train_errors, cv_val_errors = validation_curve(
    knn,
    X_train_housing,
    y_train_housing,
    param_name="predictor__n_neighbors",
    param_range=k_vals,
    cv=10,
    scoring="neg_mean_absolute_error",
)

# The scorer returns negated MAE; abs() turns the scores into errors before
# they are averaged over the folds.
train_errors = np.abs(cv_train_errors).mean(axis=1)
val_errors = np.abs(cv_val_errors).mean(axis=1)

# The x-axis is inverted here, so k decreases from left to right.
plot_validation_curve(x_range=k_vals, train_errors=train_errors, val_errors=val_errors, invert_axis=True)
plt.show()

## Diagnosis: Is a model underfitting or overfitting?

We know that:
- Underfitting: the validation error is high and even the training error is high.
- Overfitting: the training error is low but the validation error is high.

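So if we compute and compare the training error and the validation error, we can tell whether our model is underfitting or overfitting and then take appropriate action. Here's a handy function that gives us the training and validation scores. (Note that for scikit-learn's `neg_*` metrics, such as `neg_mean_absolute_error`, the values returned are negated errors, so values closer to zero are better.)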

In [None]:
def check_fit(model, X_train, y_train, cv, metric):
    """Cross-validate `model` and return its mean training and mean validation scores.

    model: estimator passed to cross_validate.
    X_train, y_train: data on which the cross-validation is run.
    cv: cross-validation setting forwarded to cross_validate.
    metric: scoring name forwarded to cross_validate.

    Returns a pair (mean train score, mean validation score), each averaged over
    the folds. These are scores as computed by the chosen metric; with a `neg_*`
    metric they are negated errors.
    """
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        scoring=metric,
        cv=cv,
        return_train_score=True,
        n_jobs=-1,
    )
    mean_train_score = cv_results["train_score"].mean()
    mean_val_score = cv_results["test_score"].mean()
    return mean_train_score, mean_val_score

So, now, tell me whether the following models are underfitting or overfitting:

In [None]:
# With "neg_mean_absolute_error" the returned values are negated errors.
train_err, val_err = check_fit(
    model=linear_model,
    X_train=X_train_synthetic,
    y_train=y_train_synthetic,
    cv=10,
    metric="neg_mean_absolute_error",
)

train_err, val_err

In [None]:
# With "neg_mean_absolute_error" the returned values are negated errors.
train_err, val_err = check_fit(
    model=quadratic_model,
    X_train=X_train_synthetic,
    y_train=y_train_synthetic,
    cv=10,
    metric="neg_mean_absolute_error",
)

train_err, val_err

In [None]:
# With "neg_mean_absolute_error" the returned values are negated errors.
train_err, val_err = check_fit(
    model=cubic_model,
    X_train=X_train_synthetic,
    y_train=y_train_synthetic,
    cv=10,
    metric="neg_mean_absolute_error",
)

train_err, val_err

In [None]:
# With "neg_mean_absolute_error" the returned values are negated errors.
train_err, val_err = check_fit(
    model=degree_100_model,
    X_train=X_train_synthetic,
    y_train=y_train_synthetic,
    cv=10,
    metric="neg_mean_absolute_error",
)

train_err, val_err

In [None]:
# Mean training and validation accuracy of the depth-1 tree over 10 folds.
train_acc, val_acc = check_fit(
    model=decision_tree_depth_1,
    X_train=X_train_cs1109,
    y_train=y_train_cs1109,
    cv=10,
    metric="accuracy",
)

train_acc, val_acc

In [None]:
# Mean training and validation accuracy of the depth-2 tree over 10 folds.
train_acc, val_acc = check_fit(
    model=decision_tree_depth_2,
    X_train=X_train_cs1109,
    y_train=y_train_cs1109,
    cv=10,
    metric="accuracy",
)

train_acc, val_acc

In [None]:
# Mean training and validation accuracy of the depth-3 tree over 10 folds.
train_acc, val_acc = check_fit(
    model=decision_tree_depth_3,
    X_train=X_train_cs1109,
    y_train=y_train_cs1109,
    cv=10,
    metric="accuracy",
)

train_acc, val_acc

In [None]:
# Mean training and validation accuracy of the depth-4 tree over 10 folds.
train_acc, val_acc = check_fit(
    model=decision_tree_depth_4,
    X_train=X_train_cs1109,
    y_train=y_train_cs1109,
    cv=10,
    metric="accuracy",
)

train_acc, val_acc

In [None]:
# Mean training and validation accuracy of the depth-unlimited tree over 10 folds.
train_acc, val_acc = check_fit(
    model=decision_tree_no_max_depth,
    X_train=X_train_cs1109,
    y_train=y_train_cs1109,
    cv=10,
    metric="accuracy",
)

train_acc, val_acc

## Regularization

We'll fit an unregularized linear model and two regularized models (Lasso and Ridge) to the synthetic data.

In [None]:
# Each model is fitted on the training split and scored by MSE on the test split.
# mean_squared_error takes the true values first and the predictions second.

# Unregularized baseline
ols = LinearRegression()
ols.fit(X_train_synthetic, y_train_synthetic)
y_predicted_ols = ols.predict(X_test_synthetic)
mse_ols = mean_squared_error(y_test_synthetic, y_predicted_ols)

# Lasso regression with regularization strength alpha = 1.0
lasso = Lasso(alpha=1.0)
lasso.fit(X_train_synthetic, y_train_synthetic)
y_predicted_lasso = lasso.predict(X_test_synthetic)
mse_lasso = mean_squared_error(y_test_synthetic, y_predicted_lasso)

# Ridge regression with regularization strength alpha = 1.0
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_synthetic, y_train_synthetic)
y_predicted_ridge = ridge.predict(X_test_synthetic)
mse_ridge = mean_squared_error(y_test_synthetic, y_predicted_ridge)

In [None]:
# One panel per fitted model: training points in green, the model's predictions
# for the test inputs as a blue line, and its test MSE and parameters in the title.
fig = plt.figure(figsize=(14, 4.5))
gs = gridspec.GridSpec(1, 3)
panels = [
    # (title, fitted model, test MSE, predictions for X_test_synthetic)
    ("OLS Linear Regression", ols, mse_ols, y_predicted_ols),    # leftmost diagram
    ("Lasso Regression", lasso, mse_lasso, y_predicted_lasso),   # middle diagram
    ("Ridge Regression", ridge, mse_ridge, y_predicted_ridge),   # rightmost diagram
]
panel_axes = []
for position, (panel_title, fitted_model, panel_mse, panel_predictions) in enumerate(panels):
    panel_ax = fig.add_subplot(gs[position])
    panel_ax.set_title("%s\nMSE: %.3f\nIntercept: %.3f\nCoefficient: %.3f" %
                       (panel_title, panel_mse, fitted_model.intercept_, fitted_model.coef_[0]))
    panel_ax.set_xlabel("Feature")
    # The vertical axis shows target values; the MSE is reported in the title.
    panel_ax.set_ylabel("Target")
    panel_ax.set_ylim(-4, 14)
    panel_ax.scatter(X_train_synthetic, y_train_synthetic, color="green")
    panel_ax.plot(X_test_synthetic, panel_predictions, color="blue")
    panel_axes.append(panel_ax)
# Keep the three axes available under their individual names.
ax0, ax1, ax2 = panel_axes

fig.tight_layout()
plt.show()

Here is an interactive version so that we can play with the regularization hyperparameter. (The slides refer to it as lambda, but scikit-learn calls it alpha).

In [None]:
def plot_model(model, alpha):
    """Fit a regularized linear model and plot its predictions over the training points.

    model: "lasso" selects Lasso; any other value selects Ridge.
    alpha: regularization strength passed to the chosen estimator.

    Reads the notebook-level synthetic train/test arrays. The model is fitted on
    the training split and its predictions for the test inputs are drawn as a
    blue line. Displays the figure and returns nothing.
    """
    plt.figure()
    plt.title("%s with lambda (alpha) = %.1f" % (model, alpha))
    plt.xlabel("Feature")
    # The vertical axis shows target values, not an error measure.
    plt.ylabel("Target")
    plt.ylim(-4, 14)
    plt.scatter(X_train_synthetic, y_train_synthetic, color = "green")
    # Use a separate name for the estimator so that `model` keeps holding the
    # name that was passed in.
    if model == "lasso":
        regressor = Lasso(alpha=alpha)
    else:
        regressor = Ridge(alpha=alpha)
    regressor.fit(X_train_synthetic, y_train_synthetic)
    y_predicted = regressor.predict(X_test_synthetic)
    plt.plot(X_test_synthetic, y_predicted, color = "blue")
    plt.show()
    
# One widget per plot_model parameter: a slider for alpha (0 to 3 in steps of 0.1)
# and a dropdown for model. With 'manual' set, the plot is redrawn only when the
# button is clicked. Only keyword arguments that name a plot_model parameter are given.
interactive_plot = interactive(plot_model, {'manual': True}, alpha=(0,3,.1), model=["lasso", "ridge"])
interactive_plot

Regularization is a response to overfitting. The problem with the example above is that we are regularizing a model (linear regression) on a dataset that it underfits!

To see the value of regularization, let's regularize a model that does overfit. Let's regularize Polynomial Regression with degree 100.

In [None]:
def plot_model(model, alpha):
    """Fit a regularized degree-100 polynomial regression and plot its predictions.

    model: "lasso" selects Lasso as the final estimator; any other value selects Ridge.
    alpha: regularization strength passed to the chosen estimator.

    Reads the notebook-level synthetic train/test arrays. The pipeline is fitted
    on the training split and its predictions for the test inputs are drawn as a
    blue line over the training points. Displays the figure and returns nothing.
    """
    plt.figure()
    plt.title("%s with lambda (alpha) = %.1f" % (model, alpha))
    plt.xlabel("Feature")
    # The vertical axis shows target values, not an error measure.
    plt.ylabel("Target")
    plt.ylim(-4, 14)
    plt.scatter(X_train_synthetic, y_train_synthetic, color = "green")
    # Only the final estimator differs between the two choices.
    regressor = Lasso(alpha=alpha) if model == "lasso" else Ridge(alpha=alpha)
    pipeline = Pipeline([("poly", PolynomialFeatures(degree=100, include_bias=False)),
                         ("predictor", regressor)])
    pipeline.fit(X_train_synthetic, y_train_synthetic)
    y_predicted = pipeline.predict(X_test_synthetic)
    # Order the test points by feature value so that the curve is drawn left to right.
    order = np.argsort(X_test_synthetic[:, 0])
    plt.plot(X_test_synthetic[order, 0], y_predicted[order], color = "blue")
    plt.show()
    
# One widget per plot_model parameter: a slider for alpha (0 to 3 in steps of 0.1)
# and a dropdown for model. With 'manual' set, the plot is redrawn only when the
# button is clicked. Only keyword arguments that name a plot_model parameter are given.
interactive_plot = interactive(plot_model, {'manual': True}, alpha=(0,3,.1), model=["lasso", "ridge"])
interactive_plot