# Underfitting, Overfitting and Regularization

In the previous notebook, we have seen that some model parameters have a
strong impact on the cross-validated accuracy of a predictive model.

The goal of this notebook is to deepen our understanding of this effect by
identifying when a model is too constrained to properly fit the variations
of the training dataset, or too flexible, leading the model to memorize the
training data points individually (including noise) without having to
make any "effort" at capturing the repeatable structure that would make
its decision function generalize correctly to unseen test points.

This notebook shows:
* how to tell if a trained predictive model is overfitting or underfitting (or both);
* how to evaluate the impact of regularization hyperparameters on generalization;
* how to evaluate the impact of the training set size on overfitting.

In [None]:
import pandas as pd

# Download the adult census dataset as CSV from OpenML.
# Or use the local copy:
# df = pd.read_csv('../datasets/adult-census.csv')
df = pd.read_csv(
    "https://www.openml.org/data/get_csv/1595261/adult-census.csv"
)

In [None]:
# Column that holds the label to predict.
target_name = "class"
# Pull the labels out of the frame as a NumPy array, then display them.
target = df.loc[:, target_name].to_numpy()
target

In [None]:
# Feature table: the target column is removed, along with the "fnlwgt" and
# "education-num" columns.
data = df.drop(
    columns=[
        target_name,
        "fnlwgt",
        "education-num",
    ]
)

Once the dataset is loaded, we split it into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

# Draw 5000 rows for training and 5000 for testing; random_state pins the
# shuffle so the same split is produced on every run.
(df_train, df_test,
 target_train, target_test) = train_test_split(
    data,
    target,
    train_size=5000,
    test_size=5000,
    random_state=0,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer

# Each entry is (step name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode 'sex', dropping the first category column.
        (
            "binary",
            OneHotEncoder(drop="first"),
            ["sex"],
        ),
        # One-hot encode the remaining categorical columns; categories not
        # seen during fit are ignored at transform time instead of raising.
        (
            "categorical",
            OneHotEncoder(handle_unknown="ignore"),
            [
                "workclass",
                "education",
                "marital-status",
                "occupation",
                "relationship",
                "race",
                "native-country",
            ],
        ),
        # Apply a power transform to the numeric columns.
        (
            "numeric",
            PowerTransformer(),
            ["age", "capital-gain", "capital-loss", "hours-per-week"],
        ),
    ]
)

In [None]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformation to both splits.
preprocessor.fit(df_train)
df_preprocessed_train, df_preprocessed_test = (
    preprocessor.transform(split) for split in (df_train, df_test)
)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import validation_curve
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Project the preprocessed features onto 10 components before the
# nearest-neighbors step. TruncatedSVD uses a randomized solver by default,
# so its seed is pinned to make the curve reproducible across runs.
knn_model = make_pipeline(
    TruncatedSVD(n_components=10, random_state=0),
    KNeighborsClassifier()
)
# make_pipeline names each step after its lowercased class name, hence the
# "kneighborsclassifier__" prefix used to reach n_neighbors.
param_name = "kneighborsclassifier__n_neighbors"
param_range = [1, 5, 10, 50, 100, 500]
# One row of scores per value in param_range, one column per CV fold.
train_scores, test_scores = validation_curve(
    knn_model,
    df_preprocessed_train, target_train,
    param_name=param_name,
    param_range=param_range,
    cv=5, scoring="accuracy", n_jobs=2)
# Aggregate across folds (axis=1) to get one mean/std per parameter value.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with k-Nearest Neighbors")
plt.xlabel("n_neighbors")
plt.ylabel("Score")
plt.ylim(0.78, 1.)
plt.semilogx(param_range, train_scores_mean, label="Training score")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2)
# Without this call the labels set above are never rendered.
plt.legend();

In [None]:
from sklearn.linear_model import LogisticRegression

# C is the hyperparameter swept by this validation curve.
param_name = "C"
param_range = [0.01, 0.1, 1, 10, 100]
# Use the same solver settings as the learning-curve cell further down; the
# larger iteration budget keeps the solver from stopping early at high C.
# One row of scores per value in param_range, one column per CV fold.
train_scores, test_scores = validation_curve(
    LogisticRegression(solver="lbfgs", max_iter=1000),
    df_preprocessed_train, target_train,
    param_name=param_name,
    param_range=param_range,
    cv=5, scoring="accuracy", n_jobs=2)
# Aggregate across folds (axis=1) to get one mean/std per parameter value.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with Logistic Regression")
plt.xlabel(param_name)
plt.ylabel("Score")
plt.ylim(0.8, 1.)
plt.semilogx(param_range, train_scores_mean, label="Training score")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2)
plt.legend();

In [None]:
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training set size.

    A new figure is created, ``learning_curve`` is run on ``estimator`` and,
    for both the training and the cross-validation scores, the mean across
    folds is drawn as a line with a shaded band of one standard deviation
    on each side.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.
    title : str
        Title of the figure.
    X, y : array-like
        Data and target forwarded to ``learning_curve``.
    ylim : tuple, optional
        When given, unpacked into ``plt.ylim`` to set the y-axis limits.
    cv : optional
        Cross-validation setting forwarded to ``learning_curve``.
    n_jobs : int, optional
        Parallelism setting forwarded to ``learning_curve``.
    train_sizes : array-like
        Training set sizes forwarded to ``learning_curve``.

    Returns
    -------
    module
        The ``plt`` module, with the new figure as the current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    # Same drawing recipe for both curves: shaded +/- one std band first,
    # then the mean line carrying the legend label.
    curves = (
        (fit_scores, "Training score"),
        (cv_scores, "Cross-validation score"),
    )
    for fold_scores, curve_label in curves:
        center = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.fill_between(sizes, center - spread, center + spread, alpha=0.1)
        plt.plot(sizes, center, label=curve_label)

    plt.legend(loc="best")
    return plt


# Learning curve for a logistic regression with C=1. The shuffle-split is
# seeded so that the 20 random train/validation splits, and therefore the
# plotted curve, are the same on every run.
plot_learning_curve(LogisticRegression(solver="lbfgs", max_iter=1000, C=1),
                    "Learning Curve for Logistic Regression",
                    df_preprocessed_train, target_train,
                    ylim=(0.8, 1.), n_jobs=2,
                    cv=StratifiedShuffleSplit(n_splits=20, test_size=0.2,
                                              random_state=0));