# Underfitting und Overfitting

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
# Seed NumPy's global random generator so that the samples drawn with
# np.random.rand / np.random.randn further down are the same on every run.
np.random.seed(5)

Wir definieren unsere "Ground Truth"-Funktion:

In [None]:
def true_fun(X):
    """Return the noise-free target function cos(1.5 * pi * X).

    ``X`` may be a scalar or a NumPy array; the cosine is applied
    element-wise and the result has the same shape as ``X``.
    """
    angular_factor = 1.5 * np.pi
    return np.cos(angular_factor * X)

und plotten dagegen Modelle der Polynomregression mit unterschiedlichen Graden:

In [None]:
# Number of noisy training points drawn for this experiment.
n_samples = 30
# Polynomial degrees to compare; plot_comparison fits one model per entry and
# draws it on the axis with the same index.
degrees = [1, 4, 15]

# Inputs: n_samples uniform draws from [0, 1), sorted in ascending order.
X = np.sort(np.random.rand(n_samples))
# Targets: the true function plus standard-normal noise scaled by 0.1.
y = true_fun(X) + np.random.randn(n_samples) * 0.1

# One row of three axes -- one per entry of `degrees`. plot_comparison reads
# `axs` as a global.
fig, axs = plt.subplots(1, 3, figsize=(30, 10))

def setup(ax):
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_xlim((0, 1))
    ax.set_ylim((-2, 2))
    ax.legend(loc="best", fontsize=18)

def plot_comparison(X, y):
    """Fit and plot one polynomial regression per entry of ``degrees``.

    For every degree a ``PolynomialFeatures`` -> ``LinearRegression``
    pipeline is fitted on ``(X, y)``, scored with 10-fold cross-validation
    (negative mean squared error) and drawn next to the true function and
    the samples. The axis title shows the degree together with the mean and
    the standard deviation of the cross-validated MSE.

    Reads the module-level ``degrees`` and ``axs`` (the model for
    ``degrees[k]`` is drawn on ``axs[k]``) and calls ``true_fun`` and
    ``setup``.

    Parameters:
        X: 1-D array of input values.
        y: 1-D array of targets, same length as ``X``.
    """
    # The estimators are fed a column matrix, so add a feature axis once.
    X_column = X[:, np.newaxis]

    # Evaluation grid for the plotted curves; identical for every degree.
    grid = np.linspace(0, 1, 100)
    grid_column = grid[:, np.newaxis]

    for position, degree in enumerate(degrees):
        model = Pipeline([
            ("polynomial_features",
             PolynomialFeatures(degree=degree, include_bias=False)),
            ("linear_regression", LinearRegression()),
        ])
        model.fit(X_column, y)

        # NOTE(review): with an integer `cv` the folds are presumably taken in
        # data order; the callers in this notebook pass a sorted X, so every
        # validation fold would be one contiguous x-interval -- confirm that
        # this is intended.
        cv_scores = cross_val_score(model, X_column, y,
                                    scoring="neg_mean_squared_error", cv=10)

        ax = axs[position]
        ax.plot(grid, model.predict(grid_column), label="Modell")
        ax.plot(grid, true_fun(grid), label="Wahre Funktion")
        ax.scatter(X, y, edgecolor='b', s=20, label="Samples")

        setup(ax)

        # The scorer returns negated MSE values; flip the sign for display.
        mse_mean = -cv_scores.mean()
        mse_spread = cv_scores.std()
        ax.set_title(f"Grad {degree}\nMSE = {mse_mean:.2e}(+/- {mse_spread:.2e})", fontsize=18)

# Draw the three fits for the samples generated above onto the current `axs`.
plot_comparison(X, y)

... und jetzt mit 10-mal so vielen Trainingsdaten:

In [None]:
# Ten times as many training points as in the previous experiment.
n_samples = 300

# Fresh sorted uniform inputs and noisy targets (noise scaled by 0.1). These
# overwrite the module-level X and y.
X = np.sort(np.random.rand(n_samples))
y = true_fun(X) + np.random.randn(n_samples) * 0.1

# New figure; rebinding the global `axs` makes plot_comparison draw on it.
fig, axs = plt.subplots(1, 3, figsize=(30, 10))

# Same degrees as before, now fitted on the larger sample.
plot_comparison(X, y)


## Validation Curve

In [None]:
from sklearn.model_selection import KFold, learning_curve

# NOTE(review): not used in the visible part of this notebook; kept in case a
# later cell relies on it.
param_range = np.arange(1,16)

fig, axs = plt.subplots(1, 3, figsize=(30, 10))

n_samples = 2000

X = np.sort(np.random.rand(n_samples))
y = true_fun(X) + np.random.randn(n_samples) * 0.1

# The estimators are fed a column matrix, so add a feature axis once.
X_column = X[:, np.newaxis]

# Training-set sizes as fractions of the largest available training set.
# They do not depend on the polynomial degree, so compute them once.
train_sizes_proz = np.linspace(0.001, 1, 100)

# X is sorted. A plain `cv=5` would therefore put every validation fold on a
# single contiguous x-interval and score the models on extrapolation.
# learning_curve's `shuffle=True` only shuffles the training prefixes, not the
# fold assignment, so the folds are shuffled explicitly here. The fixed
# random_state gives all degrees the same folds.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=5)

for ax, degree in zip(axs, [2, 5, 6]):
    # include_bias=False: LinearRegression fits its own intercept, so an
    # extra constant column is redundant (same choice as in plot_comparison).
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([
        ("polynomial_features", polynomial_features),
        ("linear_regression", linear_regression),
    ])

    train_sizes, train_scores, validation_scores = learning_curve(
        pipeline,
        X_column,
        y,
        train_sizes=train_sizes_proz,
        cv=cv_folds,
        scoring='neg_mean_squared_error',
        shuffle=True,
    )

    # Scores are negated MSE values with one column per fold: average over
    # the folds and flip the sign to obtain the mean MSE per training size.
    train_scores_mean = -np.mean(train_scores, axis=1)
    validation_scores_mean = -np.mean(validation_scores, axis=1)

    ax.set_ylim([0, 0.5])
    ax.plot(train_sizes, train_scores_mean, label='Training error')
    ax.plot(train_sizes, validation_scores_mean, label='Validation error')
    ax.set_ylabel('MSE', fontsize=14)
    ax.set_xlabel('Training set size', fontsize=14)
    ax.set_title(f'Learning curves: Degree {degree}', fontsize=24, y=1.03)
    ax.legend()
