In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Number of sample points to generate for the mock data set
n_samples = 30


def true_fun(X):
    """Return the noise-free target value cos(1.5 * pi * X).

    The computation uses NumPy arithmetic, so X may be a scalar or a
    NumPy array; arrays are evaluated element-wise.
    """
    phase = 1.5 * np.pi * X
    return np.cos(phase)

In [None]:
# We generate some fake data that's close to the actual function.
# Seed NumPy's global RNG first so that re-running this cell (or the whole
# notebook after a kernel restart) reproduces the same samples, and therefore
# the same scores and plots further down.
np.random.seed(0)
# x positions: uniform draws from [0, 1), sorted in ascending order
X = np.sort(np.random.rand(n_samples))
# targets: the true function plus Gaussian noise scaled by 0.1
y = true_fun(X) + np.random.randn(n_samples) * 0.1

In [None]:
# Plot the fake data
plt.scatter(X, y, label="Samples")
# Plot the actual function on a dense, evenly spaced grid over [0, 1].
# X_true is kept as a module-level name because later cells reuse it.
X_true = np.linspace(0, 1, 100)
plt.plot(X_true, true_fun(X_true), label="True function")
# label= only becomes visible once a legend is drawn; the trailing ';'
# suppresses the Legend repr in the cell output.
plt.legend();

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures 
#http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [None]:
def train_features(degrees):
    """Fit a polynomial regression of the given degree and plot the result.

    Draws the true function and the mock data on the current pyplot axes,
    fits a PolynomialFeatures -> LinearRegression pipeline on all of (X, y),
    prints the pipeline's score on that same data, and overlays the fitted
    curve evaluated on 100 evenly spaced points in [0, 1].

    Parameters
    ----------
    degrees : int
        Polynomial degree handed to PolynomialFeatures.

    Returns
    -------
    None
    """
    # Fix the viewport to x in [0, 1] and y in [-2, 2]
    plt.xlim(0, 1)
    plt.ylim(-2, 2)

    # Reference curve first, then the mock observations
    plt.plot(X_true, true_fun(X_true))
    plt.scatter(X, y)

    # scikit-learn expects a 2-D feature matrix, so add a column axis
    features = X[:, np.newaxis]

    # Polynomial expansion followed by ordinary least squares
    steps = [
        ("pl", PolynomialFeatures(degree=degrees, include_bias=False)),
        ("lm", LinearRegression()),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(features, y)

    # Score is computed on the data the model was fitted on
    training_score = pipeline.score(features, y)
    print("Score: " + str(training_score))

    # Evaluate the fitted model on a dense grid to draw a smooth curve
    X_predict = np.linspace(0, 1, 100)
    fitted_curve = pipeline.predict(X_predict[:, np.newaxis])
    plt.plot(X_predict, fitted_curve)

In [None]:
# Degree 1: fit a straight line to the mock data
train_features(degrees=1)

In [None]:
# Degree 5: fit a fifth-degree polynomial to the mock data
train_features(degrees=5)

In [None]:
# Degree 17: fit a seventeenth-degree polynomial to the mock data
train_features(degrees=17)

In [None]:
# Cross validation scoring
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
def train_features(degrees):
    """Cross-validate a polynomial regression and plot the fit from each fold.

    Draws the true function and the mock data on the current pyplot axes,
    then runs 10-fold cross-validation: for every fold a fresh
    PolynomialFeatures -> LinearRegression pipeline is fitted on the training
    part, its score on the held-out part is printed, and the fitted curve is
    drawn on a dense grid over [0, 1].

    Parameters
    ----------
    degrees : int
        Polynomial degree handed to PolynomialFeatures.

    Returns
    -------
    None
    """
    # Limit the plot to [0, 1] and [-2, 2]
    plt.xlim((0, 1))
    plt.ylim((-2, 2))

    # Plot the true function
    plt.plot(X_true, true_fun(X_true))
    # Plot the mock data
    plt.scatter(X, y)

    # Dense grid used to draw every fold's curve. It does not depend on the
    # fold, so it is built once instead of once per iteration.
    X_predict = np.linspace(0, 1, 100)

    # NOTE: KFold does not shuffle by default and X is created sorted in an
    # earlier cell, so each held-out fold is a run of neighbouring x values.
    folds = KFold(n_splits=10)

    # For each fold ...
    for train_indices, test_indices in folds.split(X, y):
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        # Create a training pipeline with polynomial features and linear regression
        pl = PolynomialFeatures(degree=degrees, include_bias=False)
        lm = LinearRegression()
        pipeline = Pipeline([("pl", pl), ("lm", lm)])
        # Fit the pipeline with the TRAINING data
        pipeline.fit(X_train[:, None], y_train)

        # Print the score on the held-out TEST data
        print("Score: " + str(pipeline.score(X_test[:, None], y_test)))

        # Draw the fitted curve on the dense grid rather than only at the
        # sample points, so its behaviour between samples is visible.
        plt.plot(X_predict, pipeline.predict(X_predict[:, None]))

In [None]:
# Degree 10, one fitted curve and one printed score per cross-validation fold
train_features(degrees=10)

In [None]:
# Degree 16, one fitted curve and one printed score per cross-validation fold
train_features(degrees=16)

In [None]:
# Degree 100: many more polynomial terms than the n_samples = 30 data points
train_features(degrees=100)

In [None]:
# Last tip: do this using cross_val_score
from sklearn.model_selection import cross_val_score

def cross_score(degrees):
    """Print the 10-fold cross-validation scores for a polynomial regression.

    Builds a PolynomialFeatures -> LinearRegression pipeline of the given
    degree and passes it, together with the mock data (X, y), to
    cross_val_score with cv=10. The resulting scores are printed.

    Parameters
    ----------
    degrees : int
        Polynomial degree handed to PolynomialFeatures.

    Returns
    -------
    None
    """
    steps = [
        ("pl", PolynomialFeatures(degree=degrees, include_bias=False)),
        ("lm", LinearRegression()),
    ]
    model = Pipeline(steps)
    # scikit-learn expects a 2-D feature matrix, so add a column axis
    fold_scores = cross_val_score(model, X[:, np.newaxis], y, cv=10)
    print(fold_scores)
# Print the cross-validation scores for a degree-8 polynomial
cross_score(degrees=8)