<div style="padding:30px 0px;">
    <h1 align="center" style="padding:50px">Hyperparameter Tuning With Pipelines</h1>
    <p align="center" style="font-size:small;">Seth Pruitt<br>spruitt@norstal.com<br>www.github.com/faradical</p>
</div>

In [None]:
# Import Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

## Generating Synthetic Data

In [None]:
# Generate a random regression dataset. The generator is seeded so the data,
# and therefore every score computed from it, is identical on each
# Restart & Run All.
X, y = make_regression(n_samples=1000, n_features=10, noise=100, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Preview the first rows of the training features instead of the whole frame
pd.DataFrame(X_train).head()

## Defining Pipeline Step Functions

## Hyperparameter Tuning

### Method 1 - Grid Search Inside a Pipe

In [None]:
# LinearRegression settings the search will try (every combination of the two flags)
param_grid = dict(
    copy_X=[True, False],
    fit_intercept=[True, False],
)

# 5-fold cross-validated search over a bare LinearRegression
grid = GridSearchCV(estimator=LinearRegression(), param_grid=param_grid, cv=5)

# The search itself is the final pipeline step. The scaler is therefore fitted
# once on all of X_train, and only the regression settings are cross-validated
# (on the already-scaled data).
p1 = Pipeline(steps=[
    ("Scaler", StandardScaler()),
    ("Linear Regression", grid),
])

# Fit on the training split, then report the score on the held-out split
p1.fit(X_train, y_train).score(X_test, y_test)

### Method 2 - Pipe Inside a Grid Search

In [None]:
# Settings to search over. Keys are addressed as "<step name>__<parameter>"
# so the search can reach inside the pipeline's "Linear Regression" step.
param_grid = {
    'Linear Regression__copy_X': [True, False],
    'Linear Regression__fit_intercept': [True, False],
}

# Plain scale-then-regress pipeline; this time it contains no search of its own
p2 = Pipeline(steps=[
    ("Scaler", StandardScaler()),
    ("Linear Regression", LinearRegression()),
])

# The whole pipeline is the estimator being searched, so the scaler is refit
# inside every one of the 5 cross-validation splits.
grid = GridSearchCV(estimator=p2, param_grid=param_grid, cv=5)

# Fit on the training split, then report the score on the held-out split
grid.fit(X_train, y_train).score(X_test, y_test)

## Tuning the Pipeline

### Generating More Synthetic Data

In [None]:
import numpy as np
import random as rn

def y_f(x):
    """Evaluate the quadratic ``0.5*x**2 + 2*x + 6`` at ``x``.

    ``x`` may be any value that supports ``**``, ``*`` and ``+`` with numbers.
    """
    quadratic_term = 0.5 * (x ** 2)
    linear_term = 2 * x
    return quadratic_term + linear_term + 6

# Sample the quadratic at the integers 0..221
X = np.arange(222)

# Draw the noise from a dedicated, seeded generator so the dataset (and every
# score computed from it) is the same on each Restart & Run All, without
# touching the global `random` state.
noise_rng = rn.Random(42)
y = [y_f(x) + noise_rng.randint(-2000, 2000) for x in X]

# Plotting the dummy data
plt.figure(facecolor='gray', figsize=(15,10)).set_alpha(0.0)
ax = plt.axes()
ax.set_facecolor("gray")
ax.set_alpha(0.0)
# Label the figure so it can be read on its own when skimming the notebook
ax.set(title="Noisy quadratic dummy data", xlabel="X", ylabel="y")

plt.scatter(X, y, c='red', marker="o")
plt.show()

In [None]:
# Split the data into training and testing sets (fixed seed for a repeatable split)
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, random_state=42)

### Building a New Pipeline

In [None]:
# Import PolynomialFeatures to help fit the linear model to the curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer

# Creating a reshape function
def reshape(X):
    """Return ``X`` as a single-column 2-D array of shape ``(n, 1)``.

    ``X`` is first passed through ``np.asarray`` so that lists, tuples and
    pandas Series are accepted as well as NumPy arrays. An ndarray input is
    not copied by that conversion.
    """
    return np.asarray(X).reshape(-1, 1)

# Build the pipeline stage by stage:
# reshape to a column -> scale -> expand polynomial terms -> linear fit
steps = []
steps.append(("Reshape", FunctionTransformer(reshape)))
steps.append(("Scaler", StandardScaler()))
steps.append(("PolynomialFeatures", PolynomialFeatures()))
steps.append(("Linear Regression", LinearRegression()))

p3 = Pipeline(steps=steps)

### Executing a Grid Search Over the New Pipeline

In [None]:
# Search space spanning the preprocessing steps as well as the model.
# Each key is "<step name>__<parameter>", matching the step names in p3.
# NOTE: the listed values multiply out to 2*2*3*2*2*2*2*2 = 384 parameter
# combinations, each cross-validated over 5 folds.
param_grid = {
    # StandardScaler options
    'Scaler__with_mean': [True, False],
    'Scaler__with_std': [True, False],
    # PolynomialFeatures options (degrees 1, 2 and 3)
    'PolynomialFeatures__degree': np.arange(1, 4),
    'PolynomialFeatures__interaction_only': [True, False],
    'PolynomialFeatures__include_bias': [True, False],
    'PolynomialFeatures__order': ['C', 'F'],
    # LinearRegression options
    'Linear Regression__copy_X': [True, False],
    'Linear Regression__fit_intercept': [True, False],
}

# 5-fold cross-validated search with the full pipeline as the estimator
grid = GridSearchCV(estimator=p3, param_grid=param_grid, cv=5)

# Fit on the training split, then report the score on the held-out split
grid.fit(X_train, y_train).score(X_test, y_test)