# Chapters 4 & 6
## Preprocessing, Pipelines, and Hyperparameter Tuning

In [1]:
# dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render matplotlib figures inline in the notebook output and use the ggplot style.
%matplotlib inline
plt.style.use('ggplot')

# Reload imported modules before each cell runs, so edits to local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from util import util

In [8]:
# datasets
from sklearn.datasets import load_iris

# preprocessing utilities
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer

# feature selection
from sklearn.feature_selection import SelectKBest

# model selection
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.model_selection import learning_curve, validation_curve

# pipeline
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import ColumnTransformer

# metrics
from sklearn.metrics import auc, accuracy_score, roc_curve, precision_score, recall_score, f1_score

# estimators
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

Steps:
--- preprocessing ---
+ Removing nulls
+ Imputing missing values
+ Encoding ordinal variables
+ Encoding nominal variables
+ Scaling variables
+ Feature selection
+ Feature extraction/engineering
+ Imbalanced dataset correction(s)

Steps:
--- model selection ---
+ Classifier choice
+ Hyperparameter tuning (Grid Search CV, Nested K Fold)
+ Learning curve evaluation
+ Validation curve evaluation
+ ROC curve / Precision Recall curve evaluation - decision function choice


In [5]:
def prep_data(X, y, test_size=0.3, random_state=None):
    """Split a dataset into stratified train/test parts and standardize the features.

    The scaler is fitted on the training split only and then applied to both
    splits, so the test split does not influence the scaling statistics.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``train_test_split`` and ``StandardScaler``.
    y : array-like
        Target labels; also used as the stratification key for the split.
    test_size : float or int, default=0.3
        Forwarded to ``train_test_split``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``train_test_split``. Pass an int to get the same split
        on every run; the default ``None`` keeps the split unseeded.

    Returns
    -------
    tuple
        ``(X_train_std, X_test_std, y_train, y_test)`` where the two ``X``
        entries are the standardized feature splits.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )

    # Fit on the training split only; reuse the same statistics for the test split.
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    return X_train_std, X_test_std, y_train, y_test

In [6]:
# Build the standardized iris splits used by the rest of the notebook.
X, y = load_iris(return_X_y=True)

# First the full feature set, then a reduced set keeping only feature
# columns 2 and 3. Each prep_data call performs its own split.
X_train_std, X_test_std, y_train, y_test = prep_data(X, y)
X_train_std_cut, X_test_std_cut, y_train_cut, y_test_cut = prep_data(X[:, [2, 3]], y)

# Re-join each train/test pair (train rows first, test rows after).
X_std_combo = np.concatenate([X_train_std, X_test_std], axis=0)
X_std_combo_cut = np.concatenate([X_train_std_cut, X_test_std_cut], axis=0)
y_combo = np.concatenate([y_train, y_test])
y_combo_cut = np.concatenate([y_train_cut, y_test_cut])

## Additional Reading

+ https://scikit-learn.org/stable/modules/compose.html#featureunion-composite-feature-spaces
+ https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py
+ https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html#sphx-glr-auto-examples-compose-plot-feature-union-py
+ https://stackoverflow.com/questions/36113686/multiple-pipelines-that-merge-within-a-sklearn-pipeline
+ https://towardsdatascience.com/using-functiontransformer-and-pipeline-in-sklearn-to-predict-chardonnay-ratings-9b13fdd6c6fd
+ https://scikit-learn.org/stable/auto_examples/preprocessing/plot_function_transformer.html
+ https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html#sphx-glr-auto-examples-compose-plot-transformed-target-py


