# Pipelines

* A fixed sequence of steps in processing the data, for example feature selection, normalization and classification

* Pipelines are very common in Machine Learning systems, since there is a lot of data to manipulate and many data transformations to apply, in sequencial order.

* Scikit-Learn provides the Pipeline class to help with such sequences of transformations. 

* Let's revisit the Exploratory Data Analysis and apply data preparation steps in a pipeline


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
import jedi
import nbextensions

import numpy as np
import pandas as pd

import random

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
sns.set(rc={"figure.figsize": (12, 8)})
%load_ext line_profiler
from sklearn.utils import resample

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [3]:
# Load the data
df = pd.read_csv('http://www.chalendony.de/application_train.csv')

In [3]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Remove the identifier column
df.drop(['SK_ID_CURR'], inplace=True, axis=1)

## Assign data types for  numerical and categorical attributes

In [6]:
categorical_features = ['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','NAME_TYPE_SUITE', \
                        'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','FLAG_MOBIL','FLAG_EMP_PHONE',\
                        'FLAG_WORK_PHONE','FLAG_CONT_MOBILE','FLAG_PHONE','FLAG_EMAIL','OCCUPATION_TYPE','CNT_FAM_MEMBERS',\
                        'REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY','WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START',\
                        'REG_REGION_NOT_LIVE_REGION','REG_REGION_NOT_WORK_REGION','LIVE_REGION_NOT_WORK_REGION','REG_CITY_NOT_LIVE_CITY',\
                        'REG_CITY_NOT_WORK_CITY','LIVE_CITY_NOT_WORK_CITY','ORGANIZATION_TYPE','FLAG_DOCUMENT_2','FLAG_DOCUMENT_3',\
                        'FLAG_DOCUMENT_4','FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7','FLAG_DOCUMENT_8','FLAG_DOCUMENT_9',\
                        'FLAG_DOCUMENT_10','FLAG_DOCUMENT_11','FLAG_DOCUMENT_12','FLAG_DOCUMENT_13','FLAG_DOCUMENT_14','FLAG_DOCUMENT_15',\
                        'FLAG_DOCUMENT_16','FLAG_DOCUMENT_17','FLAG_DOCUMENT_18','FLAG_DOCUMENT_19','FLAG_DOCUMENT_20','FLAG_DOCUMENT_21',\
                        'EMERGENCYSTATE_MODE', 'FONDKAPREMONT_MODE','HOUSETYPE_MODE','WALLSMATERIAL_MODE']

numerical_features = df.columns.difference(categorical_features).values.tolist()

print('categorical({}) '.format(len(categorical_features)))
print('numeric({})'.format(len(numerical_features)))

categorical(52) 
numeric(69)


In [7]:
# Numerical column types are defaulted to floats
df[numerical_features] = df[numerical_features].astype("float")
df['TARGET'] = df['TARGET'].astype("float")

# Categorical features can't be set all at once
for f in categorical_features:
    df[f] = df[f].astype("category")

    
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 121 entries, TARGET to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: category(52), float64(69)
memory usage: 177.1 MB


## Create train and test sets

In [8]:
# Partition data set into training/test using Stratefied split to maintain distribution
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df,df["TARGET"]):
    X_train = df.loc[train_index]
    X_test = df.loc[test_index]
    
# store labels    
y_train = pd.DataFrame(X_train.TARGET.loc[train_index])
y_test = pd.DataFrame(X_test.TARGET.loc[test_index])

# drop the target from the training and test set
X_train.drop('TARGET', inplace=True, axis=1)
X_test.drop('TARGET', inplace=True, axis=1)

## Pipeline with FeatureUnion
* NOTE: OneHotEncoder in Scikit-Learn 0.19 can only handle integer categorical inputs
* import it from future_encoders.py supplied from: https://github.com/ageron/handson-ml/blob/master/future_encoders.py. 
* When Scikit-Learn 0.20 is released, you can import it from sklearn.preprocessing instead.

In [9]:
X_train.shape

(246008, 120)

In [17]:
y_train[y_train['TARGET'] == 1].shape

(19860, 1)

In [12]:
# Pipeline classes
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)


class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])

In [13]:
x_cols = [*X_train.columns.values]

In [14]:
# We can still get our column name from the OneHotEncoder object through its get_feature_names method.
preprocess_pipeline = make_pipeline(
    ColumnSelector(columns=x_cols),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            TypeSelector(np.float64),
            SimpleImputer(strategy="median", verbose=1),
            StandardScaler()
        )),
        ("categorical_features", make_pipeline(
            TypeSelector("category"),
            SimpleImputer(strategy="most_frequent", verbose=1),
            OneHotEncoder(sparse=False)
        ))
    ])
)

In [15]:
classifier_pipeline = make_pipeline(
    preprocess_pipeline,
    SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
)


param_grid = {
    "svc__gamma": [0.1 * x for x in range(1, 1)]
}


In [None]:
%%timeit -n 1 -r 1 -t x = range(10)
model = classifier_pipeline.fit(X_train, y_train.values.ravel())

In [None]:
classifier_model = GridSearchCV(classifier_pipeline, param_grid, cv=1)
classifier_model.fit(X_train, y_train)

y_score = classifier_model.decision_function(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = roc_auc_score(y_test, y_score)

# Plot ROC curve
plt.figure(figsize=(16, 12))
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)', size=16)
plt.ylabel('True Positive Rate (Sensitivity)', size=16)
plt.title('ROC Curve', size=20)
plt.legend(fontsize=14);

  y = column_or_1d(y, warn=True)


In [None]:
## column transformer alternative to the Feature Union

In [None]:
# Binning and encoding numeric columns with the new KBinsDiscretizer
https://medium.com/dunder-data/from-pandas-to-scikit-learn-a-new-exciting-workflow-e88e2271ef62

In [None]:
## revisit the problem of training a larger data set - upsample
https://elitedatascience.com/imbalanced-classes