# Pipelining Demo

Purpose: 
- Demo Scikit-Learn's Pipeline Class. 
- Show why it's a valuable part of a data science machine learning pipeline.

By: Chris Gian, XPO Logistics
Sept. 16, 2018


# Agenda
1. Generate Data 
2. Show Pipeline Steps
    Transformations:
        - Multiply
        - Censor
        - Scale
    Classifer:
    - Toy Classifer: Mean Check
    
3. Execute / Optimize Pipeline
    - Define a grid of parameters to attempt
    - Optimize with respect to "scorer" function


# Generate Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import samples_generator
X, y = samples_generator.make_classification(n_informative=5, n_redundant=0, random_state=42)
X,y

(array([[ 0.61118028,  0.07396296, -0.49596905, ..., -0.51753365,
         -0.37339927, -0.70521074],
        [-0.55470506, -1.26634051, -1.03437283, ..., -0.05798395,
          0.07377011,  0.60247721],
        [ 0.72456704, -0.22624522,  1.28626861, ...,  1.06456868,
         -0.45374431,  0.44663973],
        ...,
        [ 1.25561121,  0.40561759,  1.5316888 , ...,  0.71500701,
          0.48056211,  0.40041203],
        [ 1.72707396, -0.00827807,  1.20562808, ...,  0.69476103,
          1.3238748 ,  0.93299664],
        [-0.47240735, -0.03014427,  1.7691167 , ..., -0.56770578,
          0.28012139,  0.3905229 ]]),
 array([0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
        0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1]))

## Pipeline Steps
Create Transformation Classes with methods:
- Fit
- Transform

"Skip" parameter to disable transform step

## Pipeline Steps
**Multiplier**

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
class multiplier(BaseEstimator,TransformerMixin):
    def __init__(self,weight, skip = False):
        self.weight = weight
        self.skip = skip
        
    def fit(self, X,y=None):
        return self
    
    def transform(self, X):
        if self.skip:
            return X
        else:
            return  X*self.weight
        
print(
"""
Before:
    {0}
After:
    {1}
""".format(X[:1],multiplier(weight =1.2).fit_transform(X)[:1]))



Before:
    [[ 0.61118028  0.07396296 -0.49596905  0.41540916  0.55537682 -0.70362006
  -1.17357021  0.96662085  0.34361859  0.22892446 -0.1681149   0.88246788
  -0.5989244  -0.6228705   1.04853094 -0.37655265  0.7959065  -0.51753365
  -0.37339927 -0.70521074]]
After:
    [[ 0.73341634  0.08875556 -0.59516286  0.498491    0.66645218 -0.84434408
  -1.40828425  1.15994502  0.41234231  0.27470935 -0.20173787  1.05896146
  -0.71870928 -0.7474446   1.25823713 -0.45186318  0.95508781 -0.62104038
  -0.44807912 -0.84625288]]



## Pipeline Steps
**Min Max Scaler**

In [3]:
from sklearn.preprocessing import MinMaxScaler
print(
    """
Before:
    {0}
After:
    {1}
    """.format(
        X[:1],
        MinMaxScaler().fit_transform(X)[:1]))


Before:
    [[ 0.61118028  0.07396296 -0.49596905  0.41540916  0.55537682 -0.70362006
  -1.17357021  0.96662085  0.34361859  0.22892446 -0.1681149   0.88246788
  -0.5989244  -0.6228705   1.04853094 -0.37655265  0.7959065  -0.51753365
  -0.37339927 -0.70521074]]
After:
    [[0.57360936 0.42336322 0.47433272 0.47648083 0.72273303 0.30206294
  0.3030466  0.6102198  0.57392337 0.6861751  0.46026553 0.54467303
  0.42583132 0.35792775 0.74513815 0.44273105 0.59644715 0.42316892
  0.4241669  0.30901421]]
    


## Pipeline Steps
**Censor**

In [4]:
class censor(BaseEstimator,TransformerMixin):
    def __init__(self,censor_threshold, skip = False
                ):
        self.censor_threshold = censor_threshold
        self.skip = skip
        
    def fit(self, X,y=None):
        return self
    
    def transform(self, X):
        if self.skip:
            return X
        else:
            return [[0 if i<self.censor_threshold else i for i in x] for x in X] 
print(
    """
Before:
    {0}
After:
    {1}
    """.format(X[:1],censor(censor_threshold=.1).fit_transform(X)[:1]))


Before:
    [[ 0.61118028  0.07396296 -0.49596905  0.41540916  0.55537682 -0.70362006
  -1.17357021  0.96662085  0.34361859  0.22892446 -0.1681149   0.88246788
  -0.5989244  -0.6228705   1.04853094 -0.37655265  0.7959065  -0.51753365
  -0.37339927 -0.70521074]]
After:
    [[0.6111802795578946, 0, 0, 0.41540916386044835, 0.5553768168069081, 0, 0, 0.9666208507629566, 0.34361858953521734, 0.22892445914495796, 0, 0.8824678818318511, 0, 0, 1.0485309385076005, 0, 0.795906504847474, 0, 0, 0]]
    


## Pipeline Steps

**Classifier**
- For each row, take column-wise mean.
- boolean of param >= tcolumn-wise mean value. 

Note: Classifier can be exchanged for elasticsearch lookup operation

In [5]:
from sklearn.base import ClassifierMixin
class MyClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, param):
        self.param = param
        pass
    def fit(self, X, y):
         # Some code
        pass
    def predict(self, X):
        # average all columns
        X = X.mean(axis = 1)
        # if mean is >= lr, class = 1, else 0 
        y_out = [1 if x >= self.param else 0 for x in X]
        return y_out

## Pipeline Steps

**Classifier with confusion matrix**

In [6]:
from sklearn.metrics import confusion_matrix
test = MyClassifier(param=0)
test.fit(X,y)
y_pred = test.predict(X)
y_true = y
confusion_matrix(y_pred=y_pred, y_true=y_true)

array([[21, 29],
       [21, 29]])

## Pipeline Assembly

In [7]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('mult',multiplier(weight = 1)),
    ('censor',censor(censor_threshold= 1)),
    ('scale',MinMaxScaler(feature_range=(0,1))),
    ('Classifier',MyClassifier(param = .1))
])

# Pipeline Execution

In [8]:
# Instantiate Class
p = pipe
# Fit Method
p.fit(X,y)
# Predict Method
Y_pred = p.predict(X)
# First 10 Results
Y_pred[:10]

[0, 1, 1, 0, 0, 1, 1, 1, 0, 0]

# Optimization with Pipeline
## Get Parameters

In [9]:
pipe.get_params()

{'memory': None,
 'steps': [('mult', multiplier(skip=False, weight=1)),
  ('censor', censor(censor_threshold=1, skip=False)),
  ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))),
  ('Classifier', MyClassifier(param=0.1))],
 'mult': multiplier(skip=False, weight=1),
 'censor': censor(censor_threshold=1, skip=False),
 'scale': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'Classifier': MyClassifier(param=0.1),
 'mult__skip': False,
 'mult__weight': 1,
 'censor__censor_threshold': 1,
 'censor__skip': False,
 'scale__copy': True,
 'scale__feature_range': (0, 1),
 'Classifier__param': 0.1}

# Optimization with Pipeline
## Define Grid Sweep
This is where we can add, remove, alter transformation steps.

In [10]:
param_grid = {
    'Classifier__param': np.random.random_sample(3),
    'mult__skip': [False,True],
    'mult__weight': np.random.random_sample(3),
    'censor__skip': [False,True],
    'censor__censor_threshold':np.random.random_sample(3)
  }

# Optimization with Pipeline
## Executing Gridsearch
- two fold cross validation
- accuracy metric

[Customer Scorer](http://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html#sphx-glr-auto-examples-model-selection-plot-multi-metric-evaluation-py)


[Disable CV](https://stackoverflow.com/questions/44636370/scikit-learn-gridsearchcv-without-cross-validation-unsupervised-learning/44661188)




In [11]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, cv=2, param_grid=param_grid,return_train_score=True,scoring='accuracy')
grid.fit(X,y)

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('mult', multiplier(skip=False, weight=1)), ('censor', censor(censor_threshold=1, skip=False)), ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('Classifier', MyClassifier(param=0.1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'Classifier__param': array([0.23049, 0.53632, 0.70863]), 'mult__skip': [False, True], 'mult__weight': array([0.37311, 0.96173, 0.69051]), 'censor__skip': [False, True], 'censor__censor_threshold': array([0.94052, 0.28009, 0.42002])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

# Optimization with Pipeline
## Look at Best Performance

In [12]:
print(
    """
Best Estimator:
    {0}
    """.format(grid.best_estimator_))    


Best Estimator:
    Pipeline(memory=None,
     steps=[('mult', multiplier(skip=False, weight=0.3731063934850768)), ('censor', censor(censor_threshold=0.2800871331039577, skip=False)), ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('Classifier', MyClassifier(param=0.23049036517641797))])
    


# Optimization with Pipeline
## Look at Best Performance

In [13]:
df_out = pd.DataFrame(grid.cv_results_)[['rank_test_score','params','mean_test_score']]
params = pd.DataFrame([i for i in df_out.params])
params.columns = ["Parameter_"+i for i in params.columns]
pd.concat([df_out.drop(columns = 'params'),params], axis = 1).head(10)

Unnamed: 0,rank_test_score,mean_test_score,Parameter_Classifier__param,Parameter_censor__censor_threshold,Parameter_censor__skip,Parameter_mult__skip,Parameter_mult__weight
0,18,0.5,0.23049,0.940522,False,False,0.373106
1,15,0.52,0.23049,0.940522,False,False,0.961727
2,16,0.51,0.23049,0.940522,False,False,0.690506
3,2,0.54,0.23049,0.940522,False,True,0.373106
4,2,0.54,0.23049,0.940522,False,True,0.961727
5,2,0.54,0.23049,0.940522,False,True,0.690506
6,18,0.5,0.23049,0.940522,True,False,0.373106
7,18,0.5,0.23049,0.940522,True,False,0.961727
8,18,0.5,0.23049,0.940522,True,False,0.690506
9,18,0.5,0.23049,0.940522,True,True,0.373106
