In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [13]:
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
diabetes_X_scale = preprocessing.scale(diabetes.data)
print(diabetes.data.shape)
print(diabetes.target.shape)
print(diabetes_X_scale.shape)

(442, 10)
(442,)
(442, 10)


In [11]:
np.apply_along_axis(np.mean, 0, diabetes.data)

array([-3.63962254e-16,  1.30991246e-16, -8.01395149e-16,  1.28981793e-16,
       -9.04254047e-17,  1.30112110e-16, -4.56397112e-16,  3.86317424e-16,
       -3.84810333e-16, -3.39848813e-16])

In [12]:
np.apply_along_axis(np.std, 0, diabetes.data)

array([0.04756515, 0.04756515, 0.04756515, 0.04756515, 0.04756515,
       0.04756515, 0.04756515, 0.04756515, 0.04756515, 0.04756515])

In [5]:
np.apply_along_axis(np.mean, 0, diabetes_X_scale)

array([-3.21512550e-17, -8.03781375e-17,  4.82268825e-17,  5.62646963e-17,
       -4.82268825e-17, -5.62646963e-17,  4.42079756e-17,  2.25058785e-16,
       -4.42079756e-17,  3.21512550e-17])

In [10]:
np.apply_along_axis(np.std, 0, diabetes_X_scale)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

# test customization

In [99]:
class Foo:
    def fit(self, x): 
        pass
    
    def transform(self, x):
        return x

In [84]:
sc = preprocessing.StandardScaler()
sc.fit(np.random.random((10,2)))

pipe = Pipeline(steps=[
    ('scale', sc),
    ('foo', Foo),
    ('linear', linear_model.LinearRegression)
])
pipe

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('foo', <class '__main__.Foo'>), ('linear', <class 'sklearn.linear_model.base.LinearRegression'>)])

In [85]:
pipe.fit

<bound method Pipeline.fit of Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('foo', <class '__main__.Foo'>), ('linear', <class 'sklearn.linear_model.base.LinearRegression'>)])>

-----

# Try 01

In [87]:
sc = preprocessing.StandardScaler()
pipe = Pipeline([
    ('std_scaler', sc),
])

In [96]:
x = np.arange(10).reshape(5, 2).astype(np.float64)
x_trans = sc.fit_transform(x)
print(x_trans)

[[-1.41421356 -1.41421356]
 [-0.70710678 -0.70710678]
 [ 0.          0.        ]
 [ 0.70710678  0.70710678]
 [ 1.41421356  1.41421356]]


In [98]:
x_pipe = pipe.fit_transform(x)
print(x_pipe)

[[-1.41421356 -1.41421356]
 [-0.70710678 -0.70710678]
 [ 0.          0.        ]
 [ 0.70710678  0.70710678]
 [ 1.41421356  1.41421356]]


# Try 02

In [106]:
class Foo1:
    def __init__(self):
        self.val = 0
        
    def fit(self, x): 
        self.val = 1
        return self
    
    def transform(self, x):
        return self.val * x
    
class Foo2:
    def __init__(self):
        self.val = 0
        
    def fit(self, x): 
        #self.val = 2
        return self
        
    def transform(self, x):
        return self.val * x

In [107]:
pipe = Pipeline(steps=[
    ('foo 1', Foo1),
    ('foo 2', Foo2)
])
pipe

Pipeline(memory=None,
     steps=[('foo 1', <class '__main__.Foo1'>), ('foo 2', <class '__main__.Foo2'>)])

In [108]:
x = np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [109]:
pipe.fit(x)

AttributeError: 'numpy.ndarray' object has no attribute 'val'

# try 03

[amberjrivera/Pipeline-guide](https://gist.github.com/amberjrivera/8c5c145516f5a2e894681e16a8095b5c)

**A Simple Pipeline Example:**  

All estimators in a pipeline, except for the last one, must be transformers (i.e. they take X, do something to X, and then spit out a transformed X). 

The final estimator can be another transformer, classifer, regressor, etc.

**Custom Transformers**

Often during preprocessing and feature selection, we write our own functions that transform the data (e.g. drop columns, multiply two columns together, etc.). 

To incorporate those actions into your pipeline, you'll likely need to write your own transformer class.

- `TransformerMixin` gives your transformer the very useful `.fit_transform` method
- `BaseEstimator` gives your transformer grid-searchable parameters. This becomes very important later.
- `fit` **ALWAYS returns self.** 
    - Sometimes it can set state variables if you will need those to transform test data later on. Otherwise it just does nothing. Either way, it returns self.

In [75]:
class MyTransformer(TransformerMixin, BaseEstimator):
    '''A template for a custom transformer.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # transform X via code or additional methods
        return X

In [76]:
pipe = Pipeline(steps=[
    ('foo', MyTransformer)
])
pipe

Pipeline(memory=None, steps=[('foo', <class '__main__.MyTransformer'>)])

In [81]:
pipe.fit(x)

Pipeline(memory=None, steps=[('foo', <class '__main__.MyTransformer'>)])

In [82]:
pipe.transform(x)

TypeError: transform() missing 1 required positional argument: 'X'

# Searching

[Pipelines and Custom Transfomers in SKLearn](https://bradzzz.gitbooks.io/ga-seattle-dsi/content/dsi/dsi_05_classification_databases/2.2-lesson/readme.html#guided-practice)

Custom Transformers:  
We can implement custom transformers by extending the BaseClass in Scikit-Learn.

In [56]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class FeatureMultiplier(BaseEstimator, TransformerMixin):
    def __init__(self, factor):
        self.factor = factor

    def transform(self, X, *_):
        return X * self.factor

    def fit(self, *_):
        return self

fm = FeatureMultiplier(2)

test = np.diag((1,2,3,4))
print(test)

fm.transform(test)

[[1 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 4]]


array([[2, 0, 0, 0],
       [0, 4, 0, 0],
       [0, 0, 6, 0],
       [0, 0, 0, 8]])

[Building a custom Python scikit-learn transformer for machine learning.](https://opendevincode.wordpress.com/2015/08/01/building-a-custom-python-scikit-learn-transformer-for-machine-learning/)

In [57]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

class KVExtractor(TransformerMixin):
    def __init__(self, kvpairs):
        self.kpairs = kvpairs
        
    def transform(self, X, *_):
        result = []
        for index, rowdata in X.iterrows():
            rowdict = {}
            for kvp in self.kpairs:
                rowdict.update( { rowdata[ kvp[0] ]: rowdata[ kvp[1] ] } )
            result.append(rowdict)
        return result
    
    def fit(self, *_):
        return self

In [59]:
import pandas as pd

D = pd.DataFrame([ ['a', 1, 'b', 2], ['b', 2, 'c', 3]], columns = ['k1', 'v1', 'k2', 'v2'])
print(D)

  k1  v1 k2  v2
0  a   1  b   2
1  b   2  c   3


In [60]:
kvpairs = [ ['k1', 'v1'], ['k2', 'v2'] ]
KVExtractor( kvpairs ).transform(D)

[{'a': 1, 'b': 2}, {'b': 2, 'c': 3}]

[Creating your own estimator in scikit-learn](http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/)

In [62]:
from sklearn.base import BaseEstimator, ClassifierMixin

class MeanClassifier(BaseEstimator, ClassifierMixin):  
    """An example of classifier"""

    def __init__(self, intValue=0, stringParam="defaultValue", otherParam=None):
        """
        Called when initializing the classifier
        """
        self.intValue = intValue
        self.stringParam = stringParam

        # THIS IS WRONG! Parameters should have same name as attributes
        self.differentParam = otherParam 


    def fit(self, X, y=None):
        """
        This should fit classifier. All the "work" should be done here.

        Note: assert is not a good choice here and you should rather
        use try/except blog with exceptions. This is just for short syntax.
        """

        assert (type(self.intValue) == int), "intValue parameter must be integer"
        assert (type(self.stringParam) == str), "stringValue parameter must be string"
        assert (len(X) == 20), "X must be list with numerical values."

        self.treshold_ = (sum(X)/len(X)) + self.intValue  # mean + intValue

        return self

    def _meaning(self, x):
        # returns True/False according to fitted classifier
        # notice underscore on the beginning
        return( True if x >= self.treshold_ else False )

    def predict(self, X, y=None):
        try:
            getattr(self, "treshold_")
        except AttributeError:
            raise RuntimeError("You must train classifer before predicting data!")

        return([self._meaning(x) for x in X])

    def score(self, X, y=None):
        # counts number of values bigger than mean
        return(sum(self.predict(X))) 

In [64]:
from sklearn.grid_search import GridSearchCV

X_train = [i for i in range(0, 100, 5)]  
X_test = [i + 3 for i in range(-5, 95, 5)]  
tuned_params = {"intValue" : [-10,-1,0,1,10]}

gs = GridSearchCV(MeanClassifier(), tuned_params)

In [65]:
# for some reason I have to pass y with same shape
# otherwise gridsearch throws an error. Not sure why.
gs.fit(X_test, y=[1 for i in range(20)])

gs.best_params_

AssertionError: X must be list with numerical values.