# Custom Estimators

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import (BaseEstimator,
                          TransformerMixin,
                          ClassifierMixin,
                          RegressorMixin)

## Transformer

In [2]:
class Scaler(BaseEstimator, TransformerMixin):
    """Scale every column by its largest absolute value.

    ``fit`` learns the per-column maximum absolute value and ``transform``
    divides the input by it.

    Assumes ``X`` is a pandas object (it must provide ``.abs()``) — the
    example in this notebook passes a DataFrame.
    """

    def fit(self, X, y=None):
        """Learn the per-column maximum absolute value of ``X``.

        ``y`` is accepted for interface compatibility and is not used.
        Returns ``self`` so calls can be chained.
        """
        # abs() must be applied *before* max(): taking max() first would pick
        # the largest signed value, so a column such as [-4, 2] would be
        # scaled by 2 instead of 4.
        self.max_ = X.abs().max()
        return self

    def transform(self, X):
        """Return ``X`` divided by the maxima learned in ``fit``."""
        return X / self.max_
    
    
# Small example frame; it is reused by the following examples.
X = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
scaler = Scaler()
scaler.fit(X)
scaler.transform(X).round(2)

Unnamed: 0,a,b
0,0.33,0.67
1,0.67,0.83
2,1.0,1.0


In [3]:
from sklearn.preprocessing import FunctionTransformer

def double(X):
    """Return ``X`` multiplied by two, using ``X``'s own ``*`` operator."""
    doubled = X * 2
    return doubled

# Wrap the plain function `double` so it can be used through fit/transform.
doubler = FunctionTransformer(double, validate=False)

doubler.fit(X)  # not required, but it keeps the interface
                # uniform
doubler.transform(X)

Unnamed: 0,a,b
0,2,8
1,4,10
2,6,12


## Regressor

In [4]:
class NullRegressor(BaseEstimator, RegressorMixin):
    """Baseline regressor that always predicts the mean of the training target."""

    def fit(self, X, y):
        """Store the mean of ``y``; ``X`` is not used. Returns ``self``."""
        target_mean = y.mean()
        self.mean_ = target_mean
        return self

    def predict(self, X):
        """Return an array holding the stored mean once per row of ``X``."""
        n_rows = X.shape[0]
        return np.full(shape=n_rows, fill_value=self.mean_)
    

# Every row receives the same prediction: the mean of y.
y = np.array([1, 2, 3])
y_ = (NullRegressor()
      .fit(X, y)
      .predict(X))
X.assign(
    y=y,
    predicted=y_,
)


Unnamed: 0,a,b,y,predicted
0,1,4,1,2.0
1,2,5,2,2.0
2,3,6,3,2.0


## Classifier

In [5]:
class NullClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that ignores the features.

    ``fit`` stores the mean of ``y`` as ``p_``, and every row gets the
    probability pair ``(1 - p_, p_)``.
    NOTE(review): this presumably expects 0/1-encoded labels (as in the
    example that follows), so that the mean is the share of positives —
    confirm before using it with other encodings.
    """

    def fit(self, X, y):
        """Store the mean of ``y``; ``X`` is not used. Returns ``self``."""
        self.p_ = y.mean()
        return self

    def predict(self, X):
        """Return a boolean per row: True when the second probability column is >= 0.5."""
        second_column = self.predict_proba(X)[:, 1]
        return second_column >= 0.5

    def predict_proba(self, X):
        """Return a two-column array with ``1 - p_`` and ``p_`` for every row of ``X``."""
        n_rows = X.shape[0]
        p = np.full(n_rows, self.p_)
        return np.column_stack((1 - p, p))
    

# One positive out of three labels, so the stored mean is 1/3.
y = np.array([1, 0, 0])
nc = NullClassifier()
nc.fit(X, y)
p_ = nc.predict_proba(X)
y_ = nc.predict(X)
X.assign(
    y=y,
    pred_proba_0=p_[:, 0],
    pred_proba_1=p_[:, 1],
    pred_class=y_,
)

Unnamed: 0,a,b,y,pred_proba_0,pred_proba_1,pred_class
0,1,4,1,0.666667,0.333333,False
1,2,5,0,0.666667,0.333333,False
2,3,6,0,0.666667,0.333333,False


---

# Pipeline

In [6]:
X = pd.DataFrame(dict(x1=[1, 2, 3, 4, 6],
                      x2=[3, 2, 3, 5, 8]))
# The target is computed from the complete data, before a value is removed.
y = 3 * X.x1 + 5 * X.x2
# Introduce one missing value in x1. The column is cast to float first
# (NaN cannot live in an integer column) and the cell is set through .loc:
# the chained form `X.x1[2] = ...` writes to a temporary Series, which
# triggers pandas warnings and does not modify X under Copy-on-Write.
X = X.astype({'x1': float})
X.loc[2, 'x1'] = np.nan

X.assign(y=y)

Unnamed: 0,x1,x2,y
0,1.0,3,18
1,2.0,2,16
2,,3,24
3,4.0,5,37
4,6.0,8,58


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Run the two preprocessing steps by hand: impute first, then rescale the
# imputed values.
transformer1 = SimpleImputer(strategy='mean', missing_values=np.nan)
transformer2 = MinMaxScaler(feature_range=(1, 2.5))
X1 = transformer1.fit_transform(X, y)
X2 = transformer2.fit_transform(X1, y)
pd.DataFrame(data=X2, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,1.0,1.25
1,1.3,1.0
2,1.675,1.25
3,1.9,1.75
4,2.5,2.5


In [8]:
from sklearn.pipeline import Pipeline, make_pipeline

# Chain the two transformers into a single object; fit_transform then runs
# both steps in order with one call.
pipe = make_pipeline(transformer1, transformer2)
Xt = pipe.fit_transform(X, y)
pd.DataFrame(Xt, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,1.0,1.25
1,1.3,1.0
2,1.675,1.25
3,1.9,1.75
4,2.5,2.5


In [9]:
pipe.steps

[('simpleimputer',
  SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                missing_values=nan, strategy='mean', verbose=0)),
 ('minmaxscaler', MinMaxScaler(copy=True, feature_range=(1, 2.5)))]

In [10]:
pipe.named_steps

{'simpleimputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='mean', verbose=0),
 'minmaxscaler': MinMaxScaler(copy=True, feature_range=(1, 2.5))}

In [11]:
pipe.steps[0]

('simpleimputer',
 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='mean', verbose=0))

In [12]:
pipe.named_steps['simpleimputer']

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [13]:
# Build the pipeline with explicit step names ('t1', 't2') instead of the
# names that make_pipeline generates.
pipe = Pipeline([('t1', transformer1),
                 ('t2', transformer2)])
pipe.steps

[('t1', SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                missing_values=nan, strategy='mean', verbose=0)),
 ('t2', MinMaxScaler(copy=True, feature_range=(1, 2.5)))]

In [14]:
pipe.named_steps['t1']

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [15]:
# A parameter of a step is addressed as <step name>__<parameter name>;
# here the scaler's feature_range is changed before refitting.
pipe.set_params(t2__feature_range=(5, 8))
Xt = pipe.fit_transform(X, y)
pd.DataFrame(Xt, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,5.0,5.5
1,5.6,5.0
2,6.35,5.5
3,6.8,6.5
4,8.0,8.0


In [16]:
# Replace the 't2' step with None and refit: the result is imputed but no
# longer rescaled.
pipe.set_params(t2=None)
Xt = pipe.fit_transform(X, y)
pd.DataFrame(Xt, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,1.0,3.0
1,2.0,2.0
2,3.25,3.0
3,4.0,5.0
4,6.0,8.0


In [17]:
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Impute, rescale and regress step by step, feeding each result to the next.
transformer1 = SimpleImputer(strategy='mean', missing_values=np.nan)
transformer2 = MinMaxScaler(feature_range=(1, 2.5))
regressor = LinearRegression()
X1 = transformer1.fit_transform(X, y)
X2 = transformer2.fit_transform(X1, y)
regressor.fit(X2, y)
y_ = regressor.predict(X2)
X.assign(
    y=y,
    predicted=y_.round(2),
)

Unnamed: 0,x1,x2,y,predicted
0,1.0,3,18,18.12
1,2.0,2,16,15.68
2,,3,24,24.35
3,4.0,5,37,36.84
4,6.0,8,58,58.0


In [18]:
pipe = Pipeline([('t1', transformer1),
                 ('t2', transformer2),
                 ('r', regressor)])
# Fit and predict on the raw X: the pipeline performs the imputation and the
# scaling itself, so it must not be fed the already-transformed X2.
y_ = pipe.fit(X, y).predict(X)
X.assign(y=y, predicted=y_.round(2))

Unnamed: 0,x1,x2,y,predicted
0,1.0,3,18,18.12
1,2.0,2,16,15.68
2,,3,24,24.35
3,4.0,5,37,36.84
4,6.0,8,58,58.0


---

# FeatureUnion

In [19]:
from sklearn.pipeline import FeatureUnion


def extract(columns):
    """Build a FunctionTransformer whose function returns ``X[columns]``.

    ``columns`` is used to index the input as ``X[columns]``; the calls in
    this notebook pass a list of column names.
    """
    return FunctionTransformer(
        func=lambda X: X[columns],
        validate=False,
    )

# Two independent branches: x1 goes through transformer1 only and x2 through
# transformer2 only; the union combines both results into one array.
union = FeatureUnion(transformer_list=[
    ('transform_x1', make_pipeline(extract(['x1']), transformer1)),
    ('transform_x2', make_pipeline(extract(['x2']), transformer2)),
])

Xt = union.fit_transform(X, y)
pd.DataFrame(data=Xt, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,1.0,1.25
1,2.0,1.0
2,3.25,1.25
3,4.0,1.75
4,6.0,2.5


In [20]:
X.assign(transformed_x1=Xt[:, 0], transformed_x2=Xt[:, 1])

Unnamed: 0,x1,x2,transformed_x1,transformed_x2
0,1.0,3,1.0,1.25
1,2.0,2,2.0,1.0
2,,3,3.25,1.25
3,4.0,5,4.0,1.75
4,6.0,8,6.0,2.5


In [21]:
union.transformer_list

[('transform_x1', Pipeline(memory=None,
           steps=[('functiontransformer',
                   FunctionTransformer(accept_sparse=False, check_inverse=True,
                                       func=<function extract.<locals>.<lambda> at 0x000001BE61C2AE18>,
                                       inv_kw_args=None, inverse_func=None,
                                       kw_args=None, pass_y='deprecated',
                                       validate=False)),
                  ('simpleimputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='mean',
                                 verbose=0))],
           verbose=False)), ('transform_x2', Pipeline(memory=None,
           steps=[('functiontransformer',
                   FunctionTransformer(accept_sparse=False, check_inverse=True,
                                       func=<function extract.<locals>.<lambda> at 0x000001BE61C6E048>,

In [22]:
# A FeatureUnion can itself be a pipeline step: the 'transform' step handles
# x1 and x2 in separate branches and the 'regress' step is fitted on the
# combined result.
pipe = Pipeline([
    ('transform', FeatureUnion([
        ('transform_x1', make_pipeline(extract(['x1']), transformer1)),
        ('transform_x2', make_pipeline(extract(['x2']), transformer2))
    ])),
    ('regress', LinearRegression())
])
y_ = pipe.fit(X, y).predict(X)
X.assign(y=y, predicted=y_.round(2))

Unnamed: 0,x1,x2,y,predicted
0,1.0,3,18,18.12
1,2.0,2,16,15.68
2,,3,24,24.35
3,4.0,5,37,36.84
4,6.0,8,58,58.0


---

# Cross-validation

In [23]:
from sklearn.model_selection import KFold

X = pd.DataFrame(dict(x=[1, 2, 3, 4, 5, 6]))
# split() yields one pair of index arrays per fold; with 3 folds the result
# unpacks into exactly three pairs.
s1, s2, s3 = KFold(3).split(X)


In [24]:
s1

(array([2, 3, 4, 5]), array([0, 1]))

In [25]:
s2

(array([0, 1, 4, 5]), array([2, 3]))

In [26]:
s3

(array([0, 1, 2, 3]), array([4, 5]))

In [27]:
for s in KFold(3).split(X):
    print(s)

(array([2, 3, 4, 5]), array([0, 1]))
(array([0, 1, 4, 5]), array([2, 3]))
(array([0, 1, 2, 3]), array([4, 5]))


In [28]:
KFold(3).split(X)

<generator object _BaseKFold.split at 0x000001BE61A41200>

In [29]:
list(KFold(3).split(X))

[(array([2, 3, 4, 5]), array([0, 1])),
 (array([0, 1, 4, 5]), array([2, 3])),
 (array([0, 1, 2, 3]), array([4, 5]))]

In [30]:
from sklearn.model_selection import LeaveOneOut

list(LeaveOneOut().split(X))

[(array([1, 2, 3, 4, 5]), array([0])),
 (array([0, 2, 3, 4, 5]), array([1])),
 (array([0, 1, 3, 4, 5]), array([2])),
 (array([0, 1, 2, 4, 5]), array([3])),
 (array([0, 1, 2, 3, 5]), array([4])),
 (array([0, 1, 2, 3, 4]), array([5]))]

In [31]:
from sklearn.model_selection import ShuffleSplit

list(ShuffleSplit(n_splits=5, test_size=0.33, random_state=0).split(X))

[(array([1, 3, 0, 4]), array([5, 2])),
 (array([4, 0, 2, 5]), array([1, 3])),
 (array([1, 2, 4, 0]), array([3, 5])),
 (array([3, 4, 1, 0]), array([5, 2])),
 (array([3, 5, 1, 0]), array([2, 4]))]

---

# Grid Search

In [32]:
from sklearn.model_selection import GridSearchCV

# Pipeline to be tuned; the step names ('impute', 'scale', 'regress') are the
# prefixes used by the keys of the parameter grid.
pipe = Pipeline([('impute', SimpleImputer(missing_values=np.nan)),
                 ('scale', MinMaxScaler()),
                 ('regress', LinearRegression())])

# Candidate values per parameter, keyed as <step name>__<parameter name>.
grid = {
    'impute__strategy': ['mean', 'median'],
    'scale__feature_range': [(1, 2), (4, 7)],
    'regress__fit_intercept': [False, True],
}

from itertools import product
# Preview every combination of the candidate values (2 * 2 * 2 = 8).
[combination for combination in product(*grid.values())]

[('mean', (1, 2), False),
 ('mean', (1, 2), True),
 ('mean', (4, 7), False),
 ('mean', (4, 7), True),
 ('median', (1, 2), False),
 ('median', (1, 2), True),
 ('median', (4, 7), False),
 ('median', (4, 7), True)]

In [33]:
X = pd.DataFrame(dict(x1=[1, 2, 3, 4, 6],
                      x2=[3, 2, 3, 5, 8]))
# The target is computed from the complete data, before a value is removed.
y = 3 * X.x1 + 5 * X.x2
# Introduce one missing value in x1. The column is cast to float first
# (NaN cannot live in an integer column) and the cell is set through .loc:
# the chained form `X.x1[2] = ...` writes to a temporary Series, which
# triggers pandas warnings and does not modify X under Copy-on-Write.
X = X.astype({'x1': float})
X.loc[2, 'x1'] = np.nan

# Evaluate every combination in `grid` with 2-fold cross-validation.
# NOTE(review): the `iid` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24, so this call only runs on older releases. Dropping it
# changes how fold scores are averaged, so the scores shown in this notebook
# may change as well.
gs = GridSearchCV(pipe, grid, return_train_score=False, cv=2, iid=True).fit(X, y)
gs.best_params_

{'impute__strategy': 'mean',
 'regress__fit_intercept': False,
 'scale__feature_range': (4, 7)}

In [34]:
gs.best_score_

-3.7291826505267105

In [35]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('impute',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('scale', MinMaxScaler(copy=True, feature_range=(4, 7))),
                ('regress',
                 LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [36]:
list(gs.cv_results_.keys())

['mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_impute__strategy',
 'param_regress__fit_intercept',
 'param_scale__feature_range',
 'params',
 'split0_test_score',
 'split1_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score']

In [37]:
gs.cv_results_['mean_test_score']

array([-12.16606469,  -3.72918265, -19.80124117, -19.80124117,
       -12.16606469,  -3.72918265, -19.80124117, -19.80124117])

In [38]:
gs.cv_results_['params']

[{'impute__strategy': 'mean',
  'regress__fit_intercept': False,
  'scale__feature_range': (1, 2)},
 {'impute__strategy': 'mean',
  'regress__fit_intercept': False,
  'scale__feature_range': (4, 7)},
 {'impute__strategy': 'mean',
  'regress__fit_intercept': True,
  'scale__feature_range': (1, 2)},
 {'impute__strategy': 'mean',
  'regress__fit_intercept': True,
  'scale__feature_range': (4, 7)},
 {'impute__strategy': 'median',
  'regress__fit_intercept': False,
  'scale__feature_range': (1, 2)},
 {'impute__strategy': 'median',
  'regress__fit_intercept': False,
  'scale__feature_range': (4, 7)},
 {'impute__strategy': 'median',
  'regress__fit_intercept': True,
  'scale__feature_range': (1, 2)},
 {'impute__strategy': 'median',
  'regress__fit_intercept': True,
  'scale__feature_range': (4, 7)}]

In [39]:
# Same search, but `cv` is given as an explicit splitter object (KFold(2))
# instead of a number of folds.
# NOTE(review): the `iid` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24, so this call only runs on older releases.
gs = GridSearchCV(pipe, grid, return_train_score=False, cv=KFold(2), iid=True).fit(X, y)
list(gs.cv_results_.keys())

['mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_impute__strategy',
 'param_regress__fit_intercept',
 'param_scale__feature_range',
 'params',
 'split0_test_score',
 'split1_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score']

In [40]:
from sklearn.linear_model import LogisticRegression

# The 'regress' step is left as None: it is a placeholder that the grid
# fills in through its `regress` entries.
pipe = Pipeline([('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
                 ('scale', MinMaxScaler((1, 3))),
                 ('regress', None)])

# A list of two sub-grids, one per candidate estimator. Each sub-grid lists
# only the parameters that belong to its own estimator (fit_intercept for
# LinearRegression, C for LogisticRegression).
grid = [dict(impute__strategy=['mean', 'median'],
             scale__feature_range=[(1, 2), (4, 7)],
             regress__fit_intercept=[False, True],
             regress=[LinearRegression()]),
        dict(regress__C=[1, 1e20],
             regress=[LogisticRegression(solver = 'liblinear')])]

def pp_grid(i):
    """List every value combination of sub-grid ``grid[i]`` in printable form.

    Values that have a ``fit`` attribute are shown by their class name;
    all other values are shown unchanged.
    """
    def pp_est(v):
        if hasattr(v, 'fit'):
            return v.__class__.__name__
        return v

    printable_values = [[pp_est(v) for v in vs] for vs in grid[i].values()]
    return list(product(*printable_values))

pp_grid(0) + pp_grid(1)

[('mean', (1, 2), False, 'LinearRegression'),
 ('mean', (1, 2), True, 'LinearRegression'),
 ('mean', (4, 7), False, 'LinearRegression'),
 ('mean', (4, 7), True, 'LinearRegression'),
 ('median', (1, 2), False, 'LinearRegression'),
 ('median', (1, 2), True, 'LinearRegression'),
 ('median', (4, 7), False, 'LinearRegression'),
 ('median', (4, 7), True, 'LinearRegression'),
 (1, 'LogisticRegression'),
 (1e+20, 'LogisticRegression')]

In [41]:
y = [1, 0, 0, 1, 1]
# NOTE(review): the `iid` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24, so this call only runs on older releases.
gs = GridSearchCV(pipe, grid, scoring='r2', return_train_score=False, cv=3, iid=True).fit(X, y)
# The search is already fitted by the line above; read the result directly
# instead of running the whole search a second time.
gs.best_params_



{'impute__strategy': 'mean',
 'regress': LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False),
 'regress__fit_intercept': False,
 'scale__feature_range': (1, 2)}

---

# Random Search

In [42]:
import scipy as sp

from sklearn.model_selection import RandomizedSearchCV

pipe = Pipeline([('impute', SimpleImputer(missing_values=np.nan)),
                 ('scale', MinMaxScaler()),
                 ('regress', LogisticRegression(solver='liblinear'))])

# Lists are sampled from directly; `regress__C` is drawn from the continuous
# distribution uniform(loc=1, scale=1e10), i.e. the interval [1, 1 + 1e10].
params = dict(impute__strategy=['mean', 'median'],
              scale__feature_range=[(1, 2), (4, 7)],
              regress__C=sp.stats.uniform(1, 1e10))

# random_state fixes the sampled candidates so the result is the same on
# every run; without it each execution reports different best parameters
# (a previously saved output of this cell will therefore not match).
# NOTE(review): the `iid` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24, so this call only runs on older releases.
rs = RandomizedSearchCV(pipe, params, n_iter=5, cv=2, iid=True,
                        return_train_score=False, random_state=0).fit(X, y)
# The search is already fitted by the line above; fitting again would only
# repeat the work.
rs.best_params_

{'impute__strategy': 'mean',
 'regress__C': 5943255026.672545,
 'scale__feature_range': (1, 2)}

---