In [8]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
def make_X_y(n_samples=500, nan_loc=((2, 3), (17, 1), (4, 12)), random_state=None):
    """Build a synthetic binary-classification dataset with a few missing cells.

    Parameters
    ----------
    n_samples : int, default 500
        Number of rows requested from ``make_classification``.
    nan_loc : iterable of (row, column) pairs
        Positional (``iloc``) coordinates overwritten with ``np.nan`` so that
        the imputation step of a downstream pipeline has something to do.
        Every pair must lie inside the generated frame, otherwise ``iloc``
        raises ``IndexError``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``make_classification``. ``None`` keeps the generator
        unseeded, so each call produces different data.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix with columns named ``var_0``, ``var_1``, ...
    y : pandas.Series
        Class labels aligned with the rows of ``X``.
    """
    features, labels = make_classification(
        n_samples=n_samples, random_state=random_state
    )
    column_names = ['var_' + str(i) for i in range(features.shape[1])]
    X = pd.DataFrame(features, columns=column_names)
    y = pd.Series(labels)

    # Punch the requested holes into the feature matrix.
    for row, col in nan_loc:
        X.iloc[row, col] = np.nan

    return X, y

# Seed the generator so the dataset is identical on every Restart & Run All.
X, y = make_X_y(random_state=42)

In [10]:
from sklearn.model_selection import train_test_split

# Seed the shuffle so the train/test partition -- and therefore every score
# reported further down -- is the same on each Restart & Run All. The split
# proportions are left at the library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

In [12]:
# Impute -> scale -> classify. The step names 'scaler' and 'clf' are used as
# keys by the hyper-parameter search below, so they must not be renamed.
# The forest is seeded so that refitting on the same data gives the same model.
model = Pipeline(
    steps=[
        ("Impute", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
        ("clf", RandomForestClassifier(max_depth=3, random_state=42)),
    ]
)

In [13]:
# Fit every pipeline step in order, using the training split only.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('Impute',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('clf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=3, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
     

In [14]:
from sklearn.metrics import roc_auc_score

# Keep only the second probability column (index 1) as the ranking score
# for ROC AUC on the held-out split.
prediction = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_true=y_test, y_score=prediction))

0.943498452012384


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

# Search space keyed by pipeline step name: the whole 'scaler' step is swapped
# between two scalers, while 'clf__<param>' reaches into the forest.
# 2 scalers x 3 depths x 4 forest sizes = 24 candidate combinations.
parameters = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'clf__max_depth': [3, 5, 7],
    'clf__n_estimators': [10, 25, 50, 100],
    }

# n_iter=10 samples 10 of the 24 combinations; random_state pins which ones are
# drawn so that best_params_ / cv_results_ do not change between runs.
grid = RandomizedSearchCV(
    model, parameters, cv=3, n_iter=10, random_state=42
).fit(X_train, y_train)

# No `scoring` was passed, so score() falls back to the pipeline's own default
# metric -- this is not the ROC AUC printed earlier in the notebook.
print('Training set score: {}'.format(grid.score(X_train, y_train)))
print('Test set score: {}'.format(grid.score(X_test, y_test)))

Training set score: 0.944
Test set score: 0.856


In [16]:
# Parameter combination the search selected as best.
grid.best_params_

{'scaler': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'clf__n_estimators': 25,
 'clf__max_depth': 5}

In [17]:
# Full pipeline configured with the selected parameters.
grid.best_estimator_

Pipeline(memory=None,
         steps=[('Impute',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('clf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=5, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=25, n_jobs=None,
      

In [20]:
# Tabulate the raw search results: one row per sampled candidate, with
# timings, per-fold test scores, and the resulting rank.
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,param_clf__n_estimators,param_clf__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.060088,0.002943,0.004997,0.000287,"MinMaxScaler(copy=True, feature_range=(0, 1))",50,5,"{'scaler': MinMaxScaler(copy=True, feature_ran...",0.856,0.872,0.904,0.877333,0.019956,7
1,0.030568,0.001245,0.002739,2e-05,"MinMaxScaler(copy=True, feature_range=(0, 1))",25,7,"{'scaler': MinMaxScaler(copy=True, feature_ran...",0.856,0.88,0.904,0.88,0.019596,3
2,0.053679,0.000254,0.004827,0.000231,"MinMaxScaler(copy=True, feature_range=(0, 1))",50,3,"{'scaler': MinMaxScaler(copy=True, feature_ran...",0.864,0.872,0.904,0.88,0.017282,3
3,0.015129,0.000495,0.001809,2.4e-05,"MinMaxScaler(copy=True, feature_range=(0, 1))",10,7,"{'scaler': MinMaxScaler(copy=True, feature_ran...",0.872,0.872,0.888,0.877333,0.007542,7
4,0.034575,0.000472,0.003136,0.000253,"MinMaxScaler(copy=True, feature_range=(0, 1))",25,5,"{'scaler': MinMaxScaler(copy=True, feature_ran...",0.872,0.872,0.904,0.882667,0.015085,1
5,0.07068,0.001159,0.005779,0.000548,"StandardScaler(copy=True, with_mean=True, with...",50,7,"{'scaler': StandardScaler(copy=True, with_mean...",0.856,0.88,0.904,0.88,0.019596,3
6,0.127981,0.002463,0.009026,0.00061,"StandardScaler(copy=True, with_mean=True, with...",100,3,"{'scaler': StandardScaler(copy=True, with_mean...",0.848,0.88,0.904,0.877333,0.02294,7
7,0.033921,0.0003,0.003364,0.000269,"StandardScaler(copy=True, with_mean=True, with...",25,3,"{'scaler': StandardScaler(copy=True, with_mean...",0.864,0.872,0.904,0.88,0.017282,3
8,0.121209,0.001894,0.008477,0.000814,"MinMaxScaler(copy=True, feature_range=(0, 1))",100,7,"{'scaler': MinMaxScaler(copy=True, feature_ran...",0.872,0.872,0.904,0.882667,0.015085,1
9,0.015374,0.00139,0.001959,0.00031,"StandardScaler(copy=True, with_mean=True, with...",10,7,"{'scaler': StandardScaler(copy=True, with_mean...",0.872,0.864,0.88,0.872,0.006532,10
