In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
from sklearn import datasets
from model_selection import ClfSwitcher, RegSwitcher
import pandas as pd

In [3]:
# Increase the number of rows to show in notebook
pd.set_option("display.max_rows", 2000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [4]:
iris = datasets.load_iris()

### Without pipeline

#### Classification

In [5]:
parameters = [
    {
        'estimator': [RandomForestClassifier()],
        'estimator__n_estimators':[150, 200], 
        'estimator__max_depth':[2, 3]
    },
    {
        'estimator':[SGDClassifier()],
        'estimator__alpha': (1e-2, 1e-3, 1e-1)
    }
]

In [6]:
gs = GridSearchCV(ClfSwitcher(), parameters, cv=3, n_jobs=3, verbose = 10)
gs.fit(iris.data, iris.target)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    2.7s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    3.0s
[Parallel(n_jobs=3)]: Done  19 out of  21 | elapsed:    3.1s remaining:    0.2s
[Parallel(n_jobs=3)]: Done  21 out of  21 | elapsed:    3.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=ClfSwitcher(estimator=RandomForestClassifier(bootstrap=True,
                                                                    class_weight=None,
                                                                    criterion='gini',
                                                                    max_depth=None,
                                                                    max_features='auto',
                                                                    max_leaf_nodes=None,
                                                                    min_impurity_decrease=0.0,
                                                                    min_impurity_split=None,
                                                                    min_samples_leaf=1,
                                                                    min_samples_split=2,
                                                                

In [7]:
cv_res = pd.DataFrame(gs.cv_results_)

In [8]:
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__max_depth,param_estimator__n_estimators,param_estimator__alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.159,0.0041,0.022,0.0014,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,150.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9608,0.902,0.9792,0.9467,0.0329,4
1,0.201,0.0131,0.0247,0.0053,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,200.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9608,0.9216,0.9792,0.9533,0.024,3
2,0.1583,0.0045,0.0153,0.0012,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,150.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9804,0.9216,0.9792,0.96,0.0276,1
3,0.219,0.0059,0.037,0.0099,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,200.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9804,0.9216,0.9792,0.96,0.0276,1
4,0.003,0.0,0.0067,0.0034,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.01,"{'estimator': SGDClassifier(alpha=0.0001, aver...",0.7843,0.7451,0.875,0.8,0.0539,5
5,0.0027,0.0005,0.0033,0.0047,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.001,"{'estimator': SGDClassifier(alpha=0.0001, aver...",0.7059,0.8039,0.7292,0.7467,0.0422,7
6,0.003,0.0008,0.0007,0.0005,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.1,"{'estimator': SGDClassifier(alpha=0.0001, aver...",0.7647,0.9412,0.6875,0.8,0.1061,5


#### Regression

In [36]:
parameters = [
    {
        'estimator': [RandomForestRegressor()],
        'estimator__n_estimators':[150, 200], 
        'estimator__max_depth':[2, 3]
    },
    {
        'estimator':[LinearRegression()],
        'estimator__fit_intercept': [True, False]
    }
]

In [37]:
gs = GridSearchCV(RegSwitcher(), parameters, cv=3, n_jobs=3, verbose=10, scoring='neg_mean_squared_error')
gs.fit(iris.data, iris.target)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0911s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   5 out of  18 | elapsed:    0.2s remaining:    0.6s
[Parallel(n_jobs=3)]: Done  18 out of  18 | elapsed:    0.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RegSwitcher(estimator=RandomForestRegressor(bootstrap=True,
                                                                   criterion='mse',
                                                                   max_depth=None,
                                                                   max_features='auto',
                                                                   max_leaf_nodes=None,
                                                                   min_impurity_decrease=0.0,
                                                                   min_impurity_split=None,
                                                                   min_samples_leaf=1,
                                                                   min_samples_split=2,
                                                                   min_weight_fraction_leaf=0.0,
                                                                

In [38]:
cv_res = pd.DataFrame(gs.cv_results_)

In [39]:
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__max_depth,param_estimator__n_estimators,param_estimator__fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0758,0.0034,0.0069,0.0065,"RandomForestRegressor(bootstrap=True, criterio...",2.0,150.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.0558,-0.8309,-1.0,-0.9622,0.0956,3
1,0.1392,0.0166,0.0117,0.0017,"RandomForestRegressor(bootstrap=True, criterio...",2.0,200.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.073,-0.8518,-1.0,-0.9749,0.0921,4
2,0.0684,0.0058,0.0079,0.0064,"RandomForestRegressor(bootstrap=True, criterio...",3.0,150.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.1238,-0.8395,-1.0,-0.9878,0.1164,5
3,0.0862,0.0104,0.0122,0.0086,"RandomForestRegressor(bootstrap=True, criterio...",3.0,200.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.1245,-0.8435,-1.0,-0.9893,0.115,6
4,0.002,0.0014,0.0,0.0,"LinearRegression(copy_X=True, fit_intercept=Fa...",,,True,"{'estimator': LinearRegression(copy_X=True, fi...",-0.5557,-0.1684,-0.3833,-0.3692,0.1584,2
5,0.0,0.0,0.0,0.0,"LinearRegression(copy_X=True, fit_intercept=Fa...",,,False,"{'estimator': LinearRegression(copy_X=True, fi...",-0.4434,-0.1488,-0.373,-0.3217,0.1256,1


In [13]:
model = RegSwitcher(RandomForestRegressor())
model.fit(iris.data, iris.target)



RegSwitcher(estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                            max_depth=None, max_features='auto',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=10, n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False))

In [14]:
model.score(iris.data, iris.target)

0.9919

### With pipeline

In [9]:
parameters = [
    {
        'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__n_estimators':[150, 200], 
        'clf__estimator__max_depth':[2, 3]
    },
    {
        'clf__estimator':[SGDClassifier()],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1)
    }
]

In [10]:
pipeline = Pipeline([('clf', ClfSwitcher())])

In [11]:
gs = GridSearchCV(pipeline, parameters, cv=3, n_jobs=3)
gs.fit(iris.data, iris.target)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        ClfSwitcher(estimator=RandomForestClassifier(bootstrap=True,
                                                                                     class_weight=None,
                                                                                     criterion='gini',
                                                                                     max_depth=None,
                                                                                     max_features='auto',
                                                                                     max_leaf_nodes=None,
                                                                                     min_impurity_decrease=0.0,
                                                                                     min_impurity_split=None,
             

In [12]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('clf',
                 ClfSwitcher(estimator=RandomForestClassifier(bootstrap=True,
                                                              class_weight=None,
                                                              criterion='gini',
                                                              max_depth=2,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split=None,
                                                              min_samples_leaf=1,
                                                              min_samples_split=2,
                                                              min_weight_fraction_leaf=0.0,
                                         

In [13]:
cv_res = pd.DataFrame(gs.cv_results_)

In [14]:
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__estimator,param_clf__estimator__max_depth,param_clf__estimator__n_estimators,param_clf__estimator__alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0907,0.0066,0.008,0.0,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,150.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9804,0.9216,0.9792,0.96,0.0276,4
1,0.1307,0.0071,0.0123,0.0033,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,200.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9804,0.9412,0.9792,0.9667,0.0183,1
2,0.097,0.0051,0.0077,0.0005,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,150.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9804,0.9216,1.0,0.9667,0.0333,1
3,0.1223,0.0041,0.01,0.0042,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,200.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9804,0.9412,0.9792,0.9667,0.0183,1
4,0.001,0.0,0.0003,0.0005,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.01,"{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.7843,0.7451,0.9375,0.82,0.0822,7
5,0.001,0.0,0.0007,0.0005,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.001,"{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.9608,0.902,0.8958,0.92,0.0294,5
6,0.001,0.0,0.0,0.0,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.1,"{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.9412,0.8039,0.9375,0.8933,0.0642,6


### Testing with homemade feature selector

In [18]:
import os
import sys

In [21]:
# Set the root path of the project
PROJECT_ROOT_PATH = os.path.dirname(os.path.abspath(os.path.join(os.getcwd())))
sys.path.append(PROJECT_ROOT_PATH)
sys.path.append(f'{PROJECT_ROOT_PATH}\ds_toolbox')

In [22]:
from feature_selection.feat_selector import *