In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
from sklearn import datasets
from model_selection import ClfSwitcher, RegSwitcher
import pandas as pd

In [3]:
def _four_decimals(value):
    """Format a float with exactly four digits after the decimal point."""
    return '%.4f' % value


# Notebook-wide pandas display settings: allow up to 2000 rows to be shown
# and print every float with a fixed four-decimal precision.
for _option, _setting in (
    ("display.max_rows", 2000),
    ('display.float_format', _four_decimals),
):
    pd.set_option(_option, _setting)

In [4]:
# Load scikit-learn's bundled iris dataset; `iris.data` / `iris.target` are
# used as features / labels by every experiment below.
iris = datasets.load_iris()

### Without pipeline

#### Classification

In [5]:
# Search space for the classifier switcher: each sub-grid names the estimator
# to plug in (`estimator`) and the hyper-parameters to vary on that estimator
# (`estimator__<param>`).
# NOTE(review): neither estimator is given a random_state, so scores vary
# between runs.
rf_clf_grid = {
    'estimator': [RandomForestClassifier()],
    'estimator__n_estimators': [150, 200],
    'estimator__max_depth': [2, 3],
}
sgd_clf_grid = {
    'estimator': [SGDClassifier()],
    'estimator__alpha': (1e-2, 1e-3, 1e-1),
}
parameters = [rf_clf_grid, sgd_clf_grid]

In [6]:
# Exhaustive search over `parameters` with 3-fold cross-validation,
# three parallel workers and verbose progress logging.
gs = GridSearchCV(
    estimator=ClfSwitcher(),
    param_grid=parameters,
    cv=3,
    n_jobs=3,
    verbose=10,
)
# Kept as the last expression so the fitted search object is displayed.
gs.fit(iris.data, iris.target)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    1.7s
[Parallel(n_jobs=3)]: Done  19 out of  21 | elapsed:    1.8s remaining:    0.1s
[Parallel(n_jobs=3)]: Done  21 out of  21 | elapsed:    1.8s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=ClfSwitcher(estimator=RandomForestClassifier(bootstrap=True,
                                                                    class_weight=None,
                                                                    criterion='gini',
                                                                    max_depth=None,
                                                                    max_features='auto',
                                                                    max_leaf_nodes=None,
                                                                    min_impurity_decrease=0.0,
                                                                    min_impurity_split=None,
                                                                    min_samples_leaf=1,
                                                                    min_samples_split=2,
                                                                

In [7]:
# Turn the classifier grid-search results into a DataFrame (one row per candidate).
cv_res = pd.DataFrame(gs.cv_results_)

In [8]:
# Display the per-candidate timings, parameters and fold scores.
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__max_depth,param_estimator__n_estimators,param_estimator__alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0957,0.0064,0.0156,0.0,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,150.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9608,0.9412,0.9792,0.96,0.0154,1
1,0.1497,0.0154,0.0128,0.0038,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,200.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9608,0.9412,0.9792,0.96,0.0154,1
2,0.0938,0.0038,0.0101,0.0075,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,150.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9804,0.9216,0.9792,0.96,0.0276,1
3,0.1386,0.0127,0.0138,0.0051,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,200.0,,{'estimator': (DecisionTreeClassifier(class_we...,0.9804,0.9216,0.9792,0.96,0.0276,1
4,0.0027,0.0038,0.0007,0.001,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.01,"{'estimator': SGDClassifier(alpha=0.0001, aver...",0.9216,0.8824,0.7083,0.84,0.0918,6
5,0.0007,0.001,0.0034,0.0048,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.001,"{'estimator': SGDClassifier(alpha=0.0001, aver...",0.7059,0.6863,0.9583,0.78,0.1226,7
6,0.0027,0.0038,0.0,0.0,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.1,"{'estimator': SGDClassifier(alpha=0.0001, aver...",0.8627,0.7451,1.0,0.8667,0.1035,5


#### Regression

In [9]:
# Search space for the regressor switcher; same layout as the classifier grid:
# `estimator` picks the model, `estimator__<param>` tunes it.
# NOTE(review): the random forest is not given a random_state, so scores vary
# between runs.
rf_reg_grid = {
    'estimator': [RandomForestRegressor()],
    'estimator__n_estimators': [150, 200],
    'estimator__max_depth': [2, 3],
}
linear_reg_grid = {
    'estimator': [LinearRegression()],
    'estimator__fit_intercept': [True, False],
}
parameters = [rf_reg_grid, linear_reg_grid]

In [10]:
# Same 3-fold search as for the classifiers, but over regressors and scored
# with negated mean squared error (closer to zero is better).
gs = GridSearchCV(
    estimator=RegSwitcher(),
    param_grid=parameters,
    cv=3,
    n_jobs=3,
    verbose=10,
    scoring='neg_mean_squared_error',
)
# The iris class labels are used as the numeric regression target here.
gs.fit(iris.data, iris.target)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0859s.) Setting batch_size=4.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done   5 out of  18 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=3)]: Done  18 out of  18 | elapsed:    0.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RegSwitcher(estimator=RandomForestRegressor(bootstrap=True,
                                                                   criterion='mse',
                                                                   max_depth=None,
                                                                   max_features='auto',
                                                                   max_leaf_nodes=None,
                                                                   min_impurity_decrease=0.0,
                                                                   min_impurity_split=None,
                                                                   min_samples_leaf=1,
                                                                   min_samples_split=2,
                                                                   min_weight_fraction_leaf=0.0,
                                                                

In [11]:
# Turn the regressor grid-search results into a DataFrame (one row per candidate).
cv_res = pd.DataFrame(gs.cv_results_)

In [12]:
# Display the regression results; test scores are negated MSE values.
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__max_depth,param_estimator__n_estimators,param_estimator__fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0631,0.0065,0.0105,0.0074,"RandomForestRegressor(bootstrap=True, criterio...",2.0,150.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.0806,-0.8468,-1.0,-0.9758,0.097,5
1,0.1352,0.0067,0.0137,0.0012,"RandomForestRegressor(bootstrap=True, criterio...",2.0,200.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.0698,-0.835,-1.0,-0.9683,0.0984,3
2,0.0794,0.0033,0.0069,0.0065,"RandomForestRegressor(bootstrap=True, criterio...",3.0,150.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.0725,-0.8371,-1.0,-0.9699,0.0985,4
3,0.0997,0.0184,0.002,0.0028,"RandomForestRegressor(bootstrap=True, criterio...",3.0,200.0,,{'estimator': RandomForestRegressor(bootstrap=...,-1.1182,-0.8301,-1.0,-0.9827,0.1182,6
4,0.002,0.0022,0.0007,0.0005,"LinearRegression(copy_X=True, fit_intercept=Fa...",,,True,"{'estimator': LinearRegression(copy_X=True, fi...",-0.5557,-0.1684,-0.3833,-0.3692,0.1584,2
5,0.0003,0.0005,0.0003,0.0005,"LinearRegression(copy_X=True, fit_intercept=Fa...",,,False,"{'estimator': LinearRegression(copy_X=True, fi...",-0.4434,-0.1488,-0.373,-0.3217,0.1256,1


In [13]:
# The switcher can also be used on its own, outside a grid search:
# wrap a default random forest regressor and fit it on the full iris data.
model = RegSwitcher(
    estimator=RandomForestRegressor(),
)
model.fit(iris.data, iris.target)



RegSwitcher(estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                            max_depth=None, max_features='auto',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=10, n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False))

In [14]:
# Score on the very same data the model was fitted on in the previous cell,
# so this is a training score, not a held-out estimate.
model.score(iris.data, iris.target)

0.9889

### With pipeline

In [15]:
# Classifier search space again, this time addressed through the pipeline
# step named 'clf' (hence the extra `clf__` prefix on every key).
# NOTE(review): neither estimator is given a random_state, so scores vary
# between runs.
rf_clf_step_grid = {
    'clf__estimator': [RandomForestClassifier()],
    'clf__estimator__n_estimators': [150, 200],
    'clf__estimator__max_depth': [2, 3],
}
sgd_clf_step_grid = {
    'clf__estimator': [SGDClassifier()],
    'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
}
parameters = [rf_clf_step_grid, sgd_clf_step_grid]

In [16]:
# Single-step pipeline whose only step, 'clf', is the classifier switcher.
pipeline = Pipeline(
    steps=[
        ('clf', ClfSwitcher()),
    ]
)

In [17]:
# 3-fold grid search over the pipeline, three parallel workers, default scorer.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=3,
)
gs.fit(iris.data, iris.target)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        ClfSwitcher(estimator=RandomForestClassifier(bootstrap=True,
                                                                                     class_weight=None,
                                                                                     criterion='gini',
                                                                                     max_depth=None,
                                                                                     max_features='auto',
                                                                                     max_leaf_nodes=None,
                                                                                     min_impurity_decrease=0.0,
                                                                                     min_impurity_split=None,
             

In [18]:
# Show the best-ranked pipeline found by the search.
gs.best_estimator_

Pipeline(memory=None,
         steps=[('clf',
                 ClfSwitcher(estimator=RandomForestClassifier(bootstrap=True,
                                                              class_weight=None,
                                                              criterion='gini',
                                                              max_depth=2,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split=None,
                                                              min_samples_leaf=1,
                                                              min_samples_split=2,
                                                              min_weight_fraction_leaf=0.0,
                                         

In [19]:
# Turn the pipeline grid-search results into a DataFrame (one row per candidate).
cv_res = pd.DataFrame(gs.cv_results_)

In [20]:
# Display the results; parameter columns now carry the `clf__` step prefix.
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__estimator,param_clf__estimator__max_depth,param_clf__estimator__n_estimators,param_clf__estimator__alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0853,0.0016,0.0157,0.0,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,150.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9608,0.902,0.9792,0.9467,0.0329,4
1,0.1211,0.0074,0.0104,0.0074,"(DecisionTreeClassifier(class_weight=None, cri...",2.0,200.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9608,0.9412,0.9792,0.96,0.0154,1
2,0.0919,0.0102,0.0138,0.0026,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,150.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9804,0.9216,0.9792,0.96,0.0276,1
3,0.1254,0.0055,0.0093,0.0056,"(DecisionTreeClassifier(class_weight=None, cri...",3.0,200.0,,{'clf__estimator': (DecisionTreeClassifier(cla...,0.9804,0.9216,0.9792,0.96,0.0276,1
4,0.0,0.0,0.0,0.0,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.01,"{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.9412,0.7647,0.9167,0.8733,0.0786,5
5,0.0,0.0,0.0,0.0,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.001,"{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.7843,0.7843,0.7917,0.7867,0.0034,6
6,0.0027,0.0038,0.0,0.0,"SGDClassifier(alpha=0.0001, average=False, cla...",,,0.1,"{'clf__estimator': SGDClassifier(alpha=0.0001,...",0.7451,0.6667,0.6667,0.6933,0.0372,7


### Testing with homemade feature selector

In [21]:
import os
import sys

In [22]:
# Set the root path of the project: the parent of the current working
# directory (the notebook is expected to run from a sub-folder of the project).
PROJECT_ROOT_PATH = os.path.dirname(os.path.abspath(os.getcwd()))

# Make the project root and its `ds_toolbox` package importable.
# os.path.join picks the right separator on every OS; the former
# '\ds_toolbox' literal only worked on Windows and held an invalid '\d' escape.
for _path in (PROJECT_ROOT_PATH, os.path.join(PROJECT_ROOT_PATH, 'ds_toolbox')):
    if _path not in sys.path:  # re-running the cell must not pile up duplicates
        sys.path.append(_path)

In [23]:
from feature_selection.feat_selector import *

In [30]:
# Feature column names, taken from the dataset's own `feature_names` entry.
model_cols = iris['feature_names']

In [25]:
# One frame holding the feature columns followed by a 'target' column.
# Built with pandas only: `np` is never imported explicitly in this notebook
# (it presumably leaked in through the star import above), so the former
# np.c_ call was a hidden dependency.
df = pd.DataFrame(iris['data'], columns=model_cols).assign(
    # np.c_ used to upcast the integer labels to float; keep that dtype.
    target=iris['target'].astype(float)
)

In [26]:
# Peek at the first rows to check the column layout.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [27]:
# Hold out 20% of the rows (sampled without replacement) as the test set;
# every remaining row goes to the training set.
# A fixed random_state keeps the split identical across Restart & Run All.
test = df.sample(frac=0.2, replace=False, random_state=42)
train = df[~df.index.isin(test.index)]

# Sanity check: the two subsets partition the original frame.
assert len(train) + len(test) == len(df)

In [28]:
# Read the list of strategy names from a throwaway default selector instance.
# NOTE(review): `_available_strategies` is a private attribute of RegFeatureSelector.
available_strategies=RegFeatureSelector()._available_strategies

In [31]:
# Try every selection strategy on the training split and report which
# feature columns each one keeps.
for strategy in available_strategies:
    print('\nStrategy=', strategy)

    # Fit a fresh selector for this strategy and apply it to the features.
    fs = RegFeatureSelector(strategy=strategy)
    X_adj = fs.fit_transform(train[model_cols], train['target'])

    # Report how many columns survived, and which ones (sorted for readability).
    selected_cols = list(X_adj.columns)
    print('selected_cols=', len(selected_cols), sorted(selected_cols))


Strategy= variance
selected_cols= 3 ['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']

Strategy= l1
selected_cols= 0 []

Strategy= rf_feature_importance
selected_cols= 3 ['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']

Strategy= rf_top_features




selected_cols= 4 ['petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']

Strategy= stepwise
selected_cols= 3 ['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']


  return ptp(axis=axis, out=out, **kwargs)


#### Connecting model selector and feature selector using pipelines

In [33]:
# Show the strategy names that can be passed to the selector.
available_strategies

['variance', 'l1', 'rf_feature_importance', 'rf_top_features', 'stepwise']

In [34]:
# Two-step pipeline: feature selection first, then the regressor switcher.
# The step names are the prefixes used in the parameter grid.
pipeline = Pipeline(
    steps=[
        ('feature_selector', RegFeatureSelector()),
        ('model', RegSwitcher()),
    ]
)

In [45]:
# Selection strategies searched for both model families. 'l1' is left out of
# the search (it kept zero columns in the strategy loop above).
selector_strategies = ['variance', 'rf_feature_importance', 'rf_top_features', 'stepwise']

# Joint search space over the 'feature_selector' and 'model' pipeline steps.
rf_pipeline_grid = {
    'model__estimator': [RandomForestRegressor()],
    'model__estimator__n_estimators': [150, 200],
    'model__estimator__max_depth': [2, 3],
    'feature_selector__strategy': selector_strategies,
}
linear_pipeline_grid = {
    'model__estimator': [LinearRegression()],
    'model__estimator__fit_intercept': [True, False],
    'feature_selector__strategy': selector_strategies,
}
parameters = [rf_pipeline_grid, linear_pipeline_grid]

In [46]:
# 3-fold search over selector strategy x model, scored with negated MSE.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=3,
    scoring='neg_mean_squared_error',
)
# NOTE(review): this fits on the full `df`, not on the `train` split created
# earlier, so the held-out `test` rows take part in the search - confirm intent.
gs.fit(df[model_cols], df['target'])

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('feature_selector',
                                        <feature_selection.feat_selector.RegFeatureSelector object at 0x000001DBCD80CC88>),
                                       ('model',
                                        RegSwitcher(estimator=RandomForestRegressor(bootstrap=True,
                                                                                    criterion='mse',
                                                                                    max_depth=None,
                                                                                    max_features='auto',
                                                                                    max_leaf_nodes=None,
                                                                                    min_impurity_decrease=0...
                          'model__estimator__n_e

In [47]:
# Show the best-ranked selector + model pipeline found by the search.
gs.best_estimator_

Pipeline(memory=None,
         steps=[('feature_selector',
                 <feature_selection.feat_selector.RegFeatureSelector object at 0x000001DBCDE5C708>),
                ('model',
                 RegSwitcher(estimator=LinearRegression(copy_X=True,
                                                        fit_intercept=False,
                                                        n_jobs=None,
                                                        normalize=False)))],
         verbose=False)

In [48]:
# Turn the selector + model grid-search results into a DataFrame (one row per candidate).
cv_res = pd.DataFrame(gs.cv_results_)

In [49]:
# Display the results for every strategy / model combination (negated MSE scores).
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feature_selector__strategy,param_model__estimator,param_model__estimator__max_depth,param_model__estimator__n_estimators,param_model__estimator__fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0647,0.0031,0.0133,0.01,variance,"RandomForestRegressor(bootstrap=True, criterio...",2.0,150.0,,"{'feature_selector__strategy': 'variance', 'mo...",-1.0732,-0.833,-1.0,-0.9687,0.1005,17
1,0.0825,0.001,0.0101,0.0001,variance,"RandomForestRegressor(bootstrap=True, criterio...",2.0,200.0,,"{'feature_selector__strategy': 'variance', 'mo...",-1.0627,-0.8382,-1.0,-0.967,0.0946,16
2,0.0799,0.0074,0.0074,0.0038,variance,"RandomForestRegressor(bootstrap=True, criterio...",3.0,150.0,,"{'feature_selector__strategy': 'variance', 'mo...",-1.0753,-0.8437,-1.0,-0.973,0.0965,19
3,0.1074,0.0044,0.0041,0.0044,variance,"RandomForestRegressor(bootstrap=True, criterio...",3.0,200.0,,"{'feature_selector__strategy': 'variance', 'mo...",-1.1084,-0.8361,-1.0,-0.9815,0.1119,23
4,0.3209,0.0211,0.0052,0.0074,rf_feature_importance,"RandomForestRegressor(bootstrap=True, criterio...",2.0,150.0,,{'feature_selector__strategy': 'rf_feature_imp...,-1.0098,-0.8267,-1.0,-0.9455,0.0841,10
5,0.3446,0.0135,0.0074,0.0064,rf_feature_importance,"RandomForestRegressor(bootstrap=True, criterio...",2.0,200.0,,{'feature_selector__strategy': 'rf_feature_imp...,-1.0116,-0.832,-1.0,-0.9479,0.0821,12
6,0.3163,0.0128,0.0104,0.0074,rf_feature_importance,"RandomForestRegressor(bootstrap=True, criterio...",3.0,150.0,,{'feature_selector__strategy': 'rf_feature_imp...,-1.002,-0.8368,-1.0,-0.9463,0.0774,11
7,0.3416,0.0093,0.0052,0.0074,rf_feature_importance,"RandomForestRegressor(bootstrap=True, criterio...",3.0,200.0,,{'feature_selector__strategy': 'rf_feature_imp...,-1.0032,-0.83,-1.0,-0.9444,0.0809,9
8,0.3551,0.0105,0.0052,0.0074,rf_top_features,"RandomForestRegressor(bootstrap=True, criterio...",2.0,150.0,,{'feature_selector__strategy': 'rf_top_feature...,-1.0662,-0.83,-1.0,-0.9654,0.0995,15
9,0.4167,0.0128,0.0052,0.0074,rf_top_features,"RandomForestRegressor(bootstrap=True, criterio...",2.0,200.0,,{'feature_selector__strategy': 'rf_top_feature...,-1.0483,-0.8257,-1.0,-0.958,0.0956,13
