In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import *

In [2]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (drop-in stand-in for ``warnings.warn``)."""
    return None


# Replace the library entry point so that code calling ``warnings.warn`` stays quiet.
warnings.warn = warn

# Feature selection for regression

In [3]:
from sklearn.datasets import load_boston


# Load the dataset as a (features, target) pair of arrays.
X, y = load_boston(return_X_y=True)
# No random_state is passed here, so the train/test partition (and therefore every
# score computed downstream) changes from run to run.
X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(X,y)

print(X_tr.shape, X_te.shape)

(379, 13) (127, 13)


In [4]:
# Short alias for the mean-squared-error metric used to score the models below.
scorer_mse = sklearn.metrics.mean_squared_error

## Univariate Feature selection for regression

In [5]:
from sklearn.feature_selection import f_regression

In [6]:
# Fit a univariate selector that keeps the 10 columns scored best by f_regression.
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression, k=10).fit(X_tr, y_tr)
# Displayed to check that the transformed training matrix has 10 columns.
select_k_fregression.transform(X_tr).shape

(379, 10)

First let us try with a model without feature selection

In [7]:
# Baseline: an MLP with two hidden layers of 250 units, trained on every feature.
model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[250,250])
model.fit(X_tr, y_tr)
# sklearn metrics take (y_true, y_pred). MSE is symmetric, so the value is the same
# either way, but keeping the documented order means the alias can be swapped for
# an asymmetric metric (e.g. R2) without silently changing its meaning.
tr_score = scorer_mse(y_tr, model.predict(X_tr))
te_score = scorer_mse(y_te, model.predict(X_te))

print(f"No Feature Selection -> train mse: {tr_score}, test mse: {te_score}")

No Feature Selection -> train mse: 22.95567346030943, test mse: 23.500933759073195


Now let us select some of the columns

In [8]:
# Keep the 12 columns scored best by f_regression (fitted on the training split only).
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression, k=12).fit(X_tr, y_tr)

# Apply the same fitted column selection to both splits.
X_tr_new = select_k_fregression.transform(X_tr)
X_te_new = select_k_fregression.transform(X_te)

model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[250,250])

model.fit(X_tr_new, y_tr)
# sklearn metrics take (y_true, y_pred); MSE is symmetric so the value is unchanged,
# but the documented order stays correct if the metric is ever replaced.
tr_score = scorer_mse(y_tr, model.predict(X_tr_new))
te_score = scorer_mse(y_te, model.predict(X_te_new))

print(f"Feature Selection -> train mse: {tr_score}, test mse: {te_score}")

Feature Selection -> train mse: 17.50010416563155, test mse: 18.741457965819734


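Notice that in the run shown above both the train and the test MSE improved. Because neither the train/test split nor the network initialisation is seeded, these numbers change from run to run, so a single comparison like this is not conclusive.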

How can we select the best `k` value? We can simply try many values and compare them with cross-validation.

In [9]:
from sklearn.metrics import make_scorer as make_scorer

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences.

    Each error is taken relative to the corresponding true value, so the
    result is expressed in percent (0 means a perfect prediction).

    Parameters
    ----------
    y_true : array-like
        Reference values; used as the denominator of each relative error.
    y_pred : array-like
        Predicted values, element-wise aligned with ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return relative_errors.mean() * 100

# MSE, MAE and MAPE are "lower is better" metrics. Passing greater_is_better=False
# makes sklearn negate them, so that for every entry a larger score means a better
# model: cv_results_ then reports e.g. mean_test_MSE as a negative number and the
# rank_test_* columns order candidates from best to worst. R2 is already
# "higher is better" and keeps the default.
scoring_dict = {"MSE":  make_scorer(sklearn.metrics.mean_squared_error, greater_is_better=False),
                "MAE":  make_scorer(sklearn.metrics.mean_absolute_error, greater_is_better=False),
                "MAPE": make_scorer(mean_absolute_percentage_error, greater_is_better=False),
                "R2":   make_scorer(sklearn.metrics.r2_score)}



In [10]:
model

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=[250, 250], learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
# Selector and model are left unconfigured here; the grid below sets their parameters.
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression)
model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[250,250])
n_features = X_tr.shape[1]

# Selection happens inside the pipeline, so it is re-fitted on each CV training fold.
pipe_model = sklearn.pipeline.Pipeline([("feature_selector", select_k_fregression),
                                         ("model", model)])

# Try every k from 2 up to n_features - 1 (range() excludes its stop value, so
# "keep every feature" is not among the candidates).
pipe_grid = {"feature_selector__k":list(range(2, n_features)),
             "model__hidden_layer_sizes":[[250,250]]}

# Several metrics are recorded, but the final model is chosen and refitted on R2.
pipe_gridsearchcv = sklearn.model_selection.GridSearchCV(estimator=pipe_model,
                                                         cv=5,
                                                         param_grid=pipe_grid,
                                                         scoring=scoring_dict,
                                                         refit="R2",
                                                         return_train_score=True)

pipe_gridsearchcv.fit(X_tr, y_tr)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('feature_selector',
                                        SelectKBest(k=10,
                                                    score_func=<function f_regression at 0x1a1ee97488>)),
                                       ('model',
                                        MLPRegressor(activation='relu',
                                                     alpha=0.0001,
                                                     batch_size='auto',
                                                     beta_1=0.9, beta_2=0.999,
                                                     early_stopping=False,
                                                     epsilon=1e-08,
                                                     hidden_layer_sizes=[250,
                                                                         250],
                                        

In [12]:
def result_df(gridsearchcv, target_sortby="mean_test_score", metrics=None):
    """Summarise a fitted grid search as a DataFrame sorted by one score column.

    Parameters
    ----------
    gridsearchcv : object
        Any object exposing a ``cv_results_`` mapping that contains a
        ``"params"`` list of dicts plus one array-like per result key.
    target_sortby : str
        Key of ``cv_results_`` used to sort the rows (descending).
    metrics : iterable of str, str or None
        Extra ``cv_results_`` keys to include as columns. ``None`` or an
        empty iterable adds no extra columns.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination: the parameter columns first, then
        the requested ``metrics``, then ``target_sortby``. Rows are sorted by
        ``target_sortby`` from highest to lowest and re-indexed from 0.

    Raises
    ------
    KeyError
        If ``target_sortby`` or one of ``metrics`` is not in ``cv_results_``.
    """
    cv_results = gridsearchcv.cv_results_

    if metrics is None:
        extra_metrics = []
    elif isinstance(metrics, str):
        # A bare string would otherwise be iterated character by character.
        extra_metrics = [metrics]
    else:
        extra_metrics = list(metrics)

    # cv_results_ is a plain dict, so each column is looked up by its own key.
    # The sort column goes last and is not repeated if it was also requested.
    column_names = [name for name in extra_metrics if name != target_sortby]
    column_names.append(target_sortby)
    score_columns = {name: cv_results[name] for name in column_names}

    df = pd.concat(
        [pd.DataFrame(cv_results["params"]), pd.DataFrame(score_columns)],
        axis=1,
    )
    return df.sort_values(by=target_sortby, ascending=False).reset_index(drop=True)


res_df = result_df(pipe_gridsearchcv, target_sortby="mean_test_R2")

#res_df = res_df.sort_values(by="mean_test_score", ascending=False)

In [13]:
res_df

Unnamed: 0,feature_selector__k,model__hidden_layer_sizes,mean_test_R2
0,4,"[250, 250]",0.738731
1,3,"[250, 250]",0.724248
2,2,"[250, 250]",0.706576
3,6,"[250, 250]",0.628101
4,11,"[250, 250]",0.621085
5,12,"[250, 250]",0.618198
6,7,"[250, 250]",0.612776
7,8,"[250, 250]",0.591045
8,10,"[250, 250]",0.589061
9,5,"[250, 250]",0.564396


We can see that the best result was achieved with 4 features.

In [14]:
# GridSearchCV.predict uses the pipeline refitted with the best parameters.
# sklearn metrics take (y_true, y_pred); MSE is symmetric so the value is unchanged,
# but the documented order stays correct if the metric is ever replaced.
tr_score = scorer_mse(y_tr, pipe_gridsearchcv.predict(X_tr))
te_score = scorer_mse(y_te, pipe_gridsearchcv.predict(X_te))

print(f"Feature Selection pipe -> train mse: {tr_score}, test mse: {te_score}")

Feature Selection pipe -> train mse: 15.238384898850386, test mse: 13.322557651609241


# Performing feature selection on a subset of features

Let us consider a use case where we have to include some features from the data but we have the freedom to use or not use the other features.


For example, let us consider we have to include features 0 and 1 but we might want to do feature selection in the features 2 to 12. Then we can do the following. We first define a list of optional features.

```
optional_features = list(range(2,13))
```

Then we create a `sklearn.compose.ColumnTransformer` that applies the feature selection only to the columns in `optional_features`.



In [50]:
optional_features = list(range(2,13))
optional_features

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [51]:
X_tr_df = pd.DataFrame(X_tr)
X_tr_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77
1,0.19073,22.0,5.86,0.0,0.431,6.718,17.5,7.8265,7.0,330.0,19.1,393.74,6.56
2,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59
3,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26
4,0.25915,0.0,21.89,0.0,0.624,5.693,96.0,1.7883,4.0,437.0,21.2,392.11,17.19


In [67]:
from sklearn.compose import ColumnTransformer

# Selector that keeps the 3 columns scored best by f_regression among those it is given.
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression,k=3)

# The selector is applied only to the columns listed in optional_features;
# remainder='passthrough' forwards every other column unchanged.
column_trans = ColumnTransformer(
    [('feature_selection_subset', select_k_fregression, optional_features)],
    remainder='passthrough')

In [123]:
column_trans.fit(X_tr_df, y_tr)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('select_k',
                                 SelectKBest(k=10,
                                             score_func=<function f_regression at 0x1a1ee97488>),
                                 [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])],
                  verbose=False)

In [76]:
X_df_tr_transformed = column_trans.transform(X_tr_df)
X_tr_df.shape, X_df_tr_transformed.shape

((379, 13), (379, 5))

In [78]:
?column_trans.transform

Notice that `X_df_tr_transformed` has 5 columns even though we set `select_k_fregression` with k=3.
This is precisely what we want, since we have selected 3 columns from `optional_features`, leaving the 2 remaining columns intact.

Notice that columns 0 and 1 have now become columns 3 and 4 respectively, and the data in those columns remains intact because the ColumnTransformer has `remainder='passthrough'`.

In [75]:
pd.DataFrame(X_df_tr_transformed).head()

Unnamed: 0,0,1,2,3,4
0,6.383,17.3,5.77,0.02055,85.0
1,6.718,19.1,6.56,0.19073,22.0
2,6.438,19.1,3.59,0.21409,22.0
3,5.613,19.1,27.26,0.38735,0.0
4,5.693,21.2,17.19,0.25915,0.0


# Performing feature selection on a subset of features inside a Pipeline.


Let us consider we want to train a model (and the hyperparameters of the model) with the following requirement:

- All the features in the input dataframe have to be used, with the exception of the columns in `optional_features`, which feature selection is allowed to drop.

In the example from the first section in this notebook we defined the `pipe_grid` as follows:

```
pipe_model = sklearn.pipeline.Pipeline([("feature_selector", select_k_fregression), 
                                        ("model", model)])
                                         
pipe_grid = {"feature_selector__k":list(range(2,13)),
             "model__hidden_layer_sizes":[[250,250]]}
```

With this definition the feature selection can choose to omit any feature. Therefore, it does not meet the requirement that we specified before.

We can restrict the feature selection to the set of `optional_features` by using a `ColumnTransformer`, passing `optional_features` as its columns and setting `remainder='passthrough'` to make sure that all the features that are not in `optional_features` are kept.


```
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression,k=3)

column_trans = ColumnTransformer(
    [('feature_selection_subset', select_k_fregression, optional_features)],
    remainder='passthrough')
 ```

In [140]:
# Columns the selector is allowed to drop; every other column is always kept.
# Defined first so this cell does not rely on a value left over from an earlier cell.
optional_features = list(range(2,13))

# put the feature selection inside a ColumnTransformer with the optional_features
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression)
column_trans = ColumnTransformer([('select_k', select_k_fregression, optional_features)],
                                   remainder='passthrough')

model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[250,250])

pipe_model = sklearn.pipeline.Pipeline([("featselect", column_trans), 
                                         ("model", model)])

# This grid tests k from 3 up to len(optional_features) - 1: range() excludes its
# stop value, so "keep every optional feature" is not one of the candidates.
pipe_grid = {"featselect__select_k__k":list(range(3,len(optional_features))),
             "model__hidden_layer_sizes":[[250,250]]}

# Several metrics are recorded, but the final model is chosen and refitted on R2.
pipe_gridsearchcv = sklearn.model_selection.GridSearchCV(estimator=pipe_model,
                                                         cv=5,
                                                         param_grid=pipe_grid,
                                                         scoring=scoring_dict,
                                                         refit="R2",
                                                         return_train_score=True)


In [149]:
pipe_grid["featselect__select_k__k"]

[3, 4, 5, 6, 7, 8, 9, 10]

In [145]:
X_tr_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77
1,0.19073,22.0,5.86,0.0,0.431,6.718,17.5,7.8265,7.0,330.0,19.1,393.74,6.56
2,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59
3,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26
4,0.25915,0.0,21.89,0.0,0.624,5.693,96.0,1.7883,4.0,437.0,21.2,392.11,17.19


In [146]:
# Show the (name, estimator) pairs of the pipeline wrapped by the grid search.
pipe_gridsearchcv.estimator.steps

[('featselect',
  ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                    transformer_weights=None,
                    transformers=[('select_k',
                                   SelectKBest(k=10,
                                               score_func=<function f_regression at 0x1a1ee97488>),
                                   [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])],
                    verbose=False)),
 ('model',
  MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=[250, 250], learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='adam', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False))]

In [143]:
pipe_gridsearchcv.fit(X_tr_df, y_tr)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('featselect',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('select_k',
                                                                         SelectKBest(k=10,
                                                                                     score_func=<function f_regression at 0x1a1ee97488>),
                                                                         [2, 3,
                                                                          4, 5,
                                                         

In [144]:
# GridSearchCV.predict uses the pipeline refitted with the best parameters.
# sklearn metrics take (y_true, y_pred); MSE is symmetric so the value is unchanged,
# but the documented order stays correct if the metric is ever replaced.
tr_score = scorer_mse(y_tr, pipe_gridsearchcv.predict(X_tr))
te_score = scorer_mse(y_te, pipe_gridsearchcv.predict(X_te))

print(f"Feature Selection pipe -> train mse: {tr_score}, test mse: {te_score}")

Feature Selection pipe -> train mse: 12.768274078151013, test mse: 12.984042753994713


In [125]:
pipe_gridsearchcv.estimator.steps

[('featselect',
  ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                    transformer_weights=None,
                    transformers=[('select_k',
                                   SelectKBest(k=10,
                                               score_func=<function f_regression at 0x1a1ee97488>),
                                   [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])],
                    verbose=False)),
 ('model',
  MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=[250, 250], learning_rate='constant',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='adam', tol=0.0001,
               validation_fraction=0.1, verbose=False, warm_start=False))]

###### Keeping all features even if feature selection is in the pipeline

We can keep all the features (i.e. perform no feature selection) by setting `featselect__select_k__k="all"`.

In [224]:
# put the feature selection inside a ColumnTransformer with the optional_features
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression)
# Only the optional columns go through the selector; the rest are passed through.
column_trans = ColumnTransformer([('select_k', select_k_fregression, optional_features)],
                                   remainder='passthrough')

# NOTE(review): the grid defined in the following cell sets
# model__hidden_layer_sizes itself, so the [200] given here looks like a
# placeholder that the search overrides -- confirm.
model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[200])

# Selector first, then the regressor, so selection is fitted together with the model.
pipe_model = sklearn.pipeline.Pipeline([("featselect", column_trans), 
                                         ("model", model)])

In [225]:

# Single-candidate grid: k="all" makes the selector keep every optional feature,
# so selection stays in the pipeline but drops nothing.
pipe_grid = {"featselect__select_k__k":["all"],
             "model__hidden_layer_sizes":[[20,20]]}

# Several metrics are recorded, but the final model is chosen and refitted on R2.
pipe_gridsearchcv_david = sklearn.model_selection.GridSearchCV(estimator=pipe_model,
                                                         cv=5,
                                                         param_grid=pipe_grid,
                                                         scoring=scoring_dict,
                                                         refit="R2",
                                                         return_train_score=True,
                                                         )

# Fit here so that best_estimator_ (read in the next cell) exists on a fresh
# kernel; the trailing semicolon suppresses the estimator repr.
pipe_gridsearchcv_david.fit(X_tr, y_tr);

In [234]:
X_tr_trans = pipe_gridsearchcv_david.best_estimator_.steps[0][1].transform(X_tr)

In [235]:
pd.DataFrame(X_tr_trans)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.02055,85.0,0.74,0.0,0.410,6.383,35.7,9.1876,2.0,313.0,17.3,396.90,5.77
1,0.19073,22.0,5.86,0.0,0.431,6.718,17.5,7.8265,7.0,330.0,19.1,393.74,6.56
2,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59
3,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26
4,0.25915,0.0,21.89,0.0,0.624,5.693,96.0,1.7883,4.0,437.0,21.2,392.11,17.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,4.54192,0.0,18.10,0.0,0.770,6.398,88.0,2.5182,24.0,666.0,20.2,374.56,7.79
375,0.07896,0.0,12.83,0.0,0.437,6.273,6.0,4.2515,5.0,398.0,18.7,394.92,6.78
376,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52
377,0.26938,0.0,9.90,0.0,0.544,6.266,82.8,3.2628,4.0,304.0,18.4,393.39,7.90


In [236]:
pd.DataFrame(X_tr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.02055,85.0,0.74,0.0,0.410,6.383,35.7,9.1876,2.0,313.0,17.3,396.90,5.77
1,0.19073,22.0,5.86,0.0,0.431,6.718,17.5,7.8265,7.0,330.0,19.1,393.74,6.56
2,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59
3,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26
4,0.25915,0.0,21.89,0.0,0.624,5.693,96.0,1.7883,4.0,437.0,21.2,392.11,17.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,4.54192,0.0,18.10,0.0,0.770,6.398,88.0,2.5182,24.0,666.0,20.2,374.56,7.79
375,0.07896,0.0,12.83,0.0,0.437,6.273,6.0,4.2515,5.0,398.0,18.7,394.92,6.78
376,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52
377,0.26938,0.0,9.90,0.0,0.544,6.266,82.8,3.2628,4.0,304.0,18.4,393.39,7.90
