In [9]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import *

In [10]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# Feature selection for regression

In [11]:
from sklearn.datasets import load_boston


X, y = load_boston(return_X_y=True)
X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(X,y)

print(X_tr.shape, X_te.shape)

(379, 13) (127, 13)


In [12]:
scorer_mse = sklearn.metrics.mean_squared_error

## Univariate Feautre selection for regression

In [13]:
from sklearn.feature_selection import f_regression

In [14]:
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression, k=10).fit(X_tr, y_tr)
select_k_fregression.transform(X_tr).shape

(379, 10)

First let us try with a model without feature selection

In [15]:
model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[250,250])
model.fit(X_tr, y_tr)
tr_score = scorer_mse(model.predict(X_tr), y_tr)
te_score = scorer_mse(model.predict(X_te), y_te)

print(f"No Feature Selection -> train mse: {tr_score}, test mse: {te_score}")

No Feature Selection -> train mse: 19.652610098868312, test mse: 28.437278050507192


Now let us select some of the columns

In [16]:
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression, k=12).fit(X_tr, y_tr)

X_tr_new = select_k_fregression.transform(X_tr)
X_te_new = select_k_fregression.transform(X_te)

model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[250,250])

model.fit(X_tr_new, y_tr)
tr_score = scorer_mse(model.predict(X_tr_new), y_tr)
te_score = scorer_mse(model.predict(X_te_new), y_te)

print(f"Feature Selection -> train mse: {tr_score}, test mse: {te_score}")

Feature Selection -> train mse: 18.685450536300294, test mse: 23.730383531118655


Notice that the results in train improved but not in test.

How can we select the best `k` value ?  We can simply try many

In [29]:
from sklearn.metrics import make_scorer as make_scorer

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true))  * 100

scoring_dict = {"MSE":  make_scorer(sklearn.metrics.mean_squared_error),
                "MAE":  make_scorer(sklearn.metrics.mean_absolute_error),
                "MAPE": make_scorer(mean_absolute_percentage_error),
                "R2":   make_scorer(sklearn.metrics.r2_score)}

In [30]:
model

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=[250, 250], learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=200,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [31]:
select_k_fregression = sklearn.feature_selection.SelectKBest(f_regression)
model = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=[250,250])
n_features = X_tr.shape[1]

pipe_model = sklearn.pipeline.Pipeline([("feature_selector", select_k_fregression), 
                                         ("model", model)])

pipe_grid = {"feature_selector__k":list(range(2,13)),
             "model__hidden_layer_sizes":[[250,250]]}

pipe_gridsearchcv = sklearn.model_selection.GridSearchCV(estimator=pipe_model,
                                                         cv=5,
                                                         param_grid=pipe_grid,
                                                         scoring=scoring_dict,
                                                         refit="R2",
                                                         return_train_score=True)

pipe_gridsearchcv.fit(X_tr, y_tr)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('feature_selector',
                                        SelectKBest(k=10,
                                                    score_func=<function f_regression at 0x11e93de18>)),
                                       ('model',
                                        MLPRegressor(activation='relu',
                                                     alpha=0.0001,
                                                     batch_size='auto',
                                                     beta_1=0.9, beta_2=0.999,
                                                     early_stopping=False,
                                                     epsilon=1e-08,
                                                     hidden_layer_sizes=[250,
                                                                         250],
                                                     l

In [52]:
def result_df(gridsearchcv, target_sortby="mean_test_score", metrics=[]):

    if len(metrics)==0:
        df = pd.concat([
                     pd.DataFrame(gridsearchcv.cv_results_["params"]),
                     pd.DataFrame({target_sortby: gridsearchcv.cv_results_[target_sortby]})
                  ],axis=1)
    else:
        # do this properly
        df = pd.concat([
                       pd.DataFrame(gridsearchcv.cv_results_["params"]),
                       pd.DataFrame(gridsearchcv.cv_results_[metrics]),
                     pd.DataFrame({target_sortby: gridsearchcv.cv_results_[target_sortby]})
                  ],axis=1)
        
    df.sort_values(by=target_sortby, ascending=False, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df


res_df = result_df(pipe_gridsearchcv, target_sortby="mean_test_R2")

#res_df = res_df.sort_values(by="mean_test_score", ascending=False)

In [51]:
res_df

Unnamed: 0,feature_selector__k,model__hidden_layer_sizes,mean_test_R2
0,4,"[250, 250]",0.709597
1,3,"[250, 250]",0.688117
2,2,"[250, 250]",0.661144
3,5,"[250, 250]",0.610854
4,12,"[250, 250]",0.5992
5,9,"[250, 250]",0.593779
6,11,"[250, 250]",0.593178
7,6,"[250, 250]",0.589985
8,8,"[250, 250]",0.557692
9,7,"[250, 250]",0.539281


we can see that the best result was achieved with 3 features

In [44]:
tr_score = scorer_mse(pipe_gridsearchcv.predict(X_tr), y_tr)
te_score = scorer_mse(pipe_gridsearchcv.predict(X_te), y_te)

print(f"Feature Selection pipe -> train mse: {tr_score}, test mse: {te_score}")

Feature Selection pipe -> train mse: 14.970142487665388, test mse: 12.820463022240668
