In [1]:
#import librairies

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,r2_score

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline

from feature_engine.selection import DropConstantFeatures, DropCorrelatedFeatures

from mlxtend.feature_selection import SequentialFeatureSelector as SFS, ExhaustiveFeatureSelector as EFS

In [2]:
# Reading data|
df = pd.read_csv("./precleaned-datasets/dataset_2.csv")
housing = pd.read_csv(
    "./house-prices-advanced-regression-techniques/train.csv")

In [3]:
df.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109
0,4.53271,3.280834,17.982476,4.404259,2.34991,0.603264,2.784655,0.323146,12.009691,0.139346,...,2.079066,6.748819,2.941445,18.360496,17.726613,7.774031,1.473441,1.973832,0.976806,2.541417
1,5.821374,12.098722,13.309151,4.125599,1.045386,1.832035,1.833494,0.70909,8.652883,0.102757,...,2.479789,7.79529,3.55789,17.383378,15.193423,8.263673,1.878108,0.567939,1.018818,1.416433
2,1.938776,7.952752,0.972671,3.459267,1.935782,0.621463,2.338139,0.344948,9.93785,11.691283,...,1.861487,6.130886,3.401064,15.850471,14.620599,6.849776,1.09821,1.959183,1.575493,1.857893
3,6.02069,9.900544,17.869637,4.366715,1.973693,2.026012,2.853025,0.674847,11.816859,0.011151,...,1.340944,7.240058,2.417235,15.194609,13.553772,7.229971,0.835158,2.234482,0.94617,2.700606
4,3.909506,10.576516,0.934191,3.419572,1.871438,3.340811,1.868282,0.439865,13.58562,1.153366,...,2.738095,6.565509,4.341414,15.893832,11.929787,6.954033,1.853364,0.511027,2.599562,0.811364


In [4]:
housing.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
columns = [col for col in housing.columns if housing[col].dtype != "object"]

In [6]:
housing = housing[columns]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(
    labels=["target"], axis=1), df["target"], test_size=0.33, random_state=42)

In [8]:
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(housing.drop(
    labels=["SalePrice"], axis=1), housing["SalePrice"], test_size=0.33, random_state=42)

In [9]:
pipeline = Pipeline([
    ("drop_constante", DropConstantFeatures(tol=0.8)),
    ("drop_correlated", DropCorrelatedFeatures(variables=None,
                                               method='pearson',
                                               threshold=0.8,
                                               missing_values='ignore',
                                               ))
])

In [10]:
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [11]:
pipeline = Pipeline([
    ("drop_constante", DropConstantFeatures(tol=0.8)),
    ("drop_correlated", DropCorrelatedFeatures(variables=None,
                                               method='pearson',
                                               threshold=0.8,
                                               missing_values='ignore',
                                               ))
])

In [12]:
X_train_housing = pipeline.fit_transform(X_train_housing.fillna(0))
X_test_housing = pipeline.transform(X_test_housing)

# Step Forward Feature Selection

Will start by building all possible models containing one features, will select the feature that produces the best performing model, then will try all the models containing this features and another feature and will select the combination that produces the best performing model, and il will repeat the operation until a certain threshold is met (number of features or performance).

Most of the time, the performance will increase until a certain point where the difference is negligible.

## Classification

In [13]:
sfs = SFS(RandomForestClassifier(n_jobs=-1, n_estimators=10),
                  k_features=10,
                  forward=True, #Forward Feature Selection
                  floating=False,
                  verbose=2, #To display the steps
                  scoring="roc_auc",
                  cv=5,
                  n_jobs=-1,
                  pre_dispatch='2*n_jobs',
                  clone_estimator=True,
                  fixed_features=None,
                  )

In [14]:
sfs.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   30.7s finished

[2021-02-12 21:05:16] Features: 1/10 -- score: 0.57854646135123[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  71 out of  71 | elapsed:   17.6s finished

[2021-02-12 21:05:34] Features: 2/10 -- score: 0.6512433846619194[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   15.8s finished

[2021-02-12 21:05:50] Features: 3/10 -- score: 0.6617118350682749[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  69 out

SequentialFeatureSelector(estimator=RandomForestClassifier(n_estimators=10,
                                                           n_jobs=-1),
                          k_features=10, n_jobs=-1, scoring='roc_auc',
                          verbose=2)

In [15]:
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['var_6', 'var_9', 'var_16', 'var_21', 'var_27', 'var_45', 'var_55',
       'var_82', 'var_83', 'var_99'],
      dtype='object')

In [16]:
# function to train random forests and evaluate the performance

def run_randomForests_clf(X_train, X_test, y_train, y_test):

    rf = RandomForestClassifier(
        n_estimators=200, random_state=39, max_depth=4, n_jobs=-1)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print(
        'Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:, 1])))

    print('Test set')
    pred = rf.predict_proba(X_test)
    print(
        'Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:, 1])))

In [17]:
run_randomForests_clf(X_train,X_test,y_train,y_test)

Train set
Random Forests roc-auc: 0.7117087155838488
Test set
Random Forests roc-auc: 0.6964315305753028


In [18]:
run_randomForests_clf(X_train[selected_feat],X_test[selected_feat],y_train,y_test)

Train set
Random Forests roc-auc: 0.7110104595721054
Test set
Random Forests roc-auc: 0.6989186228771722


#Slightly better model with fewer features!

## Regression

In [19]:
sfs = SFS(RandomForestRegressor(n_jobs=-1, n_estimators=10),
                  k_features=10,
                  forward=True, #Forward Feature Selection
                  floating=False,
                  verbose=2, #To display the steps
                  scoring="r2",
                  cv=5,
                  n_jobs=-1,
                  pre_dispatch='2*n_jobs',
                  clone_estimator=True,
                  fixed_features=None,
                  )

In [20]:
sfs.fit(X_train_housing,y_train_housing)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  25 | elapsed:    0.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.5s finished

[2021-02-12 21:08:35] Features: 1/10 -- score: 0.6494742737986229[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  24 | elapsed:    0.3s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  19 out of  24 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    0.5s finished

[2021-02-12 21:08:36] Features: 2/10 -- score: 0.7007007747962867[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  23 | elapsed:    0.3s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  16 out of  23 | elapsed:    0.3s remaining:    0.2s
[Parallel

SequentialFeatureSelector(estimator=RandomForestRegressor(n_estimators=10,
                                                          n_jobs=-1),
                          k_features=10, n_jobs=-1, scoring='r2', verbose=2)

In [21]:
selected_feat = X_train_housing.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['MSSubClass', 'OverallQual', 'YearRemodAdd', 'BsmtFinSF1', 'GrLivArea',
       'FullBath', 'BedroomAbvGr', 'Fireplaces', 'GarageCars', 'YrSold'],
      dtype='object')

In [22]:
# function to train random forests and evaluate the performance

def run_randomForests_rgr(X_train, X_test, y_train, y_test):

    rf = RandomForestRegressor(
        n_estimators=200, random_state=39, max_depth=4, n_jobs=-1)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict(X_train)
    print('Random Forests roc-auc: {}'.format(r2_score(y_train, pred)))

    print('Test set')
    pred = rf.predict(X_test)
    print('Random Forests roc-auc: {}'.format(r2_score(y_test, pred)))

In [23]:
run_randomForests_rgr(X_train_housing,X_test_housing.fillna(0),y_train_housing,y_test_housing)

Train set
Random Forests roc-auc: 0.8671211082669377
Test set
Random Forests roc-auc: 0.8336650793250375


In [24]:
run_randomForests_rgr(X_train_housing[selected_feat],X_test_housing[selected_feat].fillna(0),y_train_housing,y_test_housing)

Train set
Random Forests roc-auc: 0.8555357099654526
Test set
Random Forests roc-auc: 0.8116331390706135


Close to same score but with fewer features.

# Step Backward Feature Selection

Will start by building a model containing all the features, and then will create models with n-1 features and see which one performs best, and then will create models from the previous best combination and remove another feature and select the one that perform best and repeat the operation until a certain threshold is met (number of features or performance).

Most of the time, the performance will be decreased but the difference is negligible until a certain point.


## Classification

In [None]:
sfs = SFS(RandomForestClassifier(n_jobs=-1, n_estimators=10),
                  k_features=68,
                  forward=False, #Backward Feature Selection
                  floating=False,
                  verbose=2, #To display the steps
                  scoring="roc_auc",
                  cv=5,
                  n_jobs=-1,
                  pre_dispatch='2*n_jobs',
                  clone_estimator=True,
                  fixed_features=None,
                  )

In [26]:
sfs.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.5min finished

[2021-02-12 21:10:15] Features: 71/68 -- score: 0.6335645823247034[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done  71 out of  71 | elapsed:  1.5min finished

[2021-02-12 21:11:45] Features: 70/68 -- score: 0.6334800836103671[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  1.5min finished

[2021-02-12 21:13:13] Features: 69/68 -- score: 0.6320026972167317[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  6

SequentialFeatureSelector(estimator=RandomForestClassifier(n_estimators=10,
                                                           n_jobs=-1),
                          forward=False, k_features=68, n_jobs=-1,
                          scoring='roc_auc', verbose=2)

In [27]:
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8',
       'var_9', 'var_10', 'var_11', 'var_12', 'var_13', 'var_14', 'var_15',
       'var_16', 'var_19', 'var_20', 'var_21', 'var_22', 'var_23', 'var_25',
       'var_26', 'var_27', 'var_30', 'var_34', 'var_35', 'var_36', 'var_37',
       'var_38', 'var_40', 'var_41', 'var_44', 'var_45', 'var_46', 'var_47',
       'var_48', 'var_49', 'var_50', 'var_51', 'var_53', 'var_55', 'var_56',
       'var_58', 'var_60', 'var_62', 'var_63', 'var_65', 'var_67', 'var_68',
       'var_69', 'var_71', 'var_73', 'var_78', 'var_79', 'var_81', 'var_82',
       'var_83', 'var_86', 'var_89', 'var_90', 'var_92', 'var_93', 'var_96',
       'var_98', 'var_99', 'var_103', 'var_107'],
      dtype='object')

In [28]:
run_randomForests_clf(X_train,X_test,y_train,y_test)

Train set
Random Forests roc-auc: 0.7117087155838488
Test set
Random Forests roc-auc: 0.6964315305753028


In [29]:
run_randomForests_clf(X_train[selected_feat],X_test[selected_feat],y_train,y_test)

Train set
Random Forests roc-auc: 0.7111783699603997
Test set
Random Forests roc-auc: 0.6955431723933649


Same score with less features!

## Regression

In [30]:
sfs = SFS(RandomForestRegressor(n_jobs=-1, n_estimators=10),
                  k_features=10,
                  forward=False, #Backward Feature Selection
                  floating=False,
                  verbose=2, #To display the steps
                  scoring="r2",
                  cv=5,
                  n_jobs=-1,
                  pre_dispatch='2*n_jobs',
                  clone_estimator=True,
                  fixed_features=None,
                  )

In [31]:
sfs.fit(X_train_housing,y_train_housing)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  25 | elapsed:    0.5s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elapsed:    0.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.9s finished

[2021-02-12 21:14:44] Features: 24/10 -- score: 0.82198800606533[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  24 | elapsed:    0.5s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  19 out of  24 | elapsed:    0.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    0.9s finished

[2021-02-12 21:14:45] Features: 23/10 -- score: 0.8190223861782024[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  23 | elapsed:    0.5s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  16 out of  23 | elapsed:    0.6s remaining:    0.3s
[Parallel

SequentialFeatureSelector(estimator=RandomForestRegressor(n_estimators=10,
                                                          n_jobs=-1),
                          forward=False, k_features=10, n_jobs=-1, scoring='r2',
                          verbose=2)

In [32]:
selected_feat = X_train_housing.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['MSSubClass', 'LotArea', 'OverallQual', 'YearRemodAdd', 'BsmtFinSF1',
       'GrLivArea', 'BsmtFullBath', 'BedroomAbvGr', 'Fireplaces',
       'WoodDeckSF'],
      dtype='object')

In [33]:
run_randomForests_rgr(X_train_housing, X_test_housing.fillna(0),
                  y_train_housing, y_test_housing)

Train set
Random Forests roc-auc: 0.8671211082669377
Test set
Random Forests roc-auc: 0.8336650793250375


In [34]:
run_randomForests_rgr(X_train_housing[selected_feat], X_test_housing[selected_feat].fillna(
    0), y_train_housing, y_test_housing)

Train set
Random Forests roc-auc: 0.8520689067176929
Test set
Random Forests roc-auc: 0.7875834205375747


In [35]:
print(
    f"Original number of features: {X_train_housing.shape}, after backward selection: {X_train_housing[selected_feat].shape}")

Original number of features: (978, 25), after backward selection: (978, 10)


Slighty worst model with 15 features less!

# Exhaustive Feature Selection

Will try all the possible combination possible within a given dataset. Very CPU and time consuming, the MLxtend implementation has two hyperparameters which are min_features for the minimum number of features that we want and max_features for the maximum number of features we want and will try all the combination within this range.

## Classification

In [75]:
# Will build all the possible combinaison of 1 features, 2 features,
# up until n features and will select the best performing subset.
# Very demanding task in terms of computational.

# Stop criteria with mlextend of this algorithm is the min and max number of features.

efs = EFS(RandomForestClassifier(n_jobs=-1, n_estimators=10),
          min_features=70, #Min number of feature for the subset
          max_features=72, #Max number of feature for the subset
          print_progress=True,
          scoring='roc_auc',
          cv=5,
          n_jobs=-1,
          pre_dispatch='2*n_jobs',
          clone_estimator=True,
          )

In [76]:
efs.fit(X_train[:100],y_train[:100]) #sample to get a faster result for the experiment.

Features: 2629/2629

ExhaustiveFeatureSelector(estimator=RandomForestClassifier(n_estimators=10,
                                                           n_jobs=-1),
                          max_features=72, min_features=70, n_jobs=-1,
                          scoring='roc_auc')

In [77]:
selected_feat = X_train.columns[list(efs.best_idx_)]

In [78]:
print(f"Number of selected features: {len(selected_feat)}")

Number of selected features: 70


In [79]:
run_randomForests_clf(X_train,X_test,y_train,y_test)

Train set
Random Forests roc-auc: 0.7117087155838488
Test set
Random Forests roc-auc: 0.6964315305753028


In [80]:
run_randomForests_clf(X_train[selected_feat],X_test[selected_feat],y_train,y_test)

Train set
Random Forests roc-auc: 0.7113019601690855
Test set
Random Forests roc-auc: 0.697230285676672


Close to same results!

## Regression

In [63]:
# Will build all the possible combinaison of 1 features, 2 features,
# up until n features and will select the best performing subset.
# Very demanding task in terms of computational.

# Stop criteria with mlextend of this algorithm is the min and max number of features.

efs = EFS(RandomForestRegressor(n_jobs=-1, n_estimators=10),
          min_features=23, #Min number of feature for the subset
          max_features=25, #Max number of feature for the subset
          print_progress=True,
          scoring='r2',
          cv=5,
          n_jobs=1,
          pre_dispatch='2*n_jobs',
          clone_estimator=True,
          )

In [64]:
efs.fit(X_train_housing[:100],y_train_housing[:100]) #sample to get a faster result for the experiment.

Features: 326/326

ExhaustiveFeatureSelector(estimator=RandomForestRegressor(n_estimators=10,
                                                          n_jobs=-1),
                          max_features=25, min_features=23, scoring='r2')

In [70]:
selected_feat = X_train_housing.columns[list(efs.best_idx_)]

In [71]:
print(f"Number of selected features: {len(selected_feat)}")

Number of selected features: 23


In [72]:
run_randomForests_rgr(X_train_housing, X_test_housing.fillna(0),
                  y_train_housing, y_test_housing)

Train set
Random Forests roc-auc: 0.8671211082669377
Test set
Random Forests roc-auc: 0.8336650793250376


In [73]:
run_randomForests_rgr(X_train_housing[selected_feat], X_test_housing[selected_feat].fillna(
    0), y_train_housing, y_test_housing)

Train set
Random Forests roc-auc: 0.8663165919440726
Test set
Random Forests roc-auc: 0.8309313636840443


Close to the same score once again with less features!