In [1]:
# Import libraries

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,r2_score

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline

from feature_engine.selection import DropConstantFeatures, DropCorrelatedFeatures

from mlxtend.feature_selection import SequentialFeatureSelector as SFS, ExhaustiveFeatureSelector as EFS

In [2]:
# Load both CSV files used in this notebook: the pre-cleaned dataset (`df`)
# and the house-prices training file (`housing`).
df = pd.read_csv("./precleaned-datasets/dataset_2.csv")
housing = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv")

In [3]:
# Preview the first rows of the frame read from dataset_2.csv.
df.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109
0,4.53271,3.280834,17.982476,4.404259,2.34991,0.603264,2.784655,0.323146,12.009691,0.139346,...,2.079066,6.748819,2.941445,18.360496,17.726613,7.774031,1.473441,1.973832,0.976806,2.541417
1,5.821374,12.098722,13.309151,4.125599,1.045386,1.832035,1.833494,0.70909,8.652883,0.102757,...,2.479789,7.79529,3.55789,17.383378,15.193423,8.263673,1.878108,0.567939,1.018818,1.416433
2,1.938776,7.952752,0.972671,3.459267,1.935782,0.621463,2.338139,0.344948,9.93785,11.691283,...,1.861487,6.130886,3.401064,15.850471,14.620599,6.849776,1.09821,1.959183,1.575493,1.857893
3,6.02069,9.900544,17.869637,4.366715,1.973693,2.026012,2.853025,0.674847,11.816859,0.011151,...,1.340944,7.240058,2.417235,15.194609,13.553772,7.229971,0.835158,2.234482,0.94617,2.700606
4,3.909506,10.576516,0.934191,3.419572,1.871438,3.340811,1.868282,0.439865,13.58562,1.153366,...,2.738095,6.565509,4.341414,15.893832,11.929787,6.954033,1.853364,0.511027,2.599562,0.811364


In [4]:
# Preview the first rows of the house-prices frame (still includes object columns here).
housing.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Names of every housing column whose dtype is not "object".
columns = [name for name, kind in housing.dtypes.items() if kind != "object"]

In [6]:
# Keep only the non-object columns collected in `columns`.
housing = housing.loc[:, columns]

In [7]:
# Hold out 33% of `df` for testing; "target" is the label column.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["target"]),
    df["target"],
    test_size=0.33,
    random_state=42,
)

In [8]:
# Hold out 33% of `housing` for testing; "SalePrice" is the regression target.
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(
    housing.drop(columns=["SalePrice"]),
    housing["SalePrice"],
    test_size=0.33,
    random_state=42,
)

In [9]:
# Two-step pruning pipeline: DropConstantFeatures (tol=0.8) followed by
# DropCorrelatedFeatures (Pearson correlation, threshold 0.8).
pipeline = Pipeline(steps=[
    ("drop_constante", DropConstantFeatures(tol=0.8)),
    (
        "drop_correlated",
        DropCorrelatedFeatures(
            threshold=0.8,
            method="pearson",
            missing_values="ignore",
            variables=None,
        ),
    ),
])

In [10]:
# Fit the pruning pipeline on the training split only, then apply the fitted
# pipeline to the test split. Both names are rebound to the pruned frames.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [11]:
# Rebind `pipeline` to a fresh, unfitted pruning pipeline with the same two
# steps (DropConstantFeatures, then DropCorrelatedFeatures).
pipeline = Pipeline(steps=[
    ("drop_constante", DropConstantFeatures(tol=0.8)),
    (
        "drop_correlated",
        DropCorrelatedFeatures(
            threshold=0.8,
            method="pearson",
            missing_values="ignore",
            variables=None,
        ),
    ),
])

In [12]:
# Impute missing values with 0 on BOTH splits before pruning, so the train and
# test frames receive identical preprocessing. Previously only the training
# split was imputed, which left NaNs in X_test_housing.
X_train_housing = pipeline.fit_transform(X_train_housing.fillna(0))
X_test_housing = pipeline.transform(X_test_housing.fillna(0))

# Step Forward Feature Selection

## Classification

In [13]:
# Forward sequential selection of 10 features, scored by 5-fold
# cross-validated ROC-AUC of a 10-tree random forest classifier.
sfs = SFS(
    estimator=RandomForestClassifier(n_estimators=10, n_jobs=-1),
    k_features=10,
    forward=True,   # forward feature selection
    floating=False,
    scoring="roc_auc",
    cv=5,
    verbose=2,      # display the steps
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    clone_estimator=True,
    fixed_features=None,
)

In [None]:
# Run the sequential selector on the pruned training split.
sfs.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.2s


In [None]:
# Translate the selected column positions (sfs.k_feature_idx_) into column names.
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

In [None]:
def run_randomForests_clf(X_train, X_test, y_train, y_test):
    """Fit a random forest classifier and print its train and test ROC-AUC.

    A 200-tree forest (max_depth=4, random_state=39) is fitted on the training
    data. For each split, the second column of ``predict_proba`` is used as
    the score passed to ``roc_auc_score``.
    NOTE(review): indexing column 1 presumes a binary target; confirm.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.

    Returns:
        None. The scores are printed.
    """
    forest = RandomForestClassifier(
        n_estimators=200, max_depth=4, random_state=39, n_jobs=-1)
    forest.fit(X_train, y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for title, features, labels in splits:
        print(title)
        scores = forest.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(labels, scores)))

In [None]:
# Baseline: evaluate with every column of the pruned frames.
run_randomForests_clf(X_train,X_test,y_train,y_test)

In [None]:
# Same evaluation restricted to the columns in `selected_feat`.
run_randomForests_clf(X_train[selected_feat],X_test[selected_feat],y_train,y_test)

#Slightly better model with fewer features!

## Regression

In [None]:
# Forward sequential selection of 10 features, scored by 5-fold
# cross-validated R^2 of a 10-tree random forest regressor.
sfs = SFS(
    estimator=RandomForestRegressor(n_estimators=10, n_jobs=-1),
    k_features=10,
    forward=True,   # forward feature selection
    floating=False,
    scoring="r2",
    cv=5,
    verbose=2,      # display the steps
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    clone_estimator=True,
    fixed_features=None,
)

In [None]:
# Run the sequential selector on the pruned housing training split.
sfs.fit(X_train_housing,y_train_housing)

In [None]:
# Translate the selected column positions (sfs.k_feature_idx_) into housing column names.
selected_feat = X_train_housing.columns[list(sfs.k_feature_idx_)]
selected_feat

In [None]:
def run_randomForests_rgr(X_train, X_test, y_train, y_test):
    """Fit a random forest regressor and print its train and test R^2.

    A 200-tree forest (max_depth=4, random_state=39) is fitted on the training
    data, and ``r2_score`` is computed from the predictions on each split.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training targets.
        y_test: test targets.

    Returns:
        None. The scores are printed.
    """
    rf = RandomForestRegressor(
        n_estimators=200, random_state=39, max_depth=4, n_jobs=-1)
    rf.fit(X_train, y_train)

    for title, features, targets in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(title)
        pred = rf.predict(features)
        # The metric is R^2, so the label says so (it previously read "roc-auc").
        print('Random Forests r2: {}'.format(r2_score(targets, pred)))

In [None]:
# Baseline: evaluate with every column of the pruned housing frames
# (missing test values are replaced by 0 for this call).
run_randomForests_rgr(X_train_housing,X_test_housing.fillna(0),y_train_housing,y_test_housing)

In [None]:
# Same evaluation restricted to the columns in `selected_feat`.
run_randomForests_rgr(X_train_housing[selected_feat],X_test_housing[selected_feat].fillna(0),y_train_housing,y_test_housing)

Nearly the same score, but with fewer features.

# Step Backward Feature Selection

## Classification

In [None]:
# Backward sequential selection down to 68 features, scored by 5-fold
# cross-validated ROC-AUC of a 10-tree random forest classifier.
sfs = SFS(
    estimator=RandomForestClassifier(n_estimators=10, n_jobs=-1),
    k_features=68,
    forward=False,  # backward feature selection
    floating=False,
    scoring="roc_auc",
    cv=5,
    verbose=2,      # display the steps
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    clone_estimator=True,
    fixed_features=None,
)

In [None]:
# Run the backward selector on the pruned training split.
sfs.fit(X_train,y_train)

In [None]:
# Translate the retained column positions (sfs.k_feature_idx_) into column names.
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

In [None]:
# Baseline: evaluate with every column of the pruned frames.
run_randomForests_clf(X_train,X_test,y_train,y_test)

In [None]:
# Same evaluation restricted to the columns in `selected_feat`.
run_randomForests_clf(X_train[selected_feat],X_test[selected_feat],y_train,y_test)

Same score with fewer features!

## Regression

In [None]:
# Backward sequential selection down to 10 features, scored by 5-fold
# cross-validated R^2 of a 10-tree random forest regressor.
sfs = SFS(
    estimator=RandomForestRegressor(n_estimators=10, n_jobs=-1),
    k_features=10,
    forward=False,  # backward feature selection
    floating=False,
    scoring="r2",
    cv=5,
    verbose=2,      # display the steps
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    clone_estimator=True,
    fixed_features=None,
)

In [None]:
# Run the backward selector on the pruned housing training split.
sfs.fit(X_train_housing,y_train_housing)

In [None]:
# Translate the retained column positions (sfs.k_feature_idx_) into housing column names.
selected_feat = X_train_housing.columns[list(sfs.k_feature_idx_)]
selected_feat

In [None]:
# Baseline: evaluate with every column of the pruned housing frames
# (missing test values are replaced by 0 for this call).
run_randomForests_rgr(X_train_housing, X_test_housing.fillna(0),
                  y_train_housing, y_test_housing)

In [None]:
# Same evaluation restricted to the columns in `selected_feat`.
run_randomForests_rgr(X_train_housing[selected_feat], X_test_housing[selected_feat].fillna(
    0), y_train_housing, y_test_housing)

In [None]:
# Report column counts (shape[1]) rather than full (rows, columns) shape
# tuples, so the printed values match the "number of features" label.
print(
    f"Original number of features: {X_train_housing.shape[1]}, after backward selection: {X_train_housing[selected_feat].shape[1]}")

Slightly worse model with 15 fewer features!

# Exhaustive Feature Selection

## Classification

In [None]:
# Exhaustive selection evaluates every feature subset whose size lies between
# min_features and max_features and keeps the best-scoring one.
# This is very computationally demanding.
#
# In mlxtend, the search range of this algorithm is bounded by the minimum
# and maximum number of features.

efs = EFS(
    estimator=RandomForestClassifier(n_estimators=10, n_jobs=-1),
    min_features=70,  # smallest subset size to evaluate
    max_features=72,  # largest subset size to evaluate
    scoring="roc_auc",
    cv=5,
    print_progress=True,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    clone_estimator=True,
)

In [None]:
# Run the exhaustive selector on the pruned training split.
efs.fit(X_train,y_train)

In [None]:
# Translate the best subset's column positions (efs.best_idx_) into column names.
selected_feat = X_train.columns[list(efs.best_idx_)]

In [None]:
# Baseline: evaluate with every column of the pruned frames.
run_randomForests_clf(X_train,X_test,y_train,y_test)

In [None]:
# Same evaluation restricted to the columns in `selected_feat`.
run_randomForests_clf(X_train[selected_feat],X_test[selected_feat],y_train,y_test)

## Regression

In [None]:
# Exhaustive selection evaluates every feature subset whose size lies between
# min_features and max_features and keeps the best-scoring one.
# This is very computationally demanding.
#
# In mlxtend, the search range of this algorithm is bounded by the minimum
# and maximum number of features.
#
# The bounds are derived from the housing frame itself: mlxtend rejects a
# max_features larger than the number of columns at fit time, and the pruned
# housing frame has fewer columns than the 70-72 range used for the
# classification data. The search covers the full set and subsets missing
# one or two columns.

efs = EFS(RandomForestRegressor(n_jobs=-1, n_estimators=10),
          min_features=max(1, X_train_housing.shape[1] - 2), #Min number of feature for the subset
          max_features=X_train_housing.shape[1], #Max number of feature for the subset
          print_progress=True,
          scoring='r2',
          cv=5,
          n_jobs=1,
          pre_dispatch='2*n_jobs',
          clone_estimator=True,
          )

In [None]:
# Run the exhaustive selector on the pruned housing training split.
efs.fit(X_train_housing,y_train_housing)

In [None]:
# efs was fitted on X_train_housing, so its positional indices must be mapped
# through the housing frame's columns (not the classification frame's).
selected_feat = X_train_housing.columns[list(efs.best_idx_)]

In [None]:
# Baseline: evaluate with every column of the pruned housing frames
# (missing test values are replaced by 0 for this call).
run_randomForests_rgr(X_train_housing, X_test_housing.fillna(0),
                  y_train_housing, y_test_housing)

In [None]:
# Same evaluation restricted to the columns in `selected_feat`.
run_randomForests_rgr(X_train_housing[selected_feat], X_test_housing[selected_feat].fillna(
    0), y_train_housing, y_test_housing)