In [20]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
# Read only the first 15,000 rows of the BNP Paribas claims training file.
# NOTE(review): the saved output for this cell reports 50,000 rows, so it
# appears to predate the current nrows value -- re-run to refresh it.
data = pd.read_csv(
    "../Large_data/feature/bnp-paribas-card-claim/train.csv",
    nrows=15000,
)
data.shape

(50000, 133)

In [3]:
# Feature selection should be done after data preprocessing.
# Keep only the numeric columns; the dtype names below are the ones retained.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
data = data.select_dtypes(include=numerics)
numerical_vars = list(data.columns)
data.shape
# Columns containing null values can be listed with:
# [col for col in data.columns if data[col].isnull().sum() > 0]

(50000, 114)

In [4]:
# Hold out 30% of the rows for testing (fixed seed). The predictors are every
# column except "target" (the label) and "ID".
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target", "ID"]),
    data["target"],
    test_size=0.3,
    random_state=0,
)
del data  # the full frame is no longer needed once the splits exist
X_train.shape, X_test.shape

((35000, 112), (15000, 112))

In [5]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()``. For every pair of columns whose correlation has an
    absolute value strictly greater than ``threshold``, the column that comes
    later in ``dataset`` is flagged, so that dropping the returned columns
    keeps the first member of each correlated pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute correlation above which the later column is flagged.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    # Take the absolute value of the coefficients themselves (not of the
    # comparison result), so strong negative correlations are caught as well
    # as strong positive ones.
    corr_matrix = dataset.corr().abs()
    col_corr = set()
    for i in range(len(corr_matrix.columns)):
        # Only the entries below the diagonal are inspected: each pair is
        # visited once and a column is never compared with itself.
        for j in range(i):
            if corr_matrix.iloc[i, j] > threshold:
                col_corr.add(corr_matrix.columns[i])
                break  # column i is already flagged; no need to keep scanning
    return col_corr

# Collect the training-set columns that `correlation` flags at a 0.8 threshold.
corr_features = correlation(X_train, 0.8)
print("Correlated Features: ", len(corr_features))

Correlated Features:  47


In [6]:
# Remove the flagged columns from both splits. The frames are reassigned
# rather than modified in place, so objects produced by an earlier cell are
# not mutated behind the reader's back.
X_train = X_train.drop(columns=list(corr_features))
X_test = X_test.drop(columns=list(corr_features))

X_train.shape, X_test.shape

((35000, 65), (15000, 65))

In [8]:
# Backward selection (forward=False): features are removed one at a time until
# 7 remain, scoring each candidate subset by 3-fold cross-validated ROC-AUC.
# random_state pins the forest so the selected subset is reproducible.
sfs1 = SFS(RandomForestClassifier(n_jobs=4, random_state=0),
           k_features=7,
           forward=False,
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3)

# Missing values are replaced with 0 and the frame is passed as a plain array,
# so the selector reports features by position rather than by name.
sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.2s remaining:    0.0s

STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

In [19]:
# Translate the positional indices reported by the selector into column names.
# NOTE(review): the saved output for this cell lists house-price columns and
# its execution count is out of sequence -- it looks stale; re-run to refresh.
chosen_positions = list(sfs1.k_feature_idx_)
selected_feat = X_train.columns[chosen_positions]
selected_feat

Index(['OverallQual', 'YearBuilt', 'GrLivArea'], dtype='object')

In [10]:
def run_randomForest(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and print ROC-AUC for both splits.

    The forest uses 200 trees with a maximum depth of 4 and a fixed seed (39).
    Each score is computed from column 1 of ``predict_proba``. Nothing is
    returned; the results are printed.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    forest.fit(X_train, y_train)

    # Score the training split first, then the held-out split, with the same steps.
    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        scores = forest.predict_proba(features)[:, 1]
        print(f'Random forest roc_auc: {roc_auc_score(labels, scores)}')

In [11]:
# Evaluate a forest restricted to the selected features; nulls are filled with 0.
run_randomForest(
    X_train=X_train[selected_feat].fillna(0),
    X_test=X_test[selected_feat].fillna(0),
    y_train=y_train,
    y_test=y_test,
)

Train set
Random forest roc_auc: 0.7124253183462632
Test set
Random forest roc_auc: 0.6947725563833757


In [21]:
# Load the complete house-prices training file (no row limit).
data = pd.read_csv("../Large_data/feature/house-prices/train.csv")
data.shape

(1460, 81)

In [22]:
#feature selection should be done after data preprocessing
numerics = ['int16','int32','int64','float16','float32','float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]
data.shape
#[col for col in data.columns if data[col].isnull().sum() > 0] #presence of null value

(1460, 38)

In [23]:
# Hold out 30% of the rows for testing (fixed seed). "SalePrice" is the value
# being predicted, so it is removed from the predictors.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["SalePrice"]),
    data["SalePrice"],
    test_size=0.3,
    random_state=0,
)
del data  # the full frame is no longer needed once the splits exist
X_train.shape, X_test.shape


((1022, 37), (438, 37))

In [24]:
# Collect the training-set columns that `correlation` flags at a 0.8 threshold.
corr_features = correlation(X_train, 0.8)
print("Correlated Features: ", len(corr_features))

Correlated Features:  3


In [25]:
# Remove the flagged columns from both splits. The frames are reassigned
# rather than modified in place, so objects produced by an earlier cell are
# not mutated behind the reader's back.
X_train = X_train.drop(columns=list(corr_features))
X_test = X_test.drop(columns=list(corr_features))

X_train.shape, X_test.shape

((1022, 34), (438, 34))

In [27]:
# Backward selection (forward=False): features are removed one at a time until
# 15 remain, scoring each candidate subset by 3-fold cross-validated R^2.
# random_state pins the forest so the selected subset is reproducible.
sfs1 = SFS(RandomForestRegressor(random_state=0),
           k_features=15,
           forward=False,
           floating=False,
           verbose=2,
           scoring='r2',
           cv=3)

# Missing values are replaced with 0 and the frame is passed as a plain array,
# so the selector reports features by position rather than by name.
sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:  1.1min finished

[2020-05-31 18:19:45] Features: 33/15 -- score: 0.857086719267344[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:  1.0min finished

[2020-05-31 18:20:47] Features: 32/15 -- score: 0.8588912058138046[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:   57.2s finished

[2020-05-31 18:21:44] Features: 31/15 -- score: 0.8602042918827385[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done

In [28]:
# Translate the positional indices reported by the selector into column names.
chosen_positions = list(sfs1.k_feature_idx_)
selected_feat = X_train.columns[chosen_positions]
selected_feat

Index(['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtUnfSF', 'GrLivArea', 'FullBath', 'KitchenAbvGr',
       'GarageCars', 'OpenPorchSF', '3SsnPorch', 'PoolArea', 'MoSold'],
      dtype='object')