## Recursive Feature Elimination with Cross Validation (Wrapper Method)
### Dataset : https://www.kaggle.com/c/santander-customer-satisfaction/data?select=train.csv

In [48]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor

# It has 76k rows , 371 columns , hence taking only 100 rows
# since Time complexity is very high.
df = pd.read_csv("santander_dataset.csv", nrows=100)

df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,39205.17,0
1,3,2,34,0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,49278.03,0
2,4,2,23,0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,67333.77,0
3,8,2,37,0,195.0,195.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,64007.97,0
4,10,2,39,0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,117310.979016,0


In [51]:
x = df.drop("TARGET", axis="columns")
y = df["TARGET"]

column_names = x.columns # Gets the list of all column names except the Target.

column_names

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var29_ult3', 'saldo_medio_var33_hace2',
       'saldo_medio_var33_hace3', 'saldo_medio_var33_ult1',
       'saldo_medio_var33_ult3', 'saldo_medio_var44_hace2',
       'saldo_medio_var44_hace3', 'saldo_medio_var44_ult1',
       'saldo_medio_var44_ult3', 'var38'],
      dtype='object', length=370)

**Using SkLearn to implement RFECV. 
RFECV( ) Takes following parameters :
 1] Model
 2] Step size for elimination
 3] Minimum number of features to be selected.**

In [60]:
rfe_cv = RFECV(estimator=DecisionTreeRegressor(),min_features_to_select=50)

rfe_cv.fit(x, y)

RFECV(estimator=DecisionTreeRegressor())

In [65]:
# Gets a list containing boolean values -
# True for best selected features/ False for features that are eliminated.
boolean_list = rfe_cv.get_support()

print(boolean_list)

[False False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

In [66]:
optimal_features = column_names[boolean_list]

print("Total optimal features selected :{} ".format(len(optimal_features)))

Total optimal features selected :115 


In [68]:
eliminated_features = []

for column in column_names:
    if column not in optimal_features:
        eliminated_features.append(column)
        
print("Total number of optimal features selected : {}".format(len(eliminated_features)))

Total number of optimal features selected : 255


**Dropping the eliminated features.**

In [64]:
x = x[optimal_features]

x.head()

Unnamed: 0,imp_op_var40_comer_ult3,imp_op_var40_efect_ult3,imp_op_var40_ult1,imp_op_var41_efect_ult3,imp_op_var41_ult1,imp_op_var39_efect_ult3,imp_op_var39_ult1,imp_sal_var16_ult1,ind_var1,ind_var5_0,...,num_meses_var39_vig_ult3,num_op_var39_comer_ult1,num_op_var39_comer_ult3,num_op_var40_comer_ult1,num_op_var40_comer_ult3,num_op_var41_comer_ult1,num_op_var41_comer_ult3,num_op_var41_efect_ult1,num_op_var39_efect_ult1,num_var43_emit_ult1
0,0.0,0,0.0,0,0.0,0,0.0,0,0,1,...,2,0,0,0,0,0,0,0,0,0
1,0.0,0,0.0,0,0.0,0,0.0,0,0,1,...,2,0,0,0,0,0,0,0,0,0
2,0.0,0,0.0,0,0.0,0,0.0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0.0,0,0.0,0,195.0,0,195.0,0,0,1,...,1,9,9,0,0,9,9,0,0,0
4,0.0,0,0.0,0,0.0,0,0.0,0,0,1,...,2,0,0,0,0,0,0,0,0,3


**NOTE : Wrapper methods like RFECV are computationally very expensive and the Time complexity is also very large.**