# **Recursive Feature Elimination with Random Forest Selecting features for model performance**

 ## Table of contents
>1. [Dataset](#Dataset)
>2. [Random Forest Classifier](#Random_Forest_Classifier)
>3. [Recursive feature elemination - Random forest](#Recursive_feature_elemination_Random_forest)


## Library

In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn import metrics

## Dataset <a name="Dataset"></a>

In [50]:
X_train = pd.read_csv('./data/X_train.csv').to_numpy()[1:,1:]
y_train = pd.read_csv('./data/y_train.csv').to_numpy()[1:,1:].astype('int')
X_test = pd.read_csv('./data/X_test.csv').to_numpy()[1:,1:]
y_test = pd.read_csv('./data/y_test.csv').to_numpy()[1:,1:].astype('int')

In [51]:
#check dataset
print("Training data: X_train = ",X_train.shape, "y_train = ", y_train.shape)
print("Testing data: X_test = ",X_test.shape, "y_test = ", y_test.shape)

Training data: X_train =  (110, 136) y_train =  (110, 1)
Testing data: X_test =  (27, 136) y_test =  (27, 1)


In [52]:
y_test

array([[1],
       [2],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [1],
       [2],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [2],
       [1],
       [1],
       [1]])

## Random Forest Classifier <a name="Random_Forest_Classifier"></a>

In [53]:
def random_forest_classifier(X_train, y_train, X_test, y_test):
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()   
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    X_test_scl = scaler.fit_transform(X_test)
    
    # convert matrix 2d -> 1d (by flatten method)    
    y_train = y_train.ravel()
    # y_train = y_train.astype('int')
    y_test = y_test.ravel()
    # y_test = y_test.astype('int')

    
    # Random forest classifier model
    rf = RandomForestClassifier(random_state=0, n_jobs=-1)
    rf.fit(X_train_scl, y_train)
        
    # check classes
    print("classes model: ")
    print(rf.classes_)
    
    # Predict testing dataset
    y_pred = rf.predict(X_test)
    # Predict probability (1, 2) 
    y_score = rf.predict_proba(X_test)
    print(y_score)
    
    # ROC curve: receiver operating characteristic curve
    # AUC: Area Under the ROC Curve 
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,1], pos_label=2)
    auc = metrics.auc(fpr, tpr)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
#     tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
#     confusion = (tn, fp, fn, tp)
    confusion = metrics.confusion_matrix(y_test, y_pred)
    return rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion 

In [54]:
%%time
# check efficiency random forest classifier for all dataset (train, test)
rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = random_forest_classifier(X_train, y_train, X_test, y_test)

classes model: 
[1 2]
[[0.34 0.66]
 [0.48 0.52]
 [0.39 0.61]
 [0.43 0.57]
 [0.39 0.61]
 [0.36 0.64]
 [0.5  0.5 ]
 [0.39 0.61]
 [0.47 0.53]
 [0.42 0.58]
 [0.48 0.52]
 [0.4  0.6 ]
 [0.43 0.57]
 [0.45 0.55]
 [0.38 0.62]
 [0.5  0.5 ]
 [0.41 0.59]
 [0.46 0.54]
 [0.35 0.65]
 [0.49 0.51]
 [0.51 0.49]
 [0.4  0.6 ]
 [0.37 0.63]
 [0.45 0.55]
 [0.29 0.71]
 [0.38 0.62]
 [0.47 0.53]]
Wall time: 151 ms


In [55]:
print("auc_score = "+ str(auc))

auc_score = 0.3664772727272727


In [56]:
feature_importance = rf.feature_importances_
features_index = np.arange(0, X_train.shape[1], 1, dtype=int)
f_i = list(zip(features_index, feature_importance))
f_i

[(0, 0.003594062124327922),
 (1, 0.0025301895075575953),
 (2, 0.0032188861528172972),
 (3, 0.0025015692621547353),
 (4, 0.0021494995830908053),
 (5, 0.0019324221751055276),
 (6, 0.001771676621297949),
 (7, 0.0027232607863657845),
 (8, 0.004613738133469662),
 (9, 0.005208497505863061),
 (10, 0.008735133089557175),
 (11, 0.003400420504177867),
 (12, 0.008474252977167905),
 (13, 0.01254675160403836),
 (14, 0.006939821977185212),
 (15, 0.004160164353658544),
 (16, 0.009723152301757276),
 (17, 0.007246311724914886),
 (18, 0.0023223690073796697),
 (19, 0.003113475078513653),
 (20, 0.0025897865982028485),
 (21, 0.011509766456234131),
 (22, 0.0010543106308832434),
 (23, 0.0019521020096019237),
 (24, 0.0027668858896242898),
 (25, 0.0019193889165329153),
 (26, 0.007695467952349568),
 (27, 0.008971080941027132),
 (28, 0.007728360414274306),
 (29, 0.00486785883367764),
 (30, 0.006334557670976317),
 (31, 0.012750215501682094),
 (32, 0.007306741357430494),
 (33, 0.007076528388803166),
 (34, 0.005741

In [57]:
f_i_sorted = sorted(f_i,key=lambda col: col[1], reverse=True)
f_i_sorted

[(41, 0.021763054183507786),
 (85, 0.02107400897863142),
 (87, 0.018078509881537648),
 (68, 0.017468030109249633),
 (61, 0.017423278511112715),
 (89, 0.016570571009491435),
 (132, 0.016240380932363588),
 (84, 0.014453090116370405),
 (59, 0.014215227215595377),
 (40, 0.013962907123914054),
 (72, 0.013611415467297841),
 (83, 0.012921260086520787),
 (131, 0.012874990784237185),
 (31, 0.012750215501682094),
 (67, 0.0125987964115798),
 (13, 0.01254675160403836),
 (49, 0.012320761833438332),
 (125, 0.011993883792448779),
 (46, 0.011936125460039741),
 (52, 0.011707683866819234),
 (58, 0.011554753326782814),
 (21, 0.011509766456234131),
 (88, 0.010894657854622148),
 (69, 0.010746751393590446),
 (57, 0.010678108926619254),
 (63, 0.010359084936846808),
 (62, 0.010358260320006873),
 (75, 0.01035491796665291),
 (55, 0.010325792227661048),
 (127, 0.010030799300285435),
 (45, 0.009943417672720823),
 (120, 0.009906205331422613),
 (134, 0.009848816969530206),
 (16, 0.009723152301757276),
 (118, 0.0096

## Recursive feature elemination - Random forest  <a name="Recursive_feature_elemination_Random_forest"></a>

In [58]:
def RFE_RF(X_train, y_train, X_test, y_test, n_feature):
    
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    
    # Random forest classifier estimator
    rf = RandomForestClassifier(random_state=0, n_jobs=-1)
    
    # RFE model
    rfe = RFE(estimator=rf, n_features_to_select=n_feature, step=1, verbose=1)
    rfe.fit_transform(X_train_scl, y_train)
    
    # Reduce X to the selected features.
    X_train_reduce = rfe.transform(X_train)
    X_test_reduce = rfe.transform(X_test)
    
    rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = random_forest_classifier(X_train_reduce, y_train, X_test_reduce, y_test)
    return rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

In [59]:
rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = RFE_RF(X_train, y_train, X_test, y_test, 390)

classes model: 
[1 2]
[[0.34 0.66]
 [0.48 0.52]
 [0.39 0.61]
 [0.43 0.57]
 [0.39 0.61]
 [0.36 0.64]
 [0.5  0.5 ]
 [0.39 0.61]
 [0.47 0.53]
 [0.42 0.58]
 [0.48 0.52]
 [0.4  0.6 ]
 [0.43 0.57]
 [0.45 0.55]
 [0.38 0.62]
 [0.5  0.5 ]
 [0.41 0.59]
 [0.46 0.54]
 [0.35 0.65]
 [0.49 0.51]
 [0.51 0.49]
 [0.4  0.6 ]
 [0.37 0.63]
 [0.45 0.55]
 [0.29 0.71]
 [0.38 0.62]
 [0.47 0.53]]


In [60]:
print("auc_score = "+ str(auc))

auc_score = 0.3664772727272727


In [61]:
confusion

array([[ 1, 10],
       [ 2, 14]], dtype=int64)

In [62]:
confusion.ravel()

array([ 1, 10,  2, 14], dtype=int64)

In [63]:
confusion

array([[ 1, 10],
       [ 2, 14]], dtype=int64)

In [64]:
features = np.arange(0, X_train.shape[1], 1, dtype=int)
selected_features = np.array(features)[rfe.get_support()]
selected_features

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135])

Find number feature to select is the optimal value

In [65]:
def optimal_number_feature_to_select(min_feature, max_feature, sequence=50, savelink = "./output/n_feature_to_select.txt"):
    auc_max = 0
    number_feature_to_select = min_feature
    text_file = open(savelink, "a")

    for i in range(min_feature, max_feature, sequence):
        text = "n_features_to_select = " + str(i) + " \n"
        n = text_file.writelines(text)

        rfe_rf, rf_reduce, auc_score_rf_reduce = RFE_RF(X_train, y_train, X_test, y_test, i)
        text = "auc = " + str(auc_score_rf_reduce) + " \n"
        n = text_file.writelines(text)
        
        if(auc_max < auc_score_rf_reduce):
            auc_max = auc_score_rf_reduce
            number_feature_to_select = i
    
    text_file.close()
    return number_feature_to_select, auc_max

In [66]:
# number_feature_to_select, auc_max = optimal_number_feature_to_select(100, 600)

In [67]:
number_feature_to_select = 390

In [68]:
rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = RFE_RF(X_train, y_train, X_test, y_test, number_feature_to_select)

classes model: 
[1 2]
[[0.34 0.66]
 [0.48 0.52]
 [0.39 0.61]
 [0.43 0.57]
 [0.39 0.61]
 [0.36 0.64]
 [0.5  0.5 ]
 [0.39 0.61]
 [0.47 0.53]
 [0.42 0.58]
 [0.48 0.52]
 [0.4  0.6 ]
 [0.43 0.57]
 [0.45 0.55]
 [0.38 0.62]
 [0.5  0.5 ]
 [0.41 0.59]
 [0.46 0.54]
 [0.35 0.65]
 [0.49 0.51]
 [0.51 0.49]
 [0.4  0.6 ]
 [0.37 0.63]
 [0.45 0.55]
 [0.29 0.71]
 [0.38 0.62]
 [0.47 0.53]]


In [69]:
thresholds

array([1.71, 0.71, 0.66, 0.64, 0.63, 0.62, 0.61, 0.6 , 0.59, 0.58, 0.55,
       0.54, 0.52, 0.51, 0.5 , 0.49])

In [70]:
len(thresholds)

16

In [71]:
tpr

array([0.    , 0.    , 0.    , 0.125 , 0.125 , 0.125 , 0.25  , 0.375 ,
       0.375 , 0.4375, 0.6875, 0.6875, 0.8125, 0.875 , 0.9375, 1.    ])

In [72]:
len(tpr)

16

In [73]:
fpr

array([0.        , 0.09090909, 0.18181818, 0.18181818, 0.27272727,
       0.45454545, 0.54545455, 0.54545455, 0.63636364, 0.63636364,
       0.63636364, 0.72727273, 0.90909091, 0.90909091, 1.        ,
       1.        ])

In [74]:
len(fpr)

16

In [75]:
confusion

array([[ 1, 10],
       [ 2, 14]], dtype=int64)

In [76]:
# n_feature = []
# auc_arr = []
# accuracy_arr = []
# recall_arr = []
# for i in range(50, 550, 50):
#     number_feature_to_select = i
#     rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall,confusion = RFE_RF(X_train, y_train, X_test, y_test, number_feature_to_select)
#     print(i)
#     n_feature.append(i)
#     auc_arr.append(auc)
#     accuracy_arr.append(accuracy)
#     recall_arr.append(recall)

In [77]:
# print(n_feature)

In [78]:
# print(auc_arr)

In [79]:
# print(accuracy_arr)

In [80]:
# print(recall_arr)

In [81]:
# features = np.arange(0, X_train.shape[1], 1, dtype=int)
# selected_features = np.array(features)[rfe.get_support()]
# selected_features