# **Recursive Feature Elimination with Random Forest Selecting features for model performance**

 ## Table of contents
>1. [Dataset](#Dataset)
>2. [Random Forest Classifier](#Random_Forest_Classifier)
>3. [Recursive feature elemination - Random forest](#Recursive_feature_elemination_Random_forest)


## Library

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn import metrics

## Dataset <a name="Dataset"></a>

In [35]:
X_train = pd.read_csv('../data/dataset/NoQC/X_train.csv').to_numpy()
y_train = pd.read_csv('../data/dataset/NoQC/y_train.csv').to_numpy()
X_test = pd.read_csv('../data/dataset/NoQC/X_test.csv').to_numpy()
y_test = pd.read_csv('../data/dataset/NoQC/y_test.csv').to_numpy()

In [36]:
#check dataset
print("Training data: X_train = ",X_train.shape, "y_train = ", y_train.shape)
print("Testing data: X_test = ",X_test.shape, "y_test = ", y_test.shape)

Training data: X_train =  (111, 605) y_train =  (111, 1)
Testing data: X_test =  (28, 605) y_test =  (28, 1)


In [37]:
y_test

array([[1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [1],
       [2],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [2],
       [1],
       [1],
       [1]], dtype=int64)

## Random Forest Classifier <a name="Random_Forest_Classifier"></a>

In [38]:
def random_forest_classifier(X_train, y_train, X_test, y_test):
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()   
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    X_test_scl = scaler.fit_transform(X_test)
    
    # convert matrix 2d -> 1d (by flatten method)    
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    
    # Random forest classifier model
    rf = RandomForestClassifier(random_state=0, n_jobs=-1)
    rf.fit(X_train_scl, y_train)
        
    # check classes
    print("classes model: ")
    print(rf.classes_)
    
    # Predict testing dataset
    y_pred = rf.predict(X_test)
    # Predict probability (1, 2) 
    y_score = rf.predict_proba(X_test)
    print(y_score)
    
    # ROC curve: receiver operating characteristic curve
    # AUC: Area Under the ROC Curve 
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,1], pos_label=2)
    auc = metrics.auc(fpr, tpr)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
#     tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
#     confusion = (tn, fp, fn, tp)
    confusion = metrics.confusion_matrix(y_test, y_pred)
    return rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion 

In [39]:
%%time
# check efficiency random forest classifier for all dataset (train, test)
rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = random_forest_classifier(X_train, y_train, X_test, y_test)

classes model: 
[1 2]
[[0.59 0.41]
 [0.54 0.46]
 [0.62 0.38]
 [0.48 0.52]
 [0.59 0.41]
 [0.62 0.38]
 [0.57 0.43]
 [0.65 0.35]
 [0.53 0.47]
 [0.56 0.44]
 [0.53 0.47]
 [0.61 0.39]
 [0.44 0.56]
 [0.51 0.49]
 [0.7  0.3 ]
 [0.5  0.5 ]
 [0.55 0.45]
 [0.49 0.51]
 [0.56 0.44]
 [0.42 0.58]
 [0.69 0.31]
 [0.66 0.34]
 [0.6  0.4 ]
 [0.51 0.49]
 [0.64 0.36]
 [0.45 0.55]
 [0.53 0.47]
 [0.62 0.38]]
CPU times: total: 172 ms
Wall time: 125 ms


In [40]:
print("auc_score = "+ str(auc))

auc_score = 0.38802083333333337


In [41]:
feature_importance = rf.feature_importances_
features_index = np.arange(0, X_train.shape[1], 1, dtype=int)
f_i = list(zip(features_index, feature_importance))
f_i

[(0, 0.0),
 (1, 8.47975553857907e-05),
 (2, 0.0),
 (3, 0.0002404158544509422),
 (4, 0.0),
 (5, 0.00037525354969574165),
 (6, 0.0),
 (7, 0.004526001999516245),
 (8, 0.001857925037690587),
 (9, 0.0003629978684550598),
 (10, 0.004399380603239685),
 (11, 0.00204334628154042),
 (12, 0.0038685054970028247),
 (13, 0.001143099004319998),
 (14, 0.0019266384822382418),
 (15, 0.00889783928726194),
 (16, 0.0026729969656460882),
 (17, 0.004505636562028702),
 (18, 0.0),
 (19, 0.002060010009764487),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.004581500797855939),
 (24, 0.0068979842755490395),
 (25, 0.0),
 (26, 0.00449119878325455),
 (27, 0.00029365079365079364),
 (28, 0.0),
 (29, 0.0),
 (30, 0.009011324992868105),
 (31, 0.004146511897527134),
 (32, 0.0),
 (33, 0.001109496564669569),
 (34, 0.00789860705029066),
 (35, 0.0010263924256141389),
 (36, 0.0),
 (37, 0.0),
 (38, 0.006391928747776974),
 (39, 0.0),
 (40, 0.0),
 (41, 0.011665518870973637),
 (42, 0.0004221033868092693),
 (43, 0.0001989247311827958

In [42]:
f_i_sorted = sorted(f_i,key=lambda col: col[1], reverse=True)
f_i_sorted

[(219, 0.02075830652313379),
 (221, 0.01828171532006777),
 (132, 0.017624535977725196),
 (110, 0.01563127582115065),
 (114, 0.014698056176738996),
 (224, 0.012687638224456116),
 (153, 0.012255683808618911),
 (109, 0.012225196054643263),
 (407, 0.012062676445712916),
 (336, 0.012018485553775252),
 (41, 0.011665518870973637),
 (486, 0.011636235265138623),
 (483, 0.01149843369730846),
 (410, 0.011216303120598958),
 (149, 0.011177275014258336),
 (445, 0.01035367946921208),
 (216, 0.010126054202524305),
 (492, 0.009787718749128791),
 (115, 0.009763416806037931),
 (189, 0.0094835563724816),
 (30, 0.009011324992868105),
 (406, 0.00890077154449755),
 (15, 0.00889783928726194),
 (239, 0.00879168097314257),
 (401, 0.008707166244634357),
 (230, 0.008551859914129673),
 (458, 0.008422388240699531),
 (507, 0.008409639719447843),
 (564, 0.008400728564565885),
 (158, 0.008351604565017592),
 (358, 0.008087863802914836),
 (517, 0.008081305160643211),
 (157, 0.008044715496184891),
 (227, 0.00796822189780

## Recursive feature elemination - Random forest  <a name="Recursive_feature_elemination_Random_forest"></a>

In [43]:
def RFE_RF(X_train, y_train, X_test, y_test, n_feature):
    
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    
    # Random forest classifier estimator
    rf = RandomForestClassifier(random_state=0, n_jobs=-1)
    
    # RFE model
    rfe = RFE(estimator=rf, n_features_to_select=n_feature, step=1, verbose=1)
    rfe.fit_transform(X_train_scl, y_train)
    
    # Reduce X to the selected features.
    X_train_reduce = rfe.transform(X_train)
    X_test_reduce = rfe.transform(X_test)
    
    rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = random_forest_classifier(X_train_reduce, y_train, X_test_reduce, y_test)
    return rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

In [44]:
rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = RFE_RF(X_train, y_train, X_test, y_test, 390)

Fitting estimator with 605 features.
Fitting estimator with 604 features.
Fitting estimator with 603 features.
Fitting estimator with 602 features.
Fitting estimator with 601 features.
Fitting estimator with 600 features.
Fitting estimator with 599 features.
Fitting estimator with 598 features.
Fitting estimator with 597 features.
Fitting estimator with 596 features.
Fitting estimator with 595 features.
Fitting estimator with 594 features.
Fitting estimator with 593 features.
Fitting estimator with 592 features.
Fitting estimator with 591 features.
Fitting estimator with 590 features.
Fitting estimator with 589 features.
Fitting estimator with 588 features.
Fitting estimator with 587 features.
Fitting estimator with 586 features.
Fitting estimator with 585 features.
Fitting estimator with 584 features.
Fitting estimator with 583 features.
Fitting estimator with 582 features.
Fitting estimator with 581 features.
Fitting estimator with 580 features.
Fitting estimator with 579 features.
F

In [45]:
print("auc_score = "+ str(auc))

auc_score = 0.6328125


In [46]:
confusion

array([[ 8,  4],
       [12,  4]], dtype=int64)

In [47]:
confusion.ravel()

array([ 8,  4, 12,  4], dtype=int64)

In [48]:
confusion

array([[ 8,  4],
       [12,  4]], dtype=int64)

In [49]:
features = np.arange(0, X_train.shape[1], 1, dtype=int)
selected_features = np.array(features)[rfe.get_support()]
selected_features

array([  7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
        20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,
        33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
        46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,
        59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
        72,  73,  74,  75,  76,  77,  78,  80,  81,  82,  83,  84,  85,
        86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,
        99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 127,
       128, 129, 130, 132, 135, 136, 137, 139, 140, 141, 142, 145, 149,
       151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 166, 169,
       172, 173, 174, 176, 178, 181, 187, 188, 189, 190, 191, 195, 201,
       205, 210, 212, 214, 216, 218, 219, 220, 221, 222, 223, 224, 225,
       226, 227, 228, 230, 232, 233, 234, 235, 236, 237, 238, 23

Find number feature to select is the optimal value

In [50]:
def optimal_number_feature_to_select(min_feature, max_feature, sequence=50, savelink = "./output/n_feature_to_select.txt"):
    auc_max = 0
    number_feature_to_select = min_feature
    text_file = open(savelink, "a")

    for i in range(min_feature, max_feature, sequence):
        text = "n_features_to_select = " + str(i) + " \n"
        n = text_file.writelines(text)

        rfe_rf, rf_reduce, auc_score_rf_reduce = RFE_RF(X_train, y_train, X_test, y_test, i)
        text = "auc = " + str(auc_score_rf_reduce) + " \n"
        n = text_file.writelines(text)
        
        if(auc_max < auc_score_rf_reduce):
            auc_max = auc_score_rf_reduce
            number_feature_to_select = i
    
    text_file.close()
    return number_feature_to_select, auc_max

In [51]:
# number_feature_to_select, auc_max = optimal_number_feature_to_select(100, 600)

In [52]:
number_feature_to_select = 390

In [53]:
rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = RFE_RF(X_train, y_train, X_test, y_test, number_feature_to_select)

Fitting estimator with 605 features.
Fitting estimator with 604 features.
Fitting estimator with 603 features.
Fitting estimator with 602 features.
Fitting estimator with 601 features.
Fitting estimator with 600 features.
Fitting estimator with 599 features.
Fitting estimator with 598 features.
Fitting estimator with 597 features.
Fitting estimator with 596 features.
Fitting estimator with 595 features.
Fitting estimator with 594 features.
Fitting estimator with 593 features.
Fitting estimator with 592 features.
Fitting estimator with 591 features.
Fitting estimator with 590 features.
Fitting estimator with 589 features.
Fitting estimator with 588 features.
Fitting estimator with 587 features.
Fitting estimator with 586 features.
Fitting estimator with 585 features.
Fitting estimator with 584 features.
Fitting estimator with 583 features.
Fitting estimator with 582 features.
Fitting estimator with 581 features.
Fitting estimator with 580 features.
Fitting estimator with 579 features.
F

In [54]:
thresholds

array([1.62, 0.62, 0.58, 0.54, 0.52, 0.51, 0.5 , 0.47, 0.45, 0.44, 0.43,
       0.41, 0.4 , 0.39, 0.38, 0.34])

In [55]:
len(thresholds)

16

In [56]:
tpr

array([0.    , 0.0625, 0.125 , 0.25  , 0.25  , 0.25  , 0.3125, 0.4375,
       0.8125, 0.8125, 0.875 , 0.875 , 0.9375, 0.9375, 1.    , 1.    ])

In [57]:
len(tpr)

16

In [58]:
fpr

array([0.        , 0.        , 0.        , 0.08333333, 0.16666667,
       0.33333333, 0.41666667, 0.41666667, 0.41666667, 0.5       ,
       0.5       , 0.75      , 0.75      , 0.83333333, 0.83333333,
       1.        ])

In [59]:
len(fpr)

16

In [60]:
confusion

array([[ 8,  4],
       [12,  4]], dtype=int64)

In [61]:
# n_feature = []
# auc_arr = []
# accuracy_arr = []
# recall_arr = []
# for i in range(50, 550, 50):
#     number_feature_to_select = i
#     rfe, rf, fpr, tpr, thresholds, auc, accuracy, recall,confusion = RFE_RF(X_train, y_train, X_test, y_test, number_feature_to_select)
#     print(i)
#     n_feature.append(i)
#     auc_arr.append(auc)
#     accuracy_arr.append(accuracy)
#     recall_arr.append(recall)

In [62]:
# print(n_feature)

In [63]:
# print(auc_arr)

In [64]:
# print(accuracy_arr)

In [65]:
# print(recall_arr)

In [66]:
# features = np.arange(0, X_train.shape[1], 1, dtype=int)
# selected_features = np.array(features)[rfe.get_support()]
# selected_features