# **Recursive Feature Elimination with Support vector machine**
 ## Table of contents
>1. [Dataset](#Dataset)
>2. [Support Vector Classifier](#support_vector_classifier)
>3. [Recursive feature elemination - Support vector machine](#Recursive_feature_elemination_Support_vector_machine)

## Library

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm

from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn import metrics

## Dataset <a name="Dataset"></a>

In [14]:
X_train = pd.read_csv('./data/X_train.csv').to_numpy()[1:,1:]
y_train = pd.read_csv('./data/y_train.csv').to_numpy()[1:,1:].astype('int')
X_test = pd.read_csv('./data/X_test.csv').to_numpy()[1:,1:]
y_test = pd.read_csv('./data/y_test.csv').to_numpy()[1:,1:].astype('int')

In [15]:
X_train.shape

(110, 136)

## Support Vector Classifier <a name="support_vector_classifier"></a>

In [16]:
def support_vector_classifier(X_train, y_train, X_test, y_test):
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()
    
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)

    # SVM model
    clf = svm.SVC(kernel="linear",probability=True)
    clf.fit(X_train_scl, y_train)
    
    #predict testing dataset
    y_pred = clf.predict(X_test)
    
    # Predict probability (0, 1) 
    y_score = clf.predict_proba(X_test)
    
    # ROC curve: receiver operating characteristic curve
    # AUC: Area Under the ROC Curve 
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,1], pos_label=2)
    auc = metrics.auc(fpr, tpr)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
    confusion = (tn, fp, fn, tp)
    
    return clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

## Recursive feature elemination - Support vector machine  <a name="Recursive_feature_elemination_Support_vector_machine"></a>

In [17]:
def RFE_SVM(X_train, y_train, X_test, y_test, n_feature):
    
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    
    # SVM estimator
    SVM = SVC(kernel="linear", probability=True)

    # SVM model
    rfe = RFE(estimator=SVM, n_features_to_select=n_feature, step=1, verbose=1)
    rfe.fit_transform(X_train_scl, y_train)
    
    # Reduce X to the selected features.
    X_train_reduce = rfe.transform(X_train)
    X_test_reduce = rfe.transform(X_test)
    
    clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = support_vector_classifier(X_train_reduce, y_train, X_test_reduce, y_test)    
    
    return rfe, clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

In [18]:
rfe, clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = RFE_SVM(X_train, y_train, X_test, y_test, 170)

In [19]:
features = np.arange(0, X_train.shape[1], 1, dtype=int)
selected_features = np.array(features)[rfe.get_support()]
selected_features

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135])

In [20]:
auc

0.4034090909090909

In [21]:
tpr

array([0.    , 0.    , 0.0625, 0.0625, 0.1875, 0.1875, 0.375 , 0.375 ,
       0.5   , 0.5   , 0.75  , 0.75  , 0.9375, 0.9375, 1.    ])

In [22]:
fpr

array([0.        , 0.09090909, 0.09090909, 0.18181818, 0.18181818,
       0.45454545, 0.45454545, 0.54545455, 0.54545455, 0.72727273,
       0.72727273, 0.90909091, 0.90909091, 1.        , 1.        ])

In [23]:
print(confusion)

(1, 10, 2, 14)


In [24]:
# n_feature = []
# auc_arr = []
# accuracy_arr = []
# recall_arr = []
# for i in range(50, 550, 50):
#     number_feature_to_select = i
#     rfe, clf, fpr, tpr, thresholds, auc, accuracy, recall = RFE_SVM(X_train, y_train, X_test, y_test, number_feature_to_select)
#     print(i)
#     n_feature.append(i)
#     auc_arr.append(auc)
#     accuracy_arr.append(accuracy)
#     recall_arr.append(recall)

In [25]:
# print(n_feature)

In [26]:
# print(auc_arr)

In [27]:
# print(accuracy_arr)

In [28]:
# print(recall_arr)