# **Recursive Feature Elimination with Support vector machine**
 ## Table of contents
>1. [Dataset](#Dataset)
>2. [Support Vector Classifier](#support_vector_classifier)
>3. [Recursive feature elemination - Support vector machine](#Recursive_feature_elemination_Support_vector_machine)

## Library

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm

from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn import metrics

## Dataset <a name="Dataset"></a>

In [19]:
X_train = pd.read_csv('../data/dataset/NoQC/X_train.csv').to_numpy()
y_train = pd.read_csv('../data/dataset/NoQC/y_train.csv').to_numpy()
X_test = pd.read_csv('../data/dataset/NoQC/X_test.csv').to_numpy()
y_test = pd.read_csv('../data/dataset/NoQC/y_test.csv').to_numpy()

In [20]:
X_train.shape

(111, 605)

## Support Vector Classifier <a name="support_vector_classifier"></a>

In [21]:
def support_vector_classifier(X_train, y_train, X_test, y_test):
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()
    
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)

    # SVM model
    clf = svm.SVC(kernel="linear",probability=True)
    clf.fit(X_train_scl, y_train)
    
    #predict testing dataset
    y_pred = clf.predict(X_test)
    
    # Predict probability (0, 1) 
    y_score = clf.predict_proba(X_test)
    
    # ROC curve: receiver operating characteristic curve
    # AUC: Area Under the ROC Curve 
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score[:,1], pos_label=2)
    auc = metrics.auc(fpr, tpr)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
    confusion = (tn, fp, fn, tp)
    
    return clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

## Recursive feature elemination - Support vector machine  <a name="Recursive_feature_elemination_Support_vector_machine"></a>

In [22]:
def RFE_SVM(X_train, y_train, X_test, y_test, n_feature):
    
    # Apply scale datasets
    # create model scale ==> standard scaler
    scaler = StandardScaler()
    # fit dataset to model scale
    X_train_scl = scaler.fit_transform(X_train)
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    
    # SVM estimator
    SVM = SVC(kernel="linear", probability=True)

    # SVM model
    rfe = RFE(estimator=SVM, n_features_to_select=n_feature, step=1, verbose=1)
    rfe.fit_transform(X_train_scl, y_train)
    
    # Reduce X to the selected features.
    X_train_reduce = rfe.transform(X_train)
    X_test_reduce = rfe.transform(X_test)
    
    clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = support_vector_classifier(X_train_reduce, y_train, X_test_reduce, y_test)    
    
    return rfe, clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion

In [23]:
rfe, clf, fpr, tpr, thresholds, auc, accuracy, recall, confusion = RFE_SVM(X_train, y_train, X_test, y_test, 170)

Fitting estimator with 605 features.
Fitting estimator with 604 features.
Fitting estimator with 603 features.
Fitting estimator with 602 features.
Fitting estimator with 601 features.
Fitting estimator with 600 features.
Fitting estimator with 599 features.
Fitting estimator with 598 features.
Fitting estimator with 597 features.
Fitting estimator with 596 features.
Fitting estimator with 595 features.
Fitting estimator with 594 features.
Fitting estimator with 593 features.
Fitting estimator with 592 features.
Fitting estimator with 591 features.
Fitting estimator with 590 features.
Fitting estimator with 589 features.
Fitting estimator with 588 features.
Fitting estimator with 587 features.
Fitting estimator with 586 features.
Fitting estimator with 585 features.
Fitting estimator with 584 features.
Fitting estimator with 583 features.
Fitting estimator with 582 features.
Fitting estimator with 581 features.
Fitting estimator with 580 features.
Fitting estimator with 579 features.
F

Fitting estimator with 378 features.
Fitting estimator with 377 features.
Fitting estimator with 376 features.
Fitting estimator with 375 features.
Fitting estimator with 374 features.
Fitting estimator with 373 features.
Fitting estimator with 372 features.
Fitting estimator with 371 features.
Fitting estimator with 370 features.
Fitting estimator with 369 features.
Fitting estimator with 368 features.
Fitting estimator with 367 features.
Fitting estimator with 366 features.
Fitting estimator with 365 features.
Fitting estimator with 364 features.
Fitting estimator with 363 features.
Fitting estimator with 362 features.
Fitting estimator with 361 features.
Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
Fitting estimator with 354 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
F

In [24]:
features = np.arange(0, X_train.shape[1], 1, dtype=int)
selected_features = np.array(features)[rfe.get_support()]
selected_features

array([  7,   8,  15,  18,  23,  24,  26,  30,  34,  35,  38,  41,  44,
        46,  58,  64,  68,  69,  72,  78,  82,  84,  86, 103, 106, 107,
       108, 110, 112, 114, 115, 116, 118, 129, 132, 135, 136, 142, 154,
       155, 156, 157, 158, 162, 168, 169, 172, 173, 176, 186, 187, 189,
       191, 195, 210, 214, 215, 220, 221, 222, 223, 224, 225, 228, 230,
       231, 233, 234, 236, 237, 238, 239, 240, 241, 242, 243, 244, 246,
       247, 248, 249, 250, 251, 255, 262, 263, 273, 279, 280, 297, 302,
       306, 313, 314, 317, 322, 324, 325, 332, 335, 336, 340, 341, 342,
       344, 345, 347, 354, 358, 360, 361, 366, 367, 369, 375, 376, 380,
       388, 393, 395, 401, 406, 407, 408, 410, 418, 419, 421, 429, 430,
       435, 436, 439, 440, 444, 451, 452, 461, 462, 473, 480, 483, 485,
       486, 492, 494, 506, 509, 511, 515, 516, 517, 533, 534, 535, 545,
       549, 550, 551, 553, 555, 557, 564, 565, 567, 574, 575, 582, 590,
       601])

In [25]:
auc

0.359375

In [26]:
tpr

array([0.    , 0.    , 0.125 , 0.125 , 0.25  , 0.25  , 0.375 , 0.375 ,
       0.4375, 0.4375, 0.5625, 0.5625, 0.8125, 0.8125, 1.    ])

In [27]:
fpr

array([0.        , 0.08333333, 0.08333333, 0.25      , 0.25      ,
       0.33333333, 0.33333333, 0.58333333, 0.58333333, 0.83333333,
       0.83333333, 0.91666667, 0.91666667, 1.        , 1.        ])

In [28]:
print(confusion)

(9, 3, 12, 4)


In [29]:
# n_feature = []
# auc_arr = []
# accuracy_arr = []
# recall_arr = []
# for i in range(50, 550, 50):
#     number_feature_to_select = i
#     rfe, clf, fpr, tpr, thresholds, auc, accuracy, recall = RFE_SVM(X_train, y_train, X_test, y_test, number_feature_to_select)
#     print(i)
#     n_feature.append(i)
#     auc_arr.append(auc)
#     accuracy_arr.append(accuracy)
#     recall_arr.append(recall)

In [30]:
# print(n_feature)

In [31]:
# print(auc_arr)

In [32]:
# print(accuracy_arr)

In [33]:
# print(recall_arr)