# Tutorial 5: RFE on the best 400 features

---

### Introduction

This notebook shows how to use Recursive Feature Elimination (RFE) to perform further feature selection. We found that most features are very highly correlated; therefore, we need to remove the redundant ones, and RFE is the tool we use for that.

First, let us load the stored data.

In [1]:
# Restore variables from IPython's %store storage (presumably saved by an
# earlier tutorial notebook -- confirm they exist before running this cell).
%store -r df_input_GD
%store -r y_GD
%store -r df_GD

In [2]:
# Encode the class labels as integers: 'S' -> 1, 'B' -> 0.
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running a bare `.map` on labels that are already 1/0 would wipe them out.
# The guard below skips the mapping in that case, keeping the cell idempotent.
label_codes = {'S': 1, 'B': 0}
if not y_GD.isin(list(label_codes.values())).all():
    y_GD = y_GD.map(label_codes)

---

Importing some packages

In [3]:
# Python packages 
import pandas as pd # for importing data into data frame format
import seaborn as sns # For drawing useful graphs, such as bar graphs
import numpy as np
import matplotlib.pyplot as plt

---

<b><i> Data splitting </i></b> 

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set. The split is stratified on the
# class label and uses a fixed seed so it is repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    df_input_GD,
    y_GD,
    test_size=0.3,
    random_state=3,
    stratify=y_GD,
)

In [5]:
# Sanity check: (rows, columns) of the training split, then of the test split.
print(Xtrain.shape, Xtest.shape, sep='\n')

(382, 2074)
(165, 2074)


---

<b><i> Get the best N features </i></b> 

In [6]:
# Restore the feature importances from IPython's %store storage
# (presumably computed and saved by an earlier notebook -- confirm).
%store -r featImp_GD

In [7]:
# Average the stored importances along axis 0 (presumably one row per
# run/fold -- confirm), then rank the positions in ascending order of mean
# importance, so the most important features come last.
arrimp = np.asarray(featImp_GD).mean(axis=0)
sorted_idx = np.argsort(arrimp)
print(sorted_idx[-10:])

[ 15 205 144 153  71 180 157  60  43  86]


In [12]:
# Column labels of the 200 highest-ranked features, kept in ascending order of
# importance (the last label is the most important one).
# NOTE(review): the notebook title says "best 400 features" but 200 are kept
# here -- confirm which number is intended.
cols_ordered = df_input_GD.columns[sorted_idx[-200:]]
cols_ordered

Index([ 837.416, 1365.266, 1399.159,  872.355, 1165.232,  866.235, 1129.691,
         909.06, 1235.179, 2126.859,
       ...
        837.687,  892.475,  874.121,  876.781,  853.123,   884.86,  877.969,
        850.046,  845.334,  857.355],
      dtype='object', length=200)

---

In [13]:
# Preview the training data restricted to the selected columns.
Xtrain.loc[:, cols_ordered]

Unnamed: 0,837.416,1365.266,1399.159,872.355,1165.232,866.235,1129.691,909.060,1235.179,2126.859,...,837.687,892.475,874.121,876.781,853.123,884.860,877.969,850.046,845.334,857.355
228,-1.041473,-1.133095,-0.879713,-1.029319,-1.154610,-1.029357,-1.096580,-1.044207,-1.146433,-0.808282,...,-1.040947,-1.035275,-1.030146,-1.033481,-1.032521,-1.034466,-1.029368,-1.031805,-1.039755,-1.031936
474,1.209387,1.321814,1.210382,1.223085,1.335260,1.224247,1.305281,1.253120,1.340066,1.179647,...,1.210396,1.244742,1.230255,1.231594,1.220015,1.233132,1.230056,1.216652,1.215678,1.224416
273,1.125499,1.110201,1.072964,1.140489,1.149725,1.143608,1.156840,1.143695,1.144796,0.898282,...,1.127660,1.141946,1.144145,1.144791,1.137959,1.144632,1.144275,1.139449,1.133384,1.144581
285,1.020646,0.614146,0.456294,1.014547,0.782600,1.016451,0.896602,0.993650,0.789494,0.362218,...,1.017840,1.000981,1.012044,1.010621,1.020728,1.009804,1.008720,1.020247,1.022598,1.018547
20,-0.799006,-0.336132,0.045062,-0.788769,-0.623451,-0.796137,-0.731589,-0.783391,-0.672839,0.001079,...,-0.797608,-0.788356,-0.792682,-0.789674,-0.799611,-0.786075,-0.790533,-0.802299,-0.794386,-0.800824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,1.242951,1.318137,1.345704,1.253325,1.291301,1.249825,1.278739,1.263841,1.284864,1.430807,...,1.240678,1.262327,1.258593,1.259826,1.246070,1.264388,1.259422,1.246331,1.244710,1.247542
71,-0.982532,-0.803381,-0.311417,-0.965638,-0.970908,-0.971229,-0.970869,-0.960057,-0.998299,-0.170533,...,-0.981073,-0.963380,-0.965990,-0.963353,-0.978024,-0.959486,-0.964945,-0.978210,-0.975422,-0.977062
287,1.091043,1.032935,0.574451,1.094709,1.188758,1.095219,1.169332,1.104186,1.182469,0.344371,...,1.088064,1.103508,1.098137,1.097991,1.090781,1.102332,1.098439,1.096154,1.096031,1.091843
66,-1.292104,-1.619209,-1.706416,-1.277919,-1.487857,-1.281625,-1.351914,-1.286631,-1.470924,-1.705711,...,-1.292153,-1.282100,-1.279279,-1.279909,-1.284921,-1.277541,-1.281256,-1.285772,-1.282308,-1.285889


### RFE

In [14]:
# explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression

In [None]:

def get_models(min_features=2, max_features=50):
    """Build one RFE + logistic-regression pipeline per candidate feature count.

    Parameters
    ----------
    min_features, max_features : int
        Inclusive range of ``n_features_to_select`` values to try. The
        defaults reproduce the original sweep of 2..50.

    Returns
    -------
    dict
        Maps ``str(n)`` to a ``Pipeline`` whose step ``'s'`` is an ``RFE``
        keeping ``n`` features and whose step ``'m'`` is the final
        ``LogisticRegression``. Keys are inserted in ascending order of ``n``.

    Raises
    ------
    ValueError
        If ``min_features`` is below 1 or the range is empty.
    """
    if min_features < 1 or max_features < min_features:
        raise ValueError(
            "expected 1 <= min_features <= max_features, got %r and %r"
            % (min_features, max_features)
        )

    models = {}
    for n_features in range(min_features, max_features + 1):
        # The selector lives inside the pipeline, so it is fitted together with
        # the classifier on whatever data the pipeline is fitted on.
        selector = RFE(
            estimator=LogisticRegression(solver='newton-cg'),
            n_features_to_select=n_features,
            verbose=0,
        )
        classifier = LogisticRegression(solver='newton-cg')
        models[str(n_features)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y, n_splits=10, n_repeats=50, scoring='precision', random_state=1):
    """Score ``model`` on ``(X, y)`` with repeated stratified k-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Object passed to ``cross_val_score`` as the estimator.
    X, y
        Features and target passed straight through to ``cross_val_score``.
    n_splits, n_repeats : int
        Folds per repetition and number of repetitions. The defaults
        (10 x 50) are the original settings; pass smaller values such as
        ``n_splits=3, n_repeats=3`` for a quick trial run.
    scoring : str
        Metric name handed to ``cross_val_score``.
    random_state : int
        Seed for the fold generator, so the folds are repeatable.

    Returns
    -------
    The value returned by ``cross_val_score`` (one score per fit).
    """
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    # error_score='raise' surfaces a failing fit instead of recording a NaN score.
    scores = cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# Build the feature matrix once; it is the same for every model in the sweep.
# NOTE(review): `[-400:]` keeps at most the last 400 labels of cols_ordered.
# The cell that builds cols_ordered keeps only 200 labels, so as written this
# slice selects all of them -- confirm whether 200 or 400 is intended.
X_rfe = Xtrain[cols_ordered[-400:]].values

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X_rfe, Ytrain)
    results.append(scores)
    names.append(name)
    # report mean and standard deviation of the scores for this feature count
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
    


In [None]:
# plot model performance for comparison
fig, ax = plt.subplots(figsize=(15, 7))
ax.boxplot(results, showmeans=True)
# Label the ticks after drawing: the `labels=` keyword of boxplot is deprecated
# since Matplotlib 3.9 (renamed `tick_labels`), while set_xticklabels works on
# both older and newer releases.
ax.set_xticklabels(names)
ax.set_xlabel('Number of features selected by RFE')
ax.set_ylabel('Cross-validated score')
ax.set_title('RFE: score distribution per number of selected features')
plt.show()