# Tutorial 4: SFS on all features, (Forward) - (GD)

---

### Introduction

Hello, this notebook shows how to use sequential forward selection (SFS) to perform further feature selection. We found that most features are very highly correlated, therefore we need to remove the redundant ones using the tool mentioned above.

First, let us load the data.

In [1]:
# Restore variables that were saved with `%store` (IPython storemagic) in an earlier session.
# df_input_GD is used below as the feature table and y_GD as the class labels;
# NOTE(review): df_GD is not referenced in the cells visible here — confirm it is still needed.
%store -r df_input_GD
%store -r y_GD
%store -r df_GD

In [2]:
# Encode the class labels as integers: 'S' -> 1, 'B' -> 0.
# The guard keeps the cell idempotent: `map` yields NaN for every value that is
# not a key of the dict, so re-running an unguarded `y_GD = y_GD.map(...)` on the
# already-encoded 0/1 labels would wipe them out.
label_codes = {'S': 1, 'B': 0}
if not y_GD.isin(list(label_codes.values())).all():
    y_GD = y_GD.map(label_codes)

---

Import some packages

In [4]:
# Python packages 
import pandas as pd # for importing data into data frame format
import seaborn as sns # For drawing useful graphs, such as bar graphs
import numpy as np
import matplotlib.pyplot as plt

---

<b><i> Data splitting </i></b> 

In [5]:
from sklearn.model_selection import train_test_split

# Stratified split into train and valid sets: 30 % of the rows are held out,
# class proportions of y_GD are preserved, and the seed is fixed for repeatability.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    df_input_GD,
    y_GD,
    test_size=0.3,
    random_state=3,
    stratify=y_GD,
)



In [6]:
# Show (rows, columns) of the training split, then of the held-out split.
print(Xtrain.shape, Xtest.shape, sep="\n")

(382, 2074)
(165, 2074)


---

<b><i> get the best N features </i></b> 

In [11]:
# Restore featImp_GD, which was saved with `%store` in an earlier session.
# NOTE(review): presumably one row of per-feature importance scores per repetition — confirm.
%store -r featImp_GD

In [12]:
# Average the stored scores over the first axis to get one value per feature.
arrimp = np.mean(np.array(featImp_GD), axis=0)
# Positions of the features ordered from the lowest to the highest mean score.
sorted_idx = np.argsort(arrimp)
# The last ten positions are the ten highest-scoring features.
print(sorted_idx[-10:])

[ 15 205 144 153  71 180 157  60  43  86]


In [13]:
# Column labels rearranged by sorted_idx (an argsort result, so the columns with the
# highest mean score come last).
# Alternative kept for reference: keep only the last 400 columns of that ordering.
# cols_ordered = df_input_GD.columns[sorted_idx][-400:]
cols_ordered = df_input_GD.columns[sorted_idx]

cols_ordered

Index([1346.127, 1376.867, 1366.706, 1338.483, 1367.427, 1361.681, 1363.113,
       1896.591, 1351.037, 1343.337,
       ...
        837.687,  892.475,  874.121,  876.781,  853.123,   884.86,  877.969,
        850.046,  845.334,  857.355],
      dtype='object', length=2074)

---

In [14]:
# Display the training features with the columns arranged as in cols_ordered.
Xtrain[cols_ordered]
# there is no need to use the features ordered according to the permutation importance

Unnamed: 0,1346.127,1376.867,1366.706,1338.483,1367.427,1361.681,1363.113,1896.591,1351.037,1343.337,...,837.687,892.475,874.121,876.781,853.123,884.860,877.969,850.046,845.334,857.355
228,-1.137450,-1.091696,-1.125850,-1.140855,-1.127334,-1.143758,-1.134605,-0.610720,-1.137249,-1.138493,...,-1.040947,-1.035275,-1.030146,-1.033481,-1.032521,-1.034466,-1.029368,-1.031805,-1.039755,-1.031936
474,1.329481,1.298505,1.319337,1.333125,1.318808,1.326645,1.322175,0.980054,1.327789,1.331324,...,1.210396,1.244742,1.230255,1.231594,1.220015,1.233132,1.230056,1.216652,1.215678,1.224416
273,1.123962,1.104067,1.110786,1.128501,1.108957,1.108938,1.111901,0.785342,1.120333,1.125629,...,1.127660,1.141946,1.144145,1.144791,1.137959,1.144632,1.144275,1.139449,1.133384,1.144581
285,0.664962,0.585824,0.615871,0.682213,0.610793,0.616266,0.619647,0.176442,0.651516,0.670787,...,1.017840,1.000981,1.012044,1.010621,1.020728,1.009804,1.008720,1.020247,1.022598,1.018547
20,-0.438144,-0.243844,-0.333293,-0.477309,-0.326036,-0.350457,-0.349409,0.043870,-0.412553,-0.452798,...,-0.797608,-0.788356,-0.792682,-0.789674,-0.799611,-0.786075,-0.790533,-0.802299,-0.794386,-0.800824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,1.308506,1.322335,1.318760,1.304379,1.320076,1.318194,1.317176,1.279693,1.311268,1.306590,...,1.240678,1.262327,1.258593,1.259826,1.246070,1.264388,1.259422,1.246331,1.244710,1.247542
71,-0.882144,-0.728174,-0.802133,-0.908026,-0.796008,-0.813810,-0.814622,0.101504,-0.862861,-0.891858,...,-0.981073,-0.963380,-0.965990,-0.963353,-0.978024,-0.959486,-0.964945,-0.978210,-0.975422,-0.977062
287,1.101002,0.969751,1.032508,1.123590,1.026415,1.041650,1.042573,0.071304,1.085389,1.108471,...,1.088064,1.103508,1.098137,1.097991,1.090781,1.102332,1.098439,1.096154,1.096031,1.091843
66,-1.595788,-1.653157,-1.625352,-1.579746,-1.624533,-1.608110,-1.615963,-1.656002,-1.601925,-1.590498,...,-1.292153,-1.282100,-1.279279,-1.279909,-1.284921,-1.277541,-1.281256,-1.285772,-1.282308,-1.285889


### SFS

In [15]:
# explore the number of selected features for SFS
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression

In [16]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# Forward sequential feature selection, fitted separately for every subset size 1..50.
# Results kept for the following cells:
#   selected - one list per subset size holding the positional indices
#              (into cols_ordered) of the chosen columns, in ascending order
#   imp_ind  - positional indices in the order in which they first appeared
#   imp_freq - the column labels that correspond to imp_ind
# NOTE(review): every subset size refits the selector from scratch, so the run time
# grows quickly with the upper bound of the range.
selected = []
imp_ind = []
imp_freq = []

# The column-reordered training frame is the same for every subset size, so build it once.
X_sfs = Xtrain[cols_ordered]

for n_feat in range(1, 51, 1):
    print(n_feat)
    sfs = SequentialFeatureSelector(estimator=LogisticRegression(solver = 'newton-cg'), n_features_to_select=n_feat, direction = 'forward')
    # fit sfs
    sfs.fit(X_sfs.values, Ytrain)
    # summarize the selected features; a dedicated loop variable is used so the
    # subset-size counter of the outer loop is never shadowed
    selected_feat = [int(col) for col in sfs.get_support(indices=True)]
    for col in selected_feat:
        print('Column: %d, Selected %s,' % (col, sfs.support_[col]))

    selected.append(selected_feat)

    # Features not seen for any smaller subset size. An ordered "not seen" filter is
    # used instead of a symmetric set difference: it never re-adds an index that is
    # already recorded and it gives a deterministic order.
    seen = set(imp_ind)
    result = [col for col in selected_feat if col not in seen]
    imp_freq.extend(X_sfs.columns[col] for col in result)
    imp_ind.extend(result)

    print(selected[-1])
    print(result)
    print(imp_ind)
    print(imp_freq)
    print()
    
    

1
Column: 1479, Selected True,
[1479]
[1479]
[1479]
[1393.893]

2
Column: 1285, Selected True,
Column: 1479, Selected True,
[1285, 1479]
[1285]
[1479, 1285]
[1393.893, 864.501]

3
Column: 116, Selected True,
Column: 1285, Selected True,
Column: 1479, Selected True,
[116, 1285, 1479]
[116]
[1479, 1285, 116]
[1393.893, 864.501, 1357.404]

4
Column: 116, Selected True,
Column: 206, Selected True,
Column: 1285, Selected True,
Column: 1479, Selected True,
[116, 206, 1285, 1479]
[206]
[1479, 1285, 116, 206]
[1393.893, 864.501, 1357.404, 1468.086]

5
Column: 116, Selected True,
Column: 127, Selected True,
Column: 206, Selected True,
Column: 1285, Selected True,
Column: 1479, Selected True,
[116, 127, 206, 1285, 1479]
[127]
[1479, 1285, 116, 206, 127]
[1393.893, 864.501, 1357.404, 1468.086, 1836.148]

6
Column: 116, Selected True,
Column: 127, Selected True,
Column: 206, Selected True,
Column: 410, Selected True,
Column: 1285, Selected True,
Column: 1479, Selected True,
[116, 127, 206, 410, 12

---

### LR

In [19]:
# Base logistic-regression estimator, created with scikit-learn's default arguments.
lr = LogisticRegression()

In [20]:
# Candidate settings for the logistic-regression model, gathered into one dict
# keyed by the estimator's argument names.
solvers = [
    'newton-cg',
    'liblinear',
]
penalty = ['l2']
c_values = [
    1000,
    100,
    10,
    1.0,
    0.1,
    0.01,
    0.001,
]
lr_par = {
    'solver': solvers,
    'penalty': penalty,
    'C': c_values,
}

In [21]:
# One entry per model: [estimator, short name used for the result keys, 14].
# NOTE(review): the 14 is forwarded as the second argument of get_accuracy_ml;
# its meaning is not visible in this notebook — TODO confirm and name it.
models = [[lr, 'lr', 14]]
# Parameter sets, listed in the same order as `models` (the two lists are paired with zip below).
par = [lr_par]

In [22]:
from source.ml_acc import get_accuracy_ml

In [23]:
# Score every model on every feature subset stored in `selected` and collect the results as
#   ml_dicts[<model name>]["<model name>,<number of features>"]
#       -> {'tot_acc': ..., 'jack_train': ..., 'jack_test': ...}
num_ml_tools = len(par)
ml_dicts = {}

# The column-reordered frames do not change inside the loops, so build them once.
Xtrain_ordered = Xtrain[cols_ordered]
Xtest_ordered = Xtest[cols_ordered]

# The loop variable is `model_par`, not `par`: rebinding `par` here would replace the
# list of parameter sets with a single dict, and a re-run of this cell would then
# iterate over that dict's keys instead of the parameter sets.
for m, model_par in zip(models, par):
    key0 = str(m[1])
    ml_dicts[key0] = {}
    for f in selected:
        print(cols_ordered[f])

        xtr = Xtrain_ordered.iloc[:, f]
        xte = Xtest_ordered.iloc[:, f]
        print(xtr)
        # NOTE(review): the active call passes the training data in both the train and the
        # test positions, so the reported scores are not computed on the held-out split.
        # The held-out variant is kept below for reference — confirm which one is intended.
        # results = get_accuracy_ml (m[0], m[2], model_par, np.array(xtr), np.array(Ytrain), np.array(xte), np.array(Ytest)) # to get the accuracies for the ml model
        results = get_accuracy_ml (m[0], m[2], model_par, np.array(xtr), np.array(Ytrain), np.array(xtr), np.array(Ytrain)) # to get the accuracies for the ml model

        key = str(m[1])+","+str(len(f))
        ml_dicts[key0][key] = {
            'tot_acc': results[0],
            'jack_train': results[1],
            'jack_test': results[2],
        }

Index([1393.893], dtype='object')
     1393.893
228 -0.939512
474  1.233328
273  1.085518
285  0.489118
20  -0.003712
..        ...
391  1.341821
71  -0.421523
287  0.680304
66  -1.700077
85  -1.050457

[382 rows x 1 columns]
Index([864.501, 1393.893], dtype='object')
      864.501  1393.893
228 -1.031211 -0.939512
474  1.222282  1.233328
273  1.140483  1.085518
285  1.015922  0.489118
20  -0.793335 -0.003712
..        ...       ...
391  1.252882  1.341821
71  -0.969097 -0.421523
287  1.099539  0.680304
66  -1.280278 -1.700077
85  -1.024458 -1.050457

[382 rows x 2 columns]
Index([1357.404, 864.501, 1393.893], dtype='object')
     1357.404   864.501  1393.893
228 -1.136794 -1.031211 -0.939512
474  1.325959  1.222282  1.233328
273  1.115995  1.140483  1.085518
285  0.634315  1.015922  0.489118
20  -0.379853 -0.793335 -0.003712
..        ...       ...       ...
391  1.315436  1.252882  1.341821
71  -0.837064 -0.969097 -0.421523
287  1.063207  1.099539  0.680304
66  -1.608147 -1.280278 -1

---

In [27]:
import json

In [28]:
# with open('ml_gd_sfs.txt', 'w') as file:
#      file.write(json.dumps(ml_dicts)) # use `json.loads` to do the reverse

---