# Online Feature selection for all models

### Importing Libraries

In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier # pip install imblearn - scikit wrapper
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from helpers import load_data, roc_plt

### Importing data

In [2]:
# Notebook-wide settings, then the data set itself.

seed = 7  # forwarded to roc_plt in the plotting cells
PICKLE_PATH = '../pickles/'  # prefix used by get_params / save_pkl
DATA_PATH = '../data/'  # location handed to load_data

# Only the 'muchlinski' entry of the loaded data is used; it unpacks into (X, y).
master_data = load_data(DATA_PATH)
X, y = master_data['muchlinski']

In [3]:
def get_params(method):
    """Return the stored hyper-parameters for ``method`` as a plain dict.

    Unpickles ``PICKLE_PATH + "gs_rocauc_<method>_all.pkl"``, drops its
    ``roc_auc`` column and converts the first remaining row into a
    ``{column: value}`` dict.

    NOTE(review): presumably the rows are sorted best-first, so row 0 is the
    best grid-search result -- confirm against the notebook that wrote the file.
    NOTE(review): ``pickle.load`` executes arbitrary code; only use this on
    pickle files produced by this project.
    """
    pkl_name = "gs_rocauc_" + method + "_all.pkl"
    with open(PICKLE_PATH + pkl_name, "rb") as handle:
        results = pickle.load(handle)
    # The score column is not an estimator parameter, so it is removed
    # before the row is turned into keyword arguments.
    first_row = results.drop('roc_auc', axis=1).iloc[0]
    return first_row.to_dict()

def save_pkl(object_, file):
    """Pickle ``object_`` to ``PICKLE_PATH + file + ".pkl"``.

    ``file`` is the base name without extension; the object is written with
    ``pickle.HIGHEST_PROTOCOL``.
    """
    target = PICKLE_PATH + file + ".pkl"
    with open(target, "wb") as out:
        pickle.dump(object_, out, protocol=pickle.HIGHEST_PROTOCOL)

### Creating the models

In [4]:
# Stored hyper-parameters, one dict per model (read from the grid-search pickles).
params_brf, params_bdt, params_knn, params_svm = (
    get_params(name) for name in ("brf", "bdt", "knn", "svm")
)

# Each estimator is built with a fixed random_state where it accepts one and is
# then configured with its stored parameters (set_params returns the estimator).
BRF = BalancedRandomForestClassifier(random_state=0).set_params(**params_brf)
KNN = KNeighborsClassifier().set_params(**params_knn)
SVM = SVC(random_state=0).set_params(**params_svm)
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4. It is kept because the pickled params may use
# `base_estimator__*` keys -- confirm before upgrading scikit-learn.
BDT = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(),
    random_state=0,
).set_params(**params_bdt)

# Scaler used as the first step of every pipeline.
STD = StandardScaler()

### Retrieving the best features using RFECV (Recursive Feature Elimination with Cross-Validation)

In [5]:
# Estimator that ranks the features inside RFECV. Its hyper-parameters are
# hard-coded copies of the best cross-validation result from the other notebook
# (saved in gs_rocauc_brf_all.pkl).
# random_state is fixed, as for the other models in this notebook, so that the
# selected feature subset is reproducible from one run to the next.
clf_feature_sele = BalancedRandomForestClassifier(
    n_estimators=500,
    class_weight={0: 1, 1: 8},
    max_depth=7,
    random_state=0,
)
# Remove one feature per iteration (step=1) and score candidate subsets by
# cross-validated ROC-AUC.
RFE = RFECV(clf_feature_sele, step=1, scoring='roc_auc')

### Creating the pipelines

In [6]:
# Every pipeline is: standardise -> recursive feature elimination -> classifier.
# Pipeline.fit fits its steps in place, so each pipeline gets its own unfitted
# copy of the scaler and of the RFECV selector. Sharing the STD / RFE instances
# would let fitting one pipeline overwrite the fitted state seen by the others.
pipe_BRF = Pipeline([('std', clone(STD)), ('rfe', clone(RFE)), ('clf', BRF)])
pipe_KNN = Pipeline([('std', clone(STD)), ('rfe', clone(RFE)), ('clf', KNN)])
pipe_SVM = Pipeline([('std', clone(STD)), ('rfe', clone(RFE)), ('clf', SVM)])
pipe_BDT = Pipeline([('std', clone(STD)), ('rfe', clone(RFE)), ('clf', BDT)])

### Plotting

In [None]:
# Run roc_plt on the balanced-random-forest pipeline and pickle what it returns.
brf_res = roc_plt(
    X, y, pipe_BRF, "Balanced Random Forest ROC-AUC",
    seed=seed, create_plot=True,
)
save_pkl(brf_res, "brf_res")

In [None]:
# Run roc_plt on the k-nearest-neighbours pipeline and pickle what it returns.
knn_res = roc_plt(
    X, y, pipe_KNN, "KNN ROC-AUC",
    seed=seed, create_plot=True,
)
save_pkl(knn_res, "knn_res")

In [None]:
# Run roc_plt on the SVM pipeline and pickle what it returns.
svm_res = roc_plt(
    X, y, pipe_SVM, "SVM ROC-AUC",
    seed=seed, create_plot=True,
)
save_pkl(svm_res, "svm_res")

In [None]:
# Run roc_plt on the boosted-decision-tree pipeline and pickle what it returns.
bdt_res = roc_plt(
    X, y, pipe_BDT, "Boosted Decision Tree ROC-AUC",
    seed=seed, create_plot=True,
)
save_pkl(bdt_res, "bdt_res")