In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, LogisticRegression,
                                  PassiveAggressiveClassifier, RidgeClassifier)
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (BaggingClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, HistGradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (train_test_split, ShuffleSplit, StratifiedShuffleSplit, cross_val_score, cross_val_predict,
                                     KFold, StratifiedKFold, RandomizedSearchCV, GridSearchCV, learning_curve)
from sklearn.metrics import (mean_squared_error, accuracy_score, balanced_accuracy_score, classification_report, roc_auc_score,
                             confusion_matrix, precision_score, recall_score, f1_score)
import warnings
from sklearn.utils import resample
# Notebook shell escapes: install third-party packages at run time.
# Boruta, sklearn_relief and mrmr are imported inside FS_and_CLF's methods;
# NOTE(review): category_encoders is not referenced in the visible cells — confirm it is needed.
!pip install Boruta
!pip install sklearn_relief
!pip install --upgrade category_encoders
!pip install git+https://github.com/smazzanti/mrmr
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Reset matplotlib's rc settings to its defaults.
plt.rcdefaults()
# Notebook-wide seed; FS_and_CLF uses it when splitting the data.
random_state = 42

In [None]:
class FS_and_CLF:
    """Benchmark feature-selection methods crossed with classifiers.

    The Excel sheet given to the constructor must contain a ``label`` column;
    all other columns are min-max scaled and used as features. ``main`` draws
    one train/test split per sampling type, runs every requested
    feature-selection method with every classifier class, stores one row of
    metrics per combination in ``result_df`` and writes ``result.xlsx`` to the
    current working directory.

    NOTE(review): the number of features is chosen by accuracy on the same
    test split that the reported metrics come from, so the reported figures
    are optimistic.
    """

    # Half-open (start, stop) ranges of candidate feature counts per method.
    # They are clipped to the number of available columns at run time.
    K_BEST_RANGE = (1, 20)
    RRELIEFF_RANGE = (1, 20)
    RFE_RANGE = (1, 2)   # only k=1 is evaluated; widen to search more counts
    MRMR_RANGE = (2, 20)

    # Fraction of the rows drawn (with replacement) for the bootstrap train set.
    BOOTSTRAP_FRACTION = 0.50

    # Classifier class names that the RFE method skips.
    RFE_UNSUPPORTED = frozenset({
        'BaggingClassifier', 'GaussianNB', 'HistGradientBoostingClassifier',
        'KNeighborsClassifier', 'MLPClassifier', 'NearestCentroid',
        'QuadraticDiscriminantAnalysis', 'SVC',
    })

    RESULT_COLUMNS = ['ML', 'FS', 'Sampling', 'AUC', 'ACC', 'SEN', 'SPE', 'PPV', 'NPV']

    def __init__(self, excel_file, ml_algo_list, fs_algo_list, sampling_types=['bootstrap', 'holdout']):
        """Load the data and remember which methods to benchmark.

        Args:
            excel_file: path passed to ``pd.read_excel``; needs a ``label`` column.
            ml_algo_list: classifier classes, instantiated with no arguments.
            fs_algo_list: feature-selection keys understood by ``main``.
            sampling_types: sampling keys understood by ``main``.
        """
        self.data = pd.read_excel(excel_file)
        self.y = self.data.label
        features = self.data.drop(['label'], axis=1)
        # NOTE(review): the scaler is fitted on all rows before any split, so
        # test rows influence the scaling of the training rows.
        self.X = MinMaxScaler().fit_transform(features)
        self.ml_algo_list = ml_algo_list
        self.fs_algo_list = fs_algo_list
        # Copy so the shared default list can never be mutated through the instance.
        self.sampling_types = list(sampling_types)
        self.sampling_type = None
        # Notebook-level seed, kept on the instance so both samplers share it.
        self.random_state = random_state
        self.result_df = pd.DataFrame(columns=self.RESULT_COLUMNS)

    # ------------------------------------------------------------------ sampling

    def bootstrap_sampling(self):
        """Draw one bootstrap train set and use the out-of-bag rows as test set.

        Row indices are sampled with replacement from the scaled features, so
        both samplers hand the selectors data on the same scale, and labels
        come from ``self.y`` rather than from a positional column.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` as numpy arrays.

        Raises:
            ValueError: if there are too few rows to draw a training sample.
        """
        X = np.asarray(self.X)
        y = np.asarray(self.y).ravel()
        n_rows = len(y)
        n_size = int(n_rows * self.BOOTSTRAP_FRACTION)
        if n_size < 1:
            raise ValueError("bootstrap sampling needs at least 2 rows, got %d" % n_rows)
        rng = np.random.RandomState(self.random_state)
        train_idx = rng.randint(0, n_rows, size=n_size)
        # Out-of-bag rows: every index that was never drawn for training.
        test_idx = np.setdiff1d(np.arange(n_rows), train_idx)
        return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

    def holdout_spliting(self):
        """Split the scaled data 80/20 with the instance seed.

        Returns:
            ``(X_train, X_test, y_train, y_test)``; the label parts are 1-D arrays.
        """
        labels = np.asarray(self.y).ravel()
        return tuple(train_test_split(self.X, labels, test_size=0.2,
                                      random_state=self.random_state))

    # ------------------------------------------------------------------- metrics

    def calc_metrics(self, y_true, y_pred):
        """Compute AUC, accuracy, sensitivity, specificity, PPV and NPV.

        The confusion-matrix cells are read positionally as a 2x2 matrix, i.e.
        this assumes a binary problem whose second label is the positive class.
        AUC is computed from the hard predictions that are passed in, not from
        scores. A ratio whose denominator is zero is reported as NaN.

        Returns:
            ``(auc, acc, SEN, SPE, PPV, NPV)``.
        """
        y_true = np.ravel(y_true)
        y_pred = np.ravel(y_pred)
        auc = roc_auc_score(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)
        confusion = confusion_matrix(y_true, y_pred)
        TN, FP = confusion[0, 0], confusion[0, 1]
        FN, TP = confusion[1, 0], confusion[1, 1]

        def ratio(numerator, denominator):
            return numerator / denominator if denominator else float('nan')

        SEN = ratio(TP, TP + FN)
        SPE = ratio(TN, TN + FP)
        PPV = ratio(TP, TP + FP)
        NPV = ratio(TN, TN + FN)
        return auc, acc, SEN, SPE, PPV, NPV

    # ------------------------------------------------------------------- helpers

    @staticmethod
    def _feature_counts(k_range, n_features):
        """Return the candidate feature counts of ``k_range`` clipped to ``n_features``."""
        start, stop = k_range
        counts = range(start, min(stop, n_features + 1))
        if len(counts) == 0:
            raise ValueError(
                "no candidate feature count in [%d, %d) fits %d feature(s)"
                % (start, stop, n_features))
        return counts

    def _record_result(self, ml_name, fs_name, metrics):
        """Append one result row; ``metrics`` is the tuple from ``calc_metrics``."""
        # DataFrame.append no longer exists in pandas 2.x; enlarge via .loc instead.
        self.result_df.loc[len(self.result_df)] = [ml_name, fs_name, self.sampling_type, *metrics]

    def _search_and_record(self, fs_name, make_selector, ml_algos, k_range,
                           X_train, X_test, y_train, y_test):
        """For each classifier, keep the feature count with the best test accuracy.

        ``make_selector(ml_algo, k)`` must return an unfitted selector keeping
        ``k`` features. The search state is reset for every classifier, and the
        recorded metrics come from the very predictions that won the search.
        """
        counts = self._feature_counts(k_range, np.shape(X_train)[1])
        y_test = np.ravel(y_test)
        for ml_algo in ml_algos:
            best_score, best_pred = -np.inf, None
            for k in counts:
                pipeline = Pipeline([('fs', make_selector(ml_algo, k)), ('m', ml_algo())])
                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict(X_test)
                score = accuracy_score(y_test, y_pred)
                # Strict '>' keeps the smallest count among ties.
                if score > best_score:
                    best_score, best_pred = score, y_pred
            self._record_result(ml_algo.__name__, fs_name,
                                self.calc_metrics(y_test, best_pred))

    # ------------------------------------------------------- feature selectors

    def boruta(self, X_train, X_test , y_train , y_test):
        """Select features once with BorutaPy, then evaluate every classifier."""
        from boruta import BorutaPy
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        # 1-D label vectors: a column-shaped y triggers conversion warnings in fit.
        y_train = np.ravel(y_train)
        y_test = np.ravel(y_test)
        rfc = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
        boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
        boruta_selector.fit(X_train, y_train)
        X_new_train = boruta_selector.transform(X_train)
        X_new_test = boruta_selector.transform(X_test)
        for ml_algo in self.ml_algo_list:
            model = ml_algo()
            model.fit(X_new_train, y_train)
            y_pred = model.predict(X_new_test)
            self._record_result(ml_algo.__name__, 'Boruto',
                                self.calc_metrics(y_test, y_pred))

    def k_best(self, X_train, X_test , y_train , y_test):
        """Search SelectKBest(f_classif) feature counts for every classifier."""
        from sklearn.feature_selection import SelectKBest, f_classif
        from sklearn import set_config
        set_config(display="diagram")
        self._search_and_record(
            'k_best',
            lambda _ml_algo, k: SelectKBest(f_classif, k=k),
            self.ml_algo_list, self.K_BEST_RANGE,
            X_train, X_test, y_train, y_test)

    def RReliefF(self, X_train, X_test , y_train , y_test):
        """Search sklearn_relief.RReliefF feature counts for every classifier."""
        import sklearn_relief as sr
        self._search_and_record(
            'RReliefF',
            lambda _ml_algo, k: sr.RReliefF(n_features=k),
            self.ml_algo_list, self.RRELIEFF_RANGE,
            X_train, X_test, y_train, y_test)

    def RFE(self, X_train, X_test , y_train , y_test):
        """Run recursive feature elimination with each classifier as its own ranker.

        Classifiers named in ``RFE_UNSUPPORTED`` are skipped.
        """
        from sklearn.feature_selection import RFE as SklearnRFE
        supported = [ml_algo for ml_algo in self.ml_algo_list
                     if ml_algo.__name__ not in self.RFE_UNSUPPORTED]
        self._search_and_record(
            'RFE',
            lambda ml_algo, k: SklearnRFE(ml_algo(), n_features_to_select=k),
            supported, self.RFE_RANGE,
            X_train, X_test, y_train, y_test)

    def mRmR(self, X_train, X_test , y_train , y_test):
        """Search mRMR feature subsets for every classifier.

        The mRMR ranking does not depend on the classifier, so each candidate
        subset is computed once and reused for all classifiers.
        """
        from mrmr import mrmr_classif
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        y_train = np.ravel(y_train)
        y_test = np.ravel(y_test)
        counts = self._feature_counts(self.MRMR_RANGE, X_train.shape[1])
        X_frame = pd.DataFrame(X_train)
        y_series = pd.Series(y_train)
        # Column labels of X_frame are positional ints, so they index the arrays directly.
        feature_sets = [mrmr_classif(X_frame, y_series, K=k) for k in counts]
        for ml_algo in self.ml_algo_list:
            best_score, best_pred = -np.inf, None
            for selected_features in feature_sets:
                model = ml_algo()
                model.fit(X_train[:, selected_features], y_train)
                y_pred = model.predict(X_test[:, selected_features])
                score = accuracy_score(y_test, y_pred)
                if score > best_score:
                    best_score, best_pred = score, y_pred
            self._record_result(ml_algo.__name__, 'mRmR',
                                self.calc_metrics(y_test, best_pred))

    # ---------------------------------------------------------------- entry point

    def main(self):
        """Run every sampling type x feature selector and write ``result.xlsx``.

        Raises:
            ValueError: if ``sampling_types`` or ``fs_algo_list`` contains a key
                that is not supported; checked before any model is fitted.
        """
        samplers = {
            'bootstrap': self.bootstrap_sampling,
            'holdout': self.holdout_spliting,
        }
        selectors = {
            'boruto': self.boruta,
            'boruta': self.boruta,   # correctly spelled alias of the historical key
            'k_best': self.k_best,
            'RReliefF': self.RReliefF,
            'RFE': self.RFE,
            'mRmR': self.mRmR,
        }
        unknown_sampling = [name for name in self.sampling_types if name not in samplers]
        if unknown_sampling:
            raise ValueError("unknown sampling type(s): %r" % (unknown_sampling,))
        unknown_fs = [name for name in self.fs_algo_list if name not in selectors]
        if unknown_fs:
            raise ValueError("unknown feature-selection method(s): %r" % (unknown_fs,))

        for sampling_type in self.sampling_types:
            self.sampling_type = sampling_type
            X_train, X_test, y_train, y_test = samplers[sampling_type]()
            for fs_algo in self.fs_algo_list:
                selectors[fs_algo](X_train, X_test, y_train, y_test)
        # Path join instead of a hard-coded '\\' so the file lands in the
        # working directory on every operating system.
        self.result_df.to_excel(str(Path().absolute() / 'result.xlsx'), index=False)

In [None]:
# Workbook handed to FS_and_CLF as its `excel_file` argument.
file_path = 'Manual.xlsx'

# Classifier classes (not instances) to benchmark.
ml_algo_list = [
    AdaBoostClassifier, BaggingClassifier, BernoulliNB,
    DecisionTreeClassifier, ExtraTreesClassifier, GaussianNB,
    GradientBoostingClassifier, HistGradientBoostingClassifier, KNeighborsClassifier,
    LinearDiscriminantAnalysis, LogisticRegression, MLPClassifier,
    NearestCentroid, PassiveAggressiveClassifier, QuadraticDiscriminantAnalysis,
    RandomForestClassifier, RidgeClassifier, SVC,
]

# Feature-selection keys that FS_and_CLF.main dispatches on.
fs_algo_list = ['boruto', 'k_best', 'RReliefF', 'RFE', 'mRmR']

# NOTE(review): only 'RFE' is run here; fs_algo_list above is not passed in — confirm this is intended.
clf = FS_and_CLF(excel_file=file_path, ml_algo_list=ml_algo_list, fs_algo_list=['RFE'])
clf.main()