In [None]:
import nibabel as nib
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import glob
import shutil
import seaborn as sns
import sklearn

In [None]:
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import pearsonr

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif,f_classif
from sklearn.model_selection import (KFold, train_test_split, cross_validate, GridSearchCV, RepeatedStratifiedKFold,
                                     cross_val_score, GroupKFold, StratifiedGroupKFold, StratifiedKFold)
from sklearn.metrics import roc_auc_score, recall_score, make_scorer, f1_score, confusion_matrix, roc_curve, accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis                              
from sklearn.tree import DecisionTreeClassifier
from feature_engine.selection import RecursiveFeatureAddition

## Preprocessing Data

In [None]:
# File-system layout: one folder per MRI sequence and SRS time point, each
# pointing at the CSV of extracted features that the cells below read.
# NOTE(review): these are absolute, user-specific paths -- edit `root` and
# `root_ML` before running on another machine.
root = '/Users/cxl037/PycharmProjects/pythonProject1/Example_Data/'
root_ML = '/Users/cxl037/PycharmProjects/DeepLearning'
model_path = os.path.join(root_ML, 'Models')
# makedirs(exist_ok=True) also creates a missing parent folder and does not
# fail if the directory already exists (os.mkdir did neither).
os.makedirs(model_path, exist_ok=True)
PreT1Bravo_path = os.path.join(root, 'PreT1Bravo_Immuno')
# NOTE(review): this is the only sequence reading 'radiomics_features3.csv';
# all others read 'radiomics_features.csv' -- confirm this is intentional.
PreT1Bravo_batchfile = os.path.join(PreT1Bravo_path, 'radiomics_features3.csv')
PreT1Vasc_path = os.path.join(root, 'PreT1Vasc_Immuno')
PreT1Vasc_batchfile = os.path.join(PreT1Vasc_path, 'radiomics_features.csv')
PreT2Flair_path = os.path.join(root, 'PreT2Flair_Immuno')
PreT2Flair_batchfile = os.path.join(PreT2Flair_path, 'radiomics_features.csv')
Post3MoT1Bravo_path = os.path.join(root, 'Post3MoT1Bravo_Immuno')
Post3MoT1Bravo_batchfile = os.path.join(Post3MoT1Bravo_path, 'radiomics_features.csv')
Post3MoT1Vasc_path = os.path.join(root, 'Post3MoT1Vasc_Immuno')
Post3MoT1Vasc_batchfile = os.path.join(Post3MoT1Vasc_path, 'radiomics_features.csv')
Post3MoT2Flair_path = os.path.join(root, 'Post3MoT2Flair_Immuno')
Post3MoT2Flair_batchfile = os.path.join(Post3MoT2Flair_path, 'radiomics_features.csv')

In [None]:
# Load the PreT1Bravo feature table. Every column is prefixed with 'T1_',
# then the mask column is given back its plain name 'Mask' (the merge key).
extractedFeatures = (
    pd.read_csv(PreT1Bravo_batchfile)
    .add_prefix('T1_')
    .rename(columns={'T1_Mask': 'Mask'})
)
# PatientID = first 7 characters of the mask name, stored as str (used for later merging)
extractedFeatures['PatientID'] = (
    extractedFeatures['Mask'].map(lambda mask_name: mask_name[:7]).astype(str)
)
extractedFeatures

In [None]:
# Load the PreT1Vasc feature table with a 'Vasc_' prefix on every column;
# the mask column keeps the shared name 'Mask' so tables can be merged on it.
extractedFeatures2 = (
    pd.read_csv(PreT1Vasc_batchfile)
    .add_prefix('Vasc_')
    .rename(columns={'Vasc_Mask': 'Mask'})
)
extractedFeatures2

In [None]:
# Load the PreT2Flair feature table with a 'T2_' prefix on every column;
# the mask column keeps the shared name 'Mask' so tables can be merged on it.
extractedFeatures3 = (
    pd.read_csv(PreT2Flair_batchfile)
    .add_prefix('T2_')
    .rename(columns={'T2_Mask': 'Mask'})
)
extractedFeatures3

In [None]:
# Left-join the three pre-SRS tables on 'Mask' (T1Bravo rows are the reference)
# and tag every row with label 0 = pre-SRS.
allPreFeatures = (
    extractedFeatures
    .merge(extractedFeatures2, on='Mask', how='left')
    .merge(extractedFeatures3, on='Mask', how='left')
    .assign(**{'SRS Status': 0})
)
allPreFeatures

In [None]:
# Load the Post3MoT1Bravo feature table. Every column is prefixed with 'T1_',
# then the mask column is given back its plain name 'Mask' (the merge key).
extractedFeatures4 = (
    pd.read_csv(Post3MoT1Bravo_batchfile)
    .add_prefix('T1_')
    .rename(columns={'T1_Mask': 'Mask'})
)
# PatientID = first 7 characters of the mask name, stored as str (used for later merging)
extractedFeatures4['PatientID'] = (
    extractedFeatures4['Mask'].map(lambda mask_name: mask_name[:7]).astype(str)
)
extractedFeatures4

In [None]:
# Load the Post3MoT1Vasc feature table with a 'Vasc_' prefix on every column;
# the mask column keeps the shared name 'Mask' so tables can be merged on it.
extractedFeatures5 = (
    pd.read_csv(Post3MoT1Vasc_batchfile)
    .add_prefix('Vasc_')
    .rename(columns={'Vasc_Mask': 'Mask'})
)
extractedFeatures5

In [None]:
# Load the Post3MoT2Flair feature table with a 'T2_' prefix on every column;
# the mask column keeps the shared name 'Mask' so tables can be merged on it.
extractedFeatures6 = (
    pd.read_csv(Post3MoT2Flair_batchfile)
    .add_prefix('T2_')
    .rename(columns={'T2_Mask': 'Mask'})
)
extractedFeatures6

In [None]:
# Left-join the three post-SRS tables on 'Mask' (T1Bravo rows are the reference)
# and tag every row with label 1 = post-SRS.
allPostFeatures = (
    extractedFeatures4
    .merge(extractedFeatures5, on='Mask', how='left')
    .merge(extractedFeatures6, on='Mask', how='left')
    .assign(**{'SRS Status': 1})
)
allPostFeatures

In [None]:
# Stack the pre- and post-SRS tables row-wise under a fresh 0..n-1 index
allFeatures = pd.concat(
    objs=[allPreFeatures, allPostFeatures],
    axis=0,
    ignore_index=True,
)
allFeatures

In [None]:
# Keep only the "<sequence>_original*" feature columns plus the label and the
# patient ID; every other column (e.g. diagnostics) is listed for removal.
remove_cols = [
    feature for feature in allFeatures.columns
    if not feature.startswith(("T2_original", "T1_original", "Vasc_original"))
    and feature not in ('SRS Status', 'PatientID')
]
# Rows with any missing value are discarded, so only lesions with features
# from all sequences remain.
filteredFeatures = allFeatures.drop(columns=remove_cols).dropna()
# Per-row subject ids, in row order (the `groups` argument for RepeatedStratifiedGroupKFold)
allSubjects = filteredFeatures['PatientID'].tolist()
print(len(set(allSubjects)))
print(len(allSubjects))
filteredFeatures

In [None]:
# Split the filtered table into the label vector y and the feature matrix X
y = filteredFeatures['SRS Status']
X = filteredFeatures.drop(columns=['SRS Status', 'PatientID'])
# Column order of X, kept as a plain list for later feature look-ups
feature_names = X.columns.tolist()
X

## Defining Functions and Variables for ML Analysis

In [None]:
class RepeatedStratifiedGroupKFold():
    """Repeated `StratifiedGroupKFold` that rejects repeats with single-class folds.

    Each repeat is a shuffled `StratifiedGroupKFold` seeded with
    ``random_state + <number of attempts so far>``. A repeat is only accepted
    when every train part and every test part of it contains at least two
    classes (a single-class fold breaks ROC AUC computation); otherwise the
    next seed is tried.

    Parameters
    ----------
    n_splits : int
        Number of folds per repeat.
    n_repeats : int
        Number of accepted repeats to generate.
    random_state : int or None
        Base seed. With ``None`` a random base seed is drawn once per call to
        `split` (the splits are then not reproducible).
    max_tries : int
        Maximum number of seeds tried for a single repeat before giving up.
    """

    def __init__(self, n_splits=5, n_repeats=10, random_state=None, max_tries=1000):
        self.random_state = random_state
        self.n_repeats = n_repeats
        self.n_splits = n_splits
        self.max_tries = max_tries

    @staticmethod
    def _has_both_classes(labels):
        """Return True when `labels` holds at least two distinct values."""
        return np.unique(labels).size > 1

    def split(self, X, y, groups=None):
        """Yield ``(train_index, test_index)`` for ``n_repeats * n_splits`` folds.

        Raises
        ------
        ValueError
            If no valid split is found for a repeat within `max_tries` seeds.
        """
        labels = np.asarray(y).ravel()

        # `None + tries` used to raise TypeError; fall back to a random base seed.
        base_seed = self.random_state
        if base_seed is None:
            base_seed = int(np.random.randint(np.iinfo(np.int32).max))

        tries = 0  # cumulative, so every repeat starts from a fresh seed
        for _ in range(self.n_repeats):
            folds = None
            for _attempt in range(self.max_tries):
                cv = StratifiedGroupKFold(n_splits=self.n_splits, shuffle=True,
                                          random_state=base_seed + tries)
                tries += 1
                # Materialize once: the same folds are validated and then yielded.
                candidate = list(cv.split(X, labels, groups))
                if all(self._has_both_classes(labels[train_index])
                       and self._has_both_classes(labels[test_index])
                       for train_index, test_index in candidate):
                    folds = candidate
                    break
            if folds is None:
                raise ValueError(
                    "No split with both classes in every train/test fold was found "
                    "after {} attempts; reduce n_splits or check the labels/groups."
                    .format(self.max_tries))

            for train_index, test_index in folds:
                yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the total number of folds, ``n_splits * n_repeats``."""
        return self.n_splits * self.n_repeats

In [None]:
# Example RepeatedStratifiedGroupKFold split
print("ORIGINAL POSITIVE RATIO:", y.mean())
train_positive = []
test_positive = []
for fold, (train_idxs, test_idxs) in enumerate(outercv.split(X, y, allSubjects)):
    print("Repeat :", fold // N_REPEATS_O + 1)
    print("Fold :", fold % N_REPEATS_O + 1)
    print("train length:", len(train_idxs))
    print("test length:", len(test_idxs))
    train_positive.append(y.iloc[train_idxs].mean())
    test_positive.append(y.iloc[test_idxs].mean())
    print("TRAIN POSITIVE RATIO:", y.iloc[train_idxs].mean())
    print("TEST POSITIVE RATIO :", y.iloc[test_idxs].mean())
    print("TRAIN GROUPS        :", [allSubjects[i] for i in train_idxs])
    print("TEST GROUPS         :", [allSubjects[i] for i in test_idxs])
print("ALL TRAIN POSITIVE RATIO:", np.mean(train_positive))
print("STD TRAIN POSITIVE RATIO:", np.std(train_positive))
print("ALL TEST POSITIVE RATIO :", np.mean(test_positive))
print("STD TEST POSITIVE RATIO:", np.std(test_positive))

In [None]:
# global constants

N_SPLITS_I = 5
N_REPEATS_I = 5
N_SPLITS_O = 5
N_REPEATS_O = 5

innercv = RepeatedStratifiedGroupKFold(n_splits=N_SPLITS_I, n_repeats=N_REPEATS_I, random_state=30)
outercv = RepeatedStratifiedGroupKFold(n_splits=N_SPLITS_O, n_repeats=N_REPEATS_O, random_state=5)

specificity = make_scorer(recall_score, pos_label=0)
sensitivity = make_scorer(recall_score, pos_label=1)

scoring = {'specificity': specificity,
           'sensitivity': sensitivity,
           'roc_auc': 'roc_auc'
          }

## ML Analysis

In [None]:
def Lasso(X_train, y_train): 
    """Fit and return a StandardScaler -> L1 logistic regression pipeline.

    The pipeline steps are named 'scalar' and 'clf'; the classifier uses the
    liblinear solver with max_iter=20000. No hyper-parameter search is done.
    """
    steps = [
        ('scalar', StandardScaler()),
        ('clf', LogisticRegression(penalty='l1', solver='liblinear', max_iter=20000)),
    ]
    # Pipeline.fit returns the fitted pipeline itself
    return Pipeline(steps).fit(X_train, y_train)

In [None]:
def Ridge(X_train, y_train, train_subjects):
    """Tune and evaluate a standardized, class-balanced L2 logistic regression.

    Grid-searches C and tol by ROC AUC over `innercv` (grouped by
    `train_subjects`), prints the winning pipeline, then re-scores it with
    `cross_validate` on the same data and splitter and prints mean/std of the
    train AUC, validation AUC, sensitivity and specificity.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    estimator = Pipeline(steps=[
        ('scalar', StandardScaler()),
        ('clf', LogisticRegression(penalty='l2', solver='lbfgs', class_weight='balanced', max_iter=10000)),
    ])
    search_space = {
        'clf__C': [0.0001, 0.001, 0.01, .1, .5, 1.0, 5, 10, 50, 100],
        'clf__tol': [0.0001, 0.0005, 0.001, 0.005, 0.01],
    }

    search = GridSearchCV(estimator, search_space, cv=innercv, scoring='roc_auc', verbose=0)
    search.fit(X_train, y_train, groups=train_subjects)

    tuned_pipe = search.best_estimator_
    print('Best estimator: ', tuned_pipe)

    cv_results = cross_validate(tuned_pipe, X_train, y_train, groups=train_subjects, cv=innercv,
                                scoring=scoring, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')

    report = (
        ('Train AUC:', 'train_roc_auc'),
        ('Cross Val AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report:
        values = cv_results[key]
        print(label, 'mean-', np.mean(values), 'std dev-', np.std(values))

    return search.best_params_, search.best_estimator_, cv_results['test_roc_auc'], cv_results['train_roc_auc']

In [None]:
def Elastic(X_train, y_train, train_subjects): 
    """Tune and evaluate a standardized elastic-net logistic regression.

    Grid-searches C and l1_ratio by ROC AUC over `innercv` (grouped by
    `train_subjects`), prints the winning pipeline, then re-scores it with
    `cross_validate` on the same data and splitter and prints mean/std of the
    train AUC, validation AUC, sensitivity and specificity.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    elastic_grid_pipe = Pipeline([
        ('scalar', StandardScaler()),
        ('clf', LogisticRegression(penalty = 'elasticnet', solver = 'saga', max_iter=10000))])

    parameters = {'clf__C': [0.01, 0.1, 0.5, 1.0],
                  'clf__l1_ratio': [.1, .5, .7, .9, .95, .99]}

    elastic_grid = GridSearchCV(elastic_grid_pipe, parameters, cv=innercv, scoring = 'roc_auc', verbose=0, n_jobs=-1)
    elastic_grid.fit(X_train, y_train, groups=train_subjects)

    print('Best estimator: ', elastic_grid.best_estimator_)
    elastic_pipe = elastic_grid.best_estimator_

    scores = cross_validate(elastic_pipe, X_train, y_train, groups=train_subjects, cv=innercv, scoring=scoring, n_jobs=-1, return_train_score=True)
    
    print('Cross-Validation Evaluation Scores')

    print('Train AUC:', 'mean-',np.mean(scores['train_roc_auc']), 'std dev-', np.std(scores['train_roc_auc']))
    print('Test AUC:', 'mean-',np.mean(scores['test_roc_auc']), 'std dev-', np.std(scores['test_roc_auc']))
    print('Sensitivity:', 'mean-',np.mean(scores['test_sensitivity']), 'std dev-', np.std(scores['test_sensitivity']))
    # was 'xmean-' (typo) -- now matches the other report lines
    print('Specificity:', 'mean-',np.mean(scores['test_specificity']), 'std dev-', np.std(scores['test_specificity']))
    return elastic_grid.best_params_, elastic_grid.best_estimator_, scores['test_roc_auc'], scores['train_roc_auc']

In [None]:
def SVC_linear(X_train, y_train, train_subjects):
    """Tune and evaluate a standardized linear SVM (LinearSVC, primal form).

    Grid-searches C and tol by ROC AUC over `innercv` (grouped by
    `train_subjects`), prints the winning pipeline, then re-scores it with
    `cross_validate` on the same data and splitter and prints mean/std of the
    train AUC, validation AUC, sensitivity and specificity.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    estimator = Pipeline(steps=[
        ('scalar', StandardScaler()),
        ('clf', LinearSVC(dual=False, max_iter=20000)),
    ])
    search_space = {
        'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 10, 50],
        'clf__tol': [0.0001, 0.0005, 0.001, 0.005, 0.01],
    }

    search = GridSearchCV(estimator, param_grid=search_space, cv=innercv, verbose=0, n_jobs=-1, scoring='roc_auc')
    search.fit(X_train, y_train, groups=train_subjects)

    tuned_pipe = search.best_estimator_
    print('Best estimator: ', tuned_pipe)

    cv_results = cross_validate(tuned_pipe, X_train, y_train, groups=train_subjects, scoring=scoring,
                                cv=innercv, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')

    report = (
        ('Train AUC:', 'train_roc_auc'),
        ('Cross Val AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report:
        values = cv_results[key]
        print(label, 'mean-', np.mean(values), 'std dev-', np.std(values))

    return search.best_params_, search.best_estimator_, cv_results['test_roc_auc'], cv_results['train_roc_auc']

In [None]:
def SVC_poly(X_train, y_train, train_subjects):
    """Tune and evaluate a standardized polynomial-kernel SVM.

    Grid-searches C and gamma by ROC AUC over `innercv` (grouped by
    `train_subjects`), prints the winning pipeline, then re-scores it with
    `cross_validate` on the same data and splitter and prints mean/std of the
    train AUC, validation AUC, sensitivity and specificity.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    estimator = Pipeline(steps=[
        ('scalar', StandardScaler()),
        ('clf', SVC(kernel='poly', max_iter=20000)),
    ])
    search_space = {
        'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 10],
        'clf__gamma': [0.001, 0.01, 0.1, 1, 10],
    }

    search = GridSearchCV(estimator, param_grid=search_space, cv=innercv, verbose=0, n_jobs=-1, scoring='roc_auc')
    search.fit(X_train, y_train, groups=train_subjects)

    tuned_pipe = search.best_estimator_
    print('Best estimator: ', tuned_pipe)

    cv_results = cross_validate(tuned_pipe, X_train, y_train, groups=train_subjects, scoring=scoring,
                                cv=innercv, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')

    report = (
        ('Train AUC:', 'train_roc_auc'),
        ('Cross Val AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report:
        values = cv_results[key]
        print(label, 'mean-', np.mean(values), 'std dev-', np.std(values))

    return search.best_params_, search.best_estimator_, cv_results['test_roc_auc'], cv_results['train_roc_auc']

In [None]:
def SVC_rbf(X_train, y_train, train_subjects):
    """Tune and evaluate a standardized RBF-kernel SVM.

    Grid-searches C and gamma by ROC AUC over `innercv` (grouped by
    `train_subjects`), prints the winning pipeline, then re-scores it with
    `cross_validate` on the same data and splitter and prints mean/std of the
    train AUC, validation AUC, sensitivity and specificity.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    estimator = Pipeline(steps=[
        ('scalar', StandardScaler()),
        ('clf', SVC(kernel='rbf', max_iter=20000)),
    ])
    search_space = {
        'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 10, 50],
        'clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    }

    search = GridSearchCV(estimator, param_grid=search_space, cv=innercv, verbose=0, n_jobs=-1, scoring='roc_auc')
    search.fit(X_train, y_train, groups=train_subjects)

    tuned_pipe = search.best_estimator_
    print('Best estimator: ', tuned_pipe)

    cv_results = cross_validate(tuned_pipe, X_train, y_train, groups=train_subjects, scoring=scoring,
                                cv=innercv, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')

    report = (
        ('Train AUC:', 'train_roc_auc'),
        ('Cross Val AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report:
        values = cv_results[key]
        print(label, 'mean-', np.mean(values), 'std dev-', np.std(values))

    return search.best_params_, search.best_estimator_, cv_results['test_roc_auc'], cv_results['train_roc_auc']

In [None]:
def KNN(X_train, y_train, train_subjects):
    """Tune and evaluate a standardized k-nearest-neighbours classifier.

    Grid-searches n_neighbors and weights by ROC AUC over `innercv` (grouped
    by `train_subjects`), prints the winning pipeline, then re-scores it with
    `cross_validate` on the same data and splitter and prints mean/std of the
    train AUC, validation AUC, sensitivity and specificity.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    estimator = Pipeline(steps=[
        ('scalar', StandardScaler()),
        ('clf', KNeighborsClassifier()),
    ])
    search_space = {
        'clf__n_neighbors': [1, 2, 4, 8, 16, 32, 64],
        'clf__weights': ['uniform', 'distance'],
    }

    search = GridSearchCV(estimator, param_grid=search_space, cv=innercv, verbose=0, n_jobs=-1, scoring='roc_auc')
    search.fit(X_train, y_train, groups=train_subjects)

    tuned_pipe = search.best_estimator_
    print('Best estimator: ', tuned_pipe)

    cv_results = cross_validate(tuned_pipe, X_train, y_train, groups=train_subjects, scoring=scoring,
                                cv=innercv, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')

    report = (
        ('Train AUC:', 'train_roc_auc'),
        ('Cross Val AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report:
        values = cv_results[key]
        print(label, 'mean-', np.mean(values), 'std dev-', np.std(values))

    return search.best_params_, search.best_estimator_, cv_results['test_roc_auc'], cv_results['train_roc_auc']

In [None]:
def random_forest(X_train, y_train, train_subjects):
    """Tune and evaluate a random forest classifier (no feature scaling).

    Grid-searches n_estimators, max_features and max_depth by ROC AUC over
    `innercv` (grouped by `train_subjects`), prints the winning pipeline, then
    re-scores it with `cross_validate` on the same data and splitter and
    prints mean/std of the train AUC, validation AUC, sensitivity and
    specificity.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    rf_grid_pipe = Pipeline([('clf', RandomForestClassifier())])
 
    param_grid = {
        'clf__n_estimators': [20,30, 50, 100,200,500,600,700],
        'clf__max_features': ['sqrt', 'log2'],
        'clf__max_depth': [1,3,5,10, 15, 20]}

    grid_rfc = GridSearchCV(rf_grid_pipe, param_grid = param_grid, cv = innercv, scoring= 'roc_auc', verbose = 3)
    grid_rfc.fit(X_train, y_train, groups=train_subjects)
    print('Best estimator: ', grid_rfc.best_estimator_)

    rfc_pipe = grid_rfc.best_estimator_

    # return_train_score=True is required: the function returns
    # rfc_scores['train_roc_auc'], which does not exist otherwise (KeyError).
    rfc_scores = cross_validate(rfc_pipe, X_train, y_train, groups=train_subjects, scoring = scoring, cv=innercv,
                                n_jobs=-1, return_train_score=True)
    
    print('Cross-Validation Evaluation Scores')

    print('Train AUC:', 'mean-',np.mean(rfc_scores['train_roc_auc']), 'std dev-', np.std(rfc_scores['train_roc_auc']))
    print('AUC:', 'mean-',np.mean(rfc_scores['test_roc_auc']), 'std dev-', np.std(rfc_scores['test_roc_auc']))
    print('Sensitivity:', 'mean-',np.mean(rfc_scores['test_sensitivity']), 'std dev-', np.std(rfc_scores['test_sensitivity']))
    print('Specificity:', 'mean-',np.mean(rfc_scores['test_specificity']), 'std dev-', np.std(rfc_scores['test_specificity']))
    return grid_rfc.best_params_,grid_rfc.best_estimator_, rfc_scores['test_roc_auc'], rfc_scores['train_roc_auc']

In [None]:
def XGB_classifier(X_train, y_train, train_subjects):
    """Fit and evaluate an XGBoost classifier with a fixed hyper-parameter set.

    The grid holds a single value per parameter, so the grid search only
    scores that one configuration by ROC AUC over `innercv` (grouped by
    `train_subjects`) and refits it. The refit pipeline is printed, re-scored
    with `cross_validate` on the same data and splitter, and mean/std of the
    train AUC, validation AUC, sensitivity and specificity are printed.

    Returns (best_params, best_estimator, validation AUC per split,
    train AUC per split).
    """
    estimator = Pipeline(steps=[('clf', XGBClassifier())])

    search_space = {
        'clf__objective': ['binary:logistic'],
        'clf__use_label_encoder': [False],
        'clf__random_state': [5],
        'clf__max_depth': [3],
        'clf__gamma': [3],
        'clf__min_child_weight': [4],
        'clf__reg_lambda': [5],
        'clf__colsample_bytree': [1],
    }

    search = GridSearchCV(estimator, param_grid=search_space, cv=innercv, verbose=0, n_jobs=-1, scoring='roc_auc')
    search.fit(X_train, y_train, groups=train_subjects)

    tuned_pipe = search.best_estimator_
    print('Best estimator: ', tuned_pipe)

    cv_results = cross_validate(tuned_pipe, X_train, y_train, groups=train_subjects, scoring=scoring,
                                cv=innercv, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')

    # (label, result key, mean tag, std tag) -- the tags differ between lines on purpose
    report = (
        ('Train AUC:', 'train_roc_auc', 'mean-', 'std dev-'),
        ('AUC:', 'test_roc_auc', 'mean:', 'std dev:'),
        ('Sensitivity:', 'test_sensitivity', 'mean:', 'std dev:'),
        ('Specificity:', 'test_specificity', 'mean:', 'std dev:'),
    )
    for label, key, mean_tag, std_tag in report:
        values = cv_results[key]
        print(label, mean_tag, np.mean(values), std_tag, np.std(values))
    return search.best_params_, search.best_estimator_, cv_results['test_roc_auc'], cv_results['train_roc_auc']

In [None]:
def nestedCV(Model, X_outer, y_outer, outer_subjects):
    """Nested cross-validation of one model-building function.

    For every outer split of `outercv`, `Model(X_train, y_train,
    train_subjects)` tunes and fits an estimator on the training fold (it must
    return ``(best_params, best_estimator, cv_scores, train_scores)``), and
    the fitted estimator is scored by ROC AUC on the held-out fold. Per-repeat
    and overall averages are printed; nothing is returned.

    Parameters
    ----------
    Model : callable
        One of the model functions of this notebook (e.g. `Ridge`, `KNN`).
    X_outer : pandas.DataFrame
        Feature matrix (indexed positionally with `.iloc`).
    y_outer : pandas.Series
        Labels aligned with `X_outer`.
    outer_subjects : sequence
        Group id per row, aligned with `X_outer`.
    """
    def _ranking_scores(fitted_model, features):
        # ROC AUC needs continuous scores; hard labels collapse the curve to a
        # single operating point. Same preference order as the 'roc_auc' scorer
        # used in the inner CV: decision_function, then P(class 1).
        if hasattr(fitted_model, 'decision_function'):
            return fitted_model.decision_function(features)
        if hasattr(fitted_model, 'predict_proba'):
            return fitted_model.predict_proba(features)[:, 1]
        return fitted_model.predict(features)

    allTestScores = []
    allTrainScores = []
    allCVScores = []
    
    for fold, (train_idxs, test_idxs) in enumerate(outercv.split(X_outer, y_outer, outer_subjects)):
        # Split into train and test
        print("\nREPEAT: ", fold // N_SPLITS_O + 1)
        print("FOLD: ", fold % N_SPLITS_O + 1)
        X_train = X_outer.iloc[train_idxs]
        y_train = y_outer.iloc[train_idxs]
        X_test = X_outer.iloc[test_idxs]
        y_test = y_outer.iloc[test_idxs]
        train_subjects = [outer_subjects[i] for i in train_idxs]
        
        # classification
        print("\nMODEL FITTING")
        model_best_param, best_model, model_cv_scores, model_train_scores = Model(X_train, y_train, train_subjects)
        allTrainScores.append(np.mean(model_train_scores))
        allCVScores.append(np.mean(model_cv_scores))
        
        # prediction
        print("\nINFERENCE")
        auc_score = roc_auc_score(y_test, _ranking_scores(best_model, X_test))
        allTestScores.append(auc_score)
        print("\nTEST SCORE: ", auc_score)
    
    # Calculate averages for each repeat (every N_SPLITS_O consecutive folds form one repeat)
    sumRepeat = 0
    for i in range(len(allTestScores)):
        sumRepeat += allTestScores[i]
        if (i + 1) % N_SPLITS_O == 0:
            print("\nAVERAGE TEST SCORE FOR REPEAT {}: {} ".format(i // N_SPLITS_O + 1, sumRepeat/N_SPLITS_O))
            sumRepeat = 0
    
    print("\nAVERAGE TRAIN SCORE ACROSS ALL FOLDS: ", np.mean(allTrainScores))
    print("\nAVERAGE BEST CV SCORE ACROSS ALL FOLDS: ", np.mean(allCVScores))
    print("AVERAGE TEST SCORE ACROSS ALL FOLDS: ", np.mean(allTestScores))
    print("STD TEST SCORE ACROSS ALL FOLDS: ", np.std(allTestScores))

In [None]:
def nestedCV_selection(Model, X_outer, y_outer, outer_subjects, n_top_features=20):
    """Nested cross-validation with Lasso-based feature selection in each outer fold.

    For every outer split of `outercv`:
      1. an L1 logistic regression (`Lasso`) is fit on the training fold and
         its inner-CV ROC AUC on all features is recorded ("pre-FS" score);
      2. the `n_top_features` features with the largest non-zero absolute
         coefficients are kept;
      3. `Model(X_train_sel, y_train, train_subjects)` tunes and fits an
         estimator on the selected features (it must return ``(best_params,
         best_estimator, cv_scores, train_scores)``);
      4. the estimator is scored by ROC AUC on the held-out fold.
    Per-repeat and overall averages are printed; nothing is returned.

    Parameters
    ----------
    Model : callable
        One of the model functions of this notebook (e.g. `Ridge`, `KNN`).
    X_outer : pandas.DataFrame
        Feature matrix (indexed positionally with `.iloc`).
    y_outer : pandas.Series
        Labels aligned with `X_outer`.
    outer_subjects : sequence
        Group id per row, aligned with `X_outer`.
    n_top_features : int
        Maximum number of features kept per fold.

    Raises
    ------
    ValueError
        If the Lasso zeroes out every coefficient in some fold.
    """
    def _ranking_scores(fitted_model, features):
        # ROC AUC needs continuous scores; hard labels collapse the curve to a
        # single operating point. Same preference order as the 'roc_auc' scorer
        # used in the inner CV: decision_function, then P(class 1).
        if hasattr(fitted_model, 'decision_function'):
            return fitted_model.decision_function(features)
        if hasattr(fitted_model, 'predict_proba'):
            return fitted_model.predict_proba(features)[:, 1]
        return fitted_model.predict(features)

    allTestScores = []
    allTrainScores = []
    allCVScores = []
    allPreFSScores = []
    
    for fold, (train_idxs, test_idxs) in enumerate(outercv.split(X_outer, y_outer, outer_subjects)):
        # Split into train and test
        print("\nREPEAT: ", fold // N_SPLITS_O + 1)
        print("FOLD: ", fold % N_SPLITS_O + 1)
        X_train = X_outer.iloc[train_idxs]
        y_train = y_outer.iloc[train_idxs]
        X_test = X_outer.iloc[test_idxs]
        y_test = y_outer.iloc[test_idxs]
        train_subjects = [outer_subjects[i] for i in train_idxs]
        
        # feature selection (select the top `n_top_features` features)
        print("FEATURE SELECTION\n")
        # Lasso() takes (X, y) and returns one fitted pipeline; the old call
        # passed three arguments and unpacked three values (TypeError).
        lasso_best_model = Lasso(X_train, y_train)
        # Pre-selection baseline: inner-CV AUC of the Lasso on all features.
        # cross_val_score clones the estimator, so the fitted model above is untouched.
        lasso_scores = cross_val_score(lasso_best_model, X_train, y_train, groups=train_subjects,
                                       cv=innercv, scoring='roc_auc')
        allPreFSScores.append(np.mean(lasso_scores))

        coefficients = lasso_best_model.named_steps['clf'].coef_
        importance = np.abs(coefficients).reshape(-1)
        # Rank by position, so tied importances map to distinct features and
        # the most important feature is included (the old [-21:-1] slice dropped it).
        ranked = np.argsort(importance)[::-1]
        top_positions = [pos for pos in ranked if importance[pos] > 0][:n_top_features]
        if not top_positions:
            raise ValueError("Lasso selected no features in outer fold {}".format(fold))
        # Names come from the fold's own columns, not from a notebook-level list
        selected = [X_train.columns[pos] for pos in top_positions]
        X_train_sel = X_train[selected]
        X_test_sel = X_test[selected]
        print(selected)
        print(len(selected))

        # classification
        print("MODEL FITTING AND INFERENCE\n")
        model_best_param, best_model, model_cv_scores, model_train_scores = Model(X_train_sel, y_train, train_subjects)
        allTrainScores.append(np.mean(model_train_scores))
        allCVScores.append(np.mean(model_cv_scores))
        
        # prediction
        auc_score = roc_auc_score(y_test, _ranking_scores(best_model, X_test_sel))
        allTestScores.append(auc_score)
        print("\nTEST SCORE: ", auc_score)
        
    # Calculate averages for each repeat (every N_SPLITS_O consecutive folds form one repeat)
    sumRepeat = 0
    for i in range(len(allTestScores)):
        sumRepeat += allTestScores[i]
        if (i + 1) % N_SPLITS_O == 0:
            print("\nAVERAGE TEST SCORE FOR REPEAT {}: {} ".format(i // N_SPLITS_O + 1, sumRepeat/N_SPLITS_O))
            sumRepeat = 0

    print("\nAVERAGE PRE-FS SCORE ACROSS ALL FOLDS: ", np.mean(allPreFSScores))
    print("\nAVERAGE TRAIN SCORE ACROSS ALL FOLDS: ", np.mean(allTrainScores))
    print("\nAVERAGE CV SCORE ACROSS ALL FOLDS: ", np.mean(allCVScores))
    print("AVERAGE TEST SCORE ACROSS ALL FOLDS: ", np.mean(allTestScores))
    print("STD TEST SCORE ACROSS ALL FOLDS: ", np.std(allTestScores))
        

## Deep Learning

In [None]:
import tensorflow
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import Adadelta, Adam
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.models import Model
from keras.metrics import AUC
from keras import regularizers
from keras.callbacks import EarlyStopping,CSVLogger
from scikeras.wrappers import KerasClassifier, KerasRegressor

In [None]:
def build_clf(input_dim, layer1, layer2, layer3, dropout, learning_rate):
    """Build and compile a three-hidden-layer binary classifier.

    Architecture: Input(input_dim) -> 3 x [Dense(relu) -> Dropout ->
    BatchNormalization] -> Dense(1, sigmoid). Compiled with binary
    cross-entropy, the Adam optimizer and an AUC metric named 'auc'.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    layer1, layer2, layer3 : int
        Units of the first, second and third hidden layer.
    dropout : float
        Dropout rate applied after every hidden layer.
    learning_rate : float
        Adam learning rate.

    Returns
    -------
    The compiled Keras model.
    """
    # creating the layers of the NN
    input_img = Input(shape=(input_dim,))
    deep = input_img
    for units in (layer1, layer2, layer3):
        deep = Dense(units=units, activation='relu')(deep)
        deep = Dropout(dropout)(deep)
        deep = BatchNormalization()(deep)
    outlayer = Dense(1, activation='sigmoid')(deep)
    model = Model(input_img, outlayer)
    # The metric name is pinned: an unnamed AUC() is auto-numbered ('auc_1', ...)
    # when several models are built in one session, and callbacks that monitor
    # 'val_auc' would then no longer find their metric.
    model.compile(loss=['binary_crossentropy'], metrics=[AUC(name='auc')],
                  optimizer=Adam(learning_rate=learning_rate))
    return model

In [None]:
def MLP(X_train, y_train, train_subjects, fold):
    """Train the MLP on Lasso-selected features and return its best checkpoint.

    Only the first split of `innercv` is used: its train part fits the Lasso
    and the network, its held-out part is the validation set that drives
    checkpointing (best 'val_auc').

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features (indexed positionally with `.iloc`).
    y_train : pandas.Series
        Labels aligned with `X_train`.
    train_subjects : sequence
        Group id per row, passed to `innercv.split`.
    fold : int or str
        Tag used in the checkpoint file name under `model_path`.

    Returns
    -------
    (best_model, selected) : the Keras model rebuilt via `build_clf` with the
    best checkpointed weights loaded, and the array of selected feature names.

    Raises
    ------
    ValueError
        If the Lasso zeroes out every coefficient (no feature to train on).
    """
    # First inner split only
    train_idxs, test_idxs = next(iter(innercv.split(X_train, y_train, train_subjects)))
    X_train_inner = X_train.iloc[train_idxs]
    y_train_inner = y_train.iloc[train_idxs]
    X_eval = X_train.iloc[test_idxs]
    y_eval = y_train.iloc[test_idxs]
    
    print("GETTING LASSO IMPORTANCE SCORES\n")
    lasso = Lasso(X_train_inner, y_train_inner)
    coefficients = lasso.named_steps['clf'].coef_
    importance = np.abs(coefficients).reshape(-1)
    # Names are taken from the data actually passed in, so the function no
    # longer depends on a notebook-level `feature_names` matching X_train.
    selected = np.array(list(X_train_inner.columns))[importance > 0]
    print(selected)
    if selected.size == 0:
        raise ValueError("Lasso selected no features; cannot build the MLP for fold {}".format(fold))
    X_train_sel = X_train_inner[selected]
    X_eval_sel = X_eval[selected]
    
    input_dim = len(X_train_sel.columns)
        
    # Keep only the weights of the epoch with the highest validation AUC
    checkpoint_filepath = os.path.join(model_path, 'best_model_weights_{}.hdf5'.format(fold))
    model_checkpoint_callback = tensorflow.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1)
    
    model = KerasClassifier(model=build_clf,input_dim=input_dim, layer1=256,
                            layer2=128, layer3=64, dropout=0.2, learning_rate=1e-4,
                            callbacks=[model_checkpoint_callback])
    
    model.fit(X_train_sel, y_train_inner,
              epochs=200,
              batch_size=64,
              shuffle=True,
              verbose=2,
              validation_data=(X_eval_sel, y_eval))
        
    # Rebuild the same architecture and load the best checkpoint into it
    best_model = build_clf(input_dim=input_dim, layer1=256,
                            layer2=128, layer3=64, dropout=0.2, learning_rate=1e-4)
    best_model.load_weights(checkpoint_filepath)
    
    return best_model, selected

In [None]:
from collections import defaultdict
def nestedCV_MLP(X_outer, y_outer, outer_subjects, n_bootstrap=500, coef_threshold=0.1, random_state=None):
    """Bootstrapped-Lasso feature selection per outer fold (MLP stage disabled).

    For every outer split of `outercv`, the features are standardized with a
    scaler fit on the training fold only, the training fold is resampled with
    replacement `n_bootstrap` times, and an L1 logistic regression (`Lasso`)
    is fit on each resample. A feature is selected when the absolute value of
    its mean (signed) coefficient over all resamples exceeds `coef_threshold`.

    The MLP training / inference stage is commented out below, so no train or
    test scores are produced at the moment.

    Parameters
    ----------
    X_outer : pandas.DataFrame
        Feature matrix (indexed positionally with `.iloc`).
    y_outer : pandas.Series
        Labels aligned with `X_outer`.
    outer_subjects : sequence
        Group id per row, passed to `outercv.split`.
    n_bootstrap : int
        Number of bootstrap resamples per outer fold.
    coef_threshold : float
        Minimum absolute mean coefficient for a feature to be selected.
    random_state : int or None
        Seed for the bootstrap resampling; None gives non-reproducible draws.

    Returns
    -------
    list of numpy arrays: the selected feature names, one array per outer fold.
    """
    allTrainScores = []
    allTestScores = []
    selectedList = []

    rng = np.random.default_rng(random_state)
    # Names come from the data passed in rather than a notebook-level list
    column_names = np.array(list(X_outer.columns))

    for fold, (train_idxs, test_idxs) in enumerate(outercv.split(X_outer, y_outer, outer_subjects)):
        print("\nREPEAT: ", fold // N_SPLITS_O + 1)
        print("FOLD: ", fold % N_SPLITS_O + 1)
        
        tensorflow.keras.backend.clear_session()
        
        # train test split; the scaler is fit on the training fold only so no
        # statistics of the held-out fold leak into training
        sc = StandardScaler().fit(X_outer.iloc[train_idxs])
        X_train = pd.DataFrame(sc.transform(X_outer.iloc[train_idxs]),
                               columns=X_outer.columns, index=X_outer.index[train_idxs])
        X_test = pd.DataFrame(sc.transform(X_outer.iloc[test_idxs]),
                              columns=X_outer.columns, index=X_outer.index[test_idxs])
        y_train = y_outer.iloc[train_idxs]
        y_test = y_outer.iloc[test_idxs]
        
        print("\nFEATURE SELECTION")
        coefSum = np.zeros(len(column_names))
        n_train = len(X_train)
        for _ in range(n_bootstrap):
            bootstrap = rng.choice(n_train, size=n_train, replace=True)
            X_sample = X_train.iloc[bootstrap]
            y_sample = y_train.iloc[bootstrap]
            lasso = Lasso(X_sample, y_sample)
            # signed coefficients are summed, so features whose sign flips
            # between resamples cancel out
            coefSum += lasso.named_steps['clf'].coef_.reshape(-1)
        bootstrap_coef = np.abs(coefSum / n_bootstrap)
        selected = column_names[bootstrap_coef > coef_threshold]
        print(selected)
        selectedList.append(selected)
        # inputs of the (currently disabled) model stage below
        X_train_sel = X_train[selected]
        X_test_sel = X_test[selected]
        
        # NOTE(review): the disabled stage below validates / checkpoints on the
        # outer test fold, so its test AUC would be optimistically biased.
#         print("\nMODEL BUILDING AND FITTING")
#         input_dim = len(X_train_sel.columns)
        
#         checkpoint_filepath = os.path.join(model_path, 'best_model_weights_{}.hdf5'.format(fold))
#         model_checkpoint_callback = tensorflow.keras.callbacks.ModelCheckpoint(
#             filepath=checkpoint_filepath,
#             save_weights_only=True,
#             monitor='val_auc',
#             mode='max',
#             save_best_only=True,
#             verbose=0)
    
#         model = KerasClassifier(model=build_clf,input_dim=input_dim, layer1=64,
#                             layer2=32, layer3=16, dropout=0.25, learning_rate=5e-4,
#                             callbacks=[model_checkpoint_callback])
        
#         history = model.fit(X_train_sel, y_train,
#                     epochs=200, #int(params['n_epochs']),
#                     batch_size=64,
#                     shuffle=True,
#                     verbose=0,
#                     validation_data=(X_test_sel, y_test))
        
#         # plot loss during training
#         plt.subplot(211)
#         plt.title('Loss')
#         plt.plot(history.history_['val_loss'], label='valid')
#         plt.plot(history.history_['loss'], label='train')
#         plt.legend()
#         # plot accuracy during training
#         plt.subplot(212)
#         plt.title('Accuracy')
#         plt.plot(history.history_['val_auc'], label='valid')
#         plt.plot(history.history_['auc'], label='train')
#         plt.legend()
#         plt.show()
        
#         best_model = build_clf(input_dim=input_dim, layer1=64,
#                                 layer2=32, layer3=16, dropout=0.25, learning_rate=5e-4)
#         best_model.load_weights(checkpoint_filepath)
        
        
#         print("\nINFERENCE")
#         # prediction
#         train_auc_score = roc_auc_score(y_train, best_model.predict(X_train_sel, batch_size=32))
#         print(train_auc_score)
#         allTrainScores.append(train_auc_score)
#         test_auc_score = roc_auc_score(y_test, best_model.predict(X_test_sel, batch_size=32))
#         print(test_auc_score)
#         allTestScores.append(test_auc_score)
#         print("\nTEST SCORE: ", test_auc_score)
        
    # The score lists stay empty while the model stage is disabled; averaging
    # an empty list would only print nan together with a RuntimeWarning.
    if allTrainScores and allTestScores:
        print("\nAVERAGE TRAIN SCORE ACROSS ALL FOLDS: ", np.mean(allTrainScores))
        print("\nSTD TRAIN SCORE ACROSS ALL FOLDS: ", np.std(allTrainScores))
        print("AVERAGE TEST SCORE ACROSS ALL FOLDS: ", np.mean(allTestScores))
        print("STD TEST SCORE ACROSS ALL FOLDS: ", np.std(allTestScores))
    else:
        print("\nNO TRAIN/TEST SCORES RECORDED (model stage disabled); returning selected features only")
    return selectedList

In [None]:
# Run the per-fold feature selection on the full data set, grouped by patient
selectedList = nestedCV_MLP(X_outer=X, y_outer=y, outer_subjects=allSubjects)