In [1]:
import pandas as pd
import numpy as np
import time
import csv
import os

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
#s3 = boto3.resource('s3')

In [2]:
#Load the three per-subject tables and keep only their underlying numpy arrays.
#The file names are specific to the atlas being analysed.

#HE values for all subjects x ROIs for the atlas in use
data = pd.read_csv('he_tt_vm.csv', header=0).values

#binary sex label per subject - males are '1', females are '0'
sex = pd.read_csv('subj_sex_vm.csv', header=0).values

#subject number indices, used later to match subjects against the holdout lists
num = pd.read_csv('subj_num_vm.csv', header=0).values

In [3]:
#boolean masks with the same shape as `sex`: True where the label is 1 (male) / 0 (female)
male = np.equal(sex, 1)
female = np.equal(sex, 0)

#(disabled) subject numbers split by sex using the masks above
#idx_male = num[male.ravel(),:]
#idx_female = num[female.ravel(),:]

In [4]:
#Pre-determined test holdout sets, one row per permutation (row `idx` is used by permutation `idx`).
#Loaded from file so the train/test split stays consistent across all atlases.
test_indices = pd.read_csv('test_holdout_vm_10perm.txt', header=None).values

In [5]:
#Sex classification from HE features with a linear SVM.
#For each pre-determined train/test split ("permutation"):
#  1. repeat a nested 5-fold CV `iterations` times on the training set to tune C,
#  2. average the selected C values over all repeats,
#  3. refit a linear SVC with that averaged C on the scaled training set,
#  4. score it on the held-out test set and store accuracy, AUC, predicted
#     probabilities and the model coefficients.

#set the number of random permutations you want to do
permutations = 10

#set number of iterations through nested CV loop (repeated inside every permutation)
iterations = 100

#hyperparameters tuned in the inner CV loop: linear kernel, C from 0.01 to 1.00 in steps of 0.01
param_grid = {'clf__kernel': ['linear'], 'clf__C': [x*0.01 + 0.01 for x in range(100)]}

#number of held-out test subjects, measured on the first pre-determined split
#instead of being hard-coded (it was a literal 85)
n_test = int(np.isin(num, test_indices[0,:]).sum())

#arrays that collect the results of every permutation
optimised_c = np.zeros(permutations)
accuracy = np.zeros(permutations)
auc = np.zeros(permutations)
predictions = np.zeros([permutations, n_test])
feat_imp = np.zeros([permutations, data.shape[1]])


def save_results():
    """Write every result array to its text file, overwriting the previous snapshot."""
    np.savetxt('tt_optimisedc_vm.txt', optimised_c, delimiter=',')
    np.savetxt('tt_acc_vm.txt', accuracy, delimiter=',')
    np.savetxt('tt_ypred_vm.txt', predictions, delimiter=',')
    np.savetxt('tt_auc_vm.txt', auc, delimiter=',')
    np.savetxt('tt_featimp_vm.txt', feat_imp, delimiter=',')


for idx in range(permutations):

    print("Permutation %d" % (idx + 1))
    print(time.localtime(time.time()))

    #split data into training and testing sets using pre-determined indices
    #(generated in HCP_HE_sexclf_traintestsplit notebook) so that all atlases
    #have the exact same train-test split
    test = test_indices[idx,:]

    #boolean masks over subjects: in / not in this permutation's holdout list
    train_idx = np.isin(num, test, invert=True)
    test_idx = np.isin(num, test)

    #partition data and sex labels into training and testing subsets
    x_train = data[train_idx.ravel(),:]
    x_test = data[test_idx.ravel(),:]
    y_train = sex[train_idx.ravel(),:]
    y_test = sex[test_idx.ravel(),:]

    #set up pipeline for analysis so that data transformation takes place within each CV fold
    cv_steps = [('scaler', StandardScaler()), ('clf', SVC(max_iter=100000))]
    cv_pipeline = Pipeline(cv_steps)

    #per-repeat results of the nested CV
    #NOTE(review): nested_scores is filled in below but never written to disk
    nested_scores = np.zeros(iterations)
    best_params = ['']*iterations
    best_c = np.zeros(iterations)

    #set up nested CV pipeline
    for rep in range(iterations):

        print("Nested CV - Loop %d" % (rep + 1))

        #set parameters for inner and outer loops for CV
        #(shuffled without a fixed random_state, so every repeat uses new folds)
        inner_cv = StratifiedKFold(n_splits = 5, shuffle = True)
        outer_cv = StratifiedKFold(n_splits = 5, shuffle = True)

        #define classifier with pipeline and grid-search CV for inner loop
        #NOTE(review): `iid` was removed from GridSearchCV in newer scikit-learn
        #releases; drop the argument there (iid=False is the behaviour that was kept)
        clf = GridSearchCV(cv_pipeline, param_grid = param_grid, cv = inner_cv,
                           scoring='roc_auc', n_jobs = -1, iid = False, refit = True, verbose=0)

        #fit classifier
        clf.fit(x_train, y_train.ravel())

        #save parameters corresponding to the best score; C is read straight from
        #the dict (previously it was round-tripped through temp.txt and eval())
        best_params[rep] = clf.best_params_
        best_c[rep] = clf.best_params_['clf__C']

        #call cross_val_score for outer loop
        nested_score = cross_val_score(clf, X = x_train, y = y_train.ravel(), cv = outer_cv,
                                       scoring = 'roc_auc', verbose = 0)
        nested_scores[rep] = nested_score.mean()

    #mean of the best C parameter over all nested CV repeats
    optimised_c[idx] = np.mean(best_c)

    #normalize all data using statistics from the training set only
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train_t = scaler.transform(x_train)
    x_test_t = scaler.transform(x_test)

    #generate SVC model based on optimised parameters
    #uses optimised_c hyperparameter obtained by averaging hyperparameters over grid search iterations
    model = SVC(C = optimised_c[idx], kernel='linear', probability=True)
    model.fit(x_train_t, y_train.ravel())

    #generate prediction probabilities for test samples; keep the second probability column
    pred = model.predict_proba(x_test_t)
    predictions[idx,:] = pred[:,1]

    #determine model accuracy on the held-out test set
    accuracy[idx] = model.score(x_test_t, y_test.ravel())

    #determine model AUC on the held-out test set
    auc[idx] = metrics.roc_auc_score(y_test.ravel(), pred[:,1])

    #save feature importances (coefficients of the linear SVC)
    feat_imp[idx] = model.coef_[0]

    #save at the end of each permutation so if something does go wrong, it's all saved
    #just gets overwritten after each permutation
    save_results()

#save once more after all permutations have completed
save_results()

print(time.localtime(time.time()))

Permutation 1
time.struct_time(tm_year=2019, tm_mon=8, tm_mday=23, tm_hour=14, tm_min=39, tm_sec=15, tm_wday=4, tm_yday=235, tm_isdst=1)
Nested CV - Loop 1
Nested CV - Loop 2
Nested CV - Loop 3
Nested CV - Loop 4
Nested CV - Loop 5
Nested CV - Loop 6
Nested CV - Loop 7
Nested CV - Loop 8
Nested CV - Loop 9
Nested CV - Loop 10
Nested CV - Loop 11
Nested CV - Loop 12
Nested CV - Loop 13
Nested CV - Loop 14
Nested CV - Loop 15
Nested CV - Loop 16
Nested CV - Loop 17
Nested CV - Loop 18
Nested CV - Loop 19
Nested CV - Loop 20
Nested CV - Loop 21
Nested CV - Loop 22
Nested CV - Loop 23
Nested CV - Loop 24
Nested CV - Loop 25
Nested CV - Loop 26
Nested CV - Loop 27
Nested CV - Loop 28
Nested CV - Loop 29
Nested CV - Loop 30
Nested CV - Loop 31
Nested CV - Loop 32
Nested CV - Loop 33
Nested CV - Loop 34
Nested CV - Loop 35
Nested CV - Loop 36
Nested CV - Loop 37
Nested CV - Loop 38
Nested CV - Loop 39
Nested CV - Loop 40
Nested CV - Loop 41
Nested CV - Loop 42
Nested CV - Loop 43
Nested CV - 

Nested CV - Loop 85
Nested CV - Loop 86
Nested CV - Loop 87
Nested CV - Loop 88
Nested CV - Loop 89
Nested CV - Loop 90
Nested CV - Loop 91
Nested CV - Loop 92
Nested CV - Loop 93
Nested CV - Loop 94
Nested CV - Loop 95
Nested CV - Loop 96
Nested CV - Loop 97
Nested CV - Loop 98
Nested CV - Loop 99
Nested CV - Loop 100
Permutation 5
time.struct_time(tm_year=2019, tm_mon=8, tm_mday=23, tm_hour=17, tm_min=34, tm_sec=16, tm_wday=4, tm_yday=235, tm_isdst=1)
Nested CV - Loop 1
Nested CV - Loop 2
Nested CV - Loop 3
Nested CV - Loop 4
Nested CV - Loop 5
Nested CV - Loop 6
Nested CV - Loop 7
Nested CV - Loop 8
Nested CV - Loop 9
Nested CV - Loop 10
Nested CV - Loop 11
Nested CV - Loop 12
Nested CV - Loop 13
Nested CV - Loop 14
Nested CV - Loop 15
Nested CV - Loop 16
Nested CV - Loop 17
Nested CV - Loop 18
Nested CV - Loop 19
Nested CV - Loop 20
Nested CV - Loop 21
Nested CV - Loop 22
Nested CV - Loop 23
Nested CV - Loop 24
Nested CV - Loop 25
Nested CV - Loop 26
Nested CV - Loop 27
Nested CV -

Nested CV - Loop 69
Nested CV - Loop 70
Nested CV - Loop 71
Nested CV - Loop 72
Nested CV - Loop 73
Nested CV - Loop 74
Nested CV - Loop 75
Nested CV - Loop 76
Nested CV - Loop 77
Nested CV - Loop 78
Nested CV - Loop 79
Nested CV - Loop 80
Nested CV - Loop 81
Nested CV - Loop 82
Nested CV - Loop 83
Nested CV - Loop 84
Nested CV - Loop 85
Nested CV - Loop 86
Nested CV - Loop 87
Nested CV - Loop 88
Nested CV - Loop 89
Nested CV - Loop 90
Nested CV - Loop 91
Nested CV - Loop 92
Nested CV - Loop 93
Nested CV - Loop 94
Nested CV - Loop 95
Nested CV - Loop 96
Nested CV - Loop 97
Nested CV - Loop 98
Nested CV - Loop 99
Nested CV - Loop 100
Permutation 9
time.struct_time(tm_year=2019, tm_mon=8, tm_mday=23, tm_hour=20, tm_min=21, tm_sec=29, tm_wday=4, tm_yday=235, tm_isdst=1)
Nested CV - Loop 1
Nested CV - Loop 2
Nested CV - Loop 3
Nested CV - Loop 4
Nested CV - Loop 5
Nested CV - Loop 6
Nested CV - Loop 7
Nested CV - Loop 8
Nested CV - Loop 9
Nested CV - Loop 10
Nested CV - Loop 11
Nested CV -

In [41]:
#Read saved per-permutation accuracy and AUC results back in as DataFrames.
#NOTE(review): the training cell above writes 'tt_*_vm.txt' files, while these are
#'aal_*_vm.txt' - presumably results from a different atlas run; confirm this is intended.
#NOTE(review): this rebinds `auc`, which the training cell also uses for its result array.
acc, auc = (
    pd.read_csv(results_file, header=None)
    for results_file in ('aal_acc_vm.txt', 'aal_auc_vm.txt')
)

In [42]:
#Print mean and standard deviation of accuracy and AUC across permutations.
#Each table is flattened to a plain float array first, so `mean`/`std` are true
#scalars; formatting a pandas Series with "%f" relies on implicit float(Series).
#nanmean/nanstd (ddof=0) keep the NaN-skipping, population-std behaviour that
#np.mean/np.std had when dispatched to the DataFrame.
for label, scores in (("Accuracy", acc), ("AUC", auc)):
    values = np.asarray(scores, dtype=float).ravel()
    mean = np.nanmean(values)
    print("Mean %s = %1.4f" % (label, mean))
    std = np.nanstd(values)
    print("Std %s = %1.3f" % (label, std))

Mean Accuracy = 0.7059
Std Accuracy = 0.032
Mean AUC = 0.7709
Std AUC = 0.041
