# Sex Classification Using HE

## Data Preparation

In [None]:
#load in modules and libraries
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [None]:
#load the subject numbers and the held-out test subject list (neither file has a header row)
#both are kept as plain numpy arrays
num = pd.read_csv('subj_num.txt', header=None).values
test = pd.read_csv('test_holdout.csv', header=None).values

In [None]:
#load in HE and sex data as numpy arrays
#data: HE for all subjects x ROIs based on the atlas being used (first row of the file is the header)
data = pd.read_csv('he_fs86.csv', header=0).values
#sex: binary variable - males are '1', females are '0' (file has no header row)
sex = pd.read_csv('subj_sex.csv', header=None).values

In [None]:
#split data into training and testing sets using pre-determined indices (generated in HCP_HE_sexclf_traintestsplit notebook)
#the holdout list is shared so that all atlases have the exact same train-test split

#boolean masks over subjects: test is True where the subject number appears in the holdout list,
#train is its element-wise negation
test_idx = np.isin(num, test)
train_idx = np.logical_not(test_idx)

#flatten the masks to 1-D so that they select whole rows
train_rows = train_idx.ravel()
test_rows = test_idx.ravel()

#partition data and sex labels into training and testing subsets
x_train, x_test = data[train_rows], data[test_rows]
y_train, y_test = sex[train_rows], sex[test_rows]

## Hyperparameter Tuning Using Nested CV

In [None]:
#hyperparameter grid searched in the inner CV loop
#(keys carry the 'clf__' prefix so they are routed to the pipeline step named 'clf')
#wider alternative grid:
#param_grid = {'clf__kernel': ['linear','rbf'], 'clf__C': [1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5], 'clf__gamma': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}
param_grid = {
    'clf__kernel': ['linear'],
    'clf__C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1],
}

#metric(s) evaluated during CV
#alternative: scores = ['accuracy', 'roc_auc', 'precision','recall','balanced_accuracy']
scores = ['roc_auc']

#number of times the whole nested CV procedure is repeated
iterations = 100

#scaling is a pipeline step so that the data transformation takes place within each CV fold
cv_steps = [
    ('scaler', StandardScaler()),
    ('clf', SVC(max_iter=100000)),
]
cv_pipeline = Pipeline(steps=cv_steps)

#per-iteration results: mean nested CV score and the best hyperparameters found
nested_scores = np.zeros(iterations)
best_params = [''] * iterations

#run the nested CV procedure `iterations` times, reseeding the fold shuffling on every pass
for i in range(iterations):

    print("Nested CV - Loop %d" % (i+1))

    #inner folds drive the hyperparameter grid search, outer folds score the tuned model
    #both splitters are seeded with the iteration number so that every pass is reproducible
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    #NOTE: results are stored per iteration only, so if `scores` holds more than one metric
    #the last metric overwrites the values saved for the earlier ones
    for scoring in scores:

        #define classifier with pipeline and grid-search CV for inner loop
        #`iid` is not passed: it was deprecated in scikit-learn 0.22 (where its default became False)
        #and removed in 0.24, where passing it raises a TypeError
        clf = GridSearchCV(cv_pipeline, param_grid=param_grid, cv=inner_cv,
                           scoring=scoring, n_jobs=-1, refit=True, verbose=0)

        #grid search on the full training set; labels are flattened to 1-D
        clf.fit(x_train, y_train.ravel())

        #save parameters corresponding to the best score
        best_params[i] = clf.best_params_

        #outer loop: cross-validate the whole grid search and keep the mean score across outer folds
        nested_score = cross_val_score(clf, X=x_train, y=y_train.ravel(), cv=outer_cv,
                                       scoring=scoring, verbose=0)
        nested_scores[i] = nested_score.mean()
        

In [None]:
#display the best hyperparameters found by the grid search in each nested CV iteration
best_params

In [None]:
#display the mean outer-loop CV score of each nested CV iteration
nested_scores

In [None]:
#highest mean nested CV score over all iterations
best_score = nested_scores.max()
#positions of the iteration(s) that reached that score
idx = np.nonzero(nested_scores == best_score)
#show the hyperparameters saved for the first such iteration
best_params[idx[0][0]]

## Fit Model with Optimized Hyperparameters

In [None]:
#standardize the features: the scaler is fit on the training data only,
#then the same fitted transform is applied to both the training and the testing data
scaler = StandardScaler().fit(x_train)
x_train_t, x_test_t = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
#generate SVC model based on optimized parameters
#can use mean hyperparameters of n best models where n is iterations in hyperparameter search
#or can use hyperparameters of the best model out of the n best models
#NOTE: C and kernel are hard-coded here - update them if the nested CV selects different values
model = SVC(C=0.1, kernel='linear')
model.fit(x_train_t, y_train.ravel())
#score the fitted model on the scaled held-out test set
#labels are flattened to 1-D, as in every other fit/score call in this notebook
print("Score = %1.4f" % model.score(x_test_t, y_test.ravel()))

## Generate Confusion Matrix

In [None]:
#plot confusion matrix for the fitted model on the held-out test set and save it as an SVG
from yellowbrick.classifier import ConfusionMatrix
from sklearn import preprocessing
from matplotlib import pyplot as plt

#map the numeric sex labels to the strings 'F' and 'M' through a LabelEncoder fitted on ["F", "M"]
#NOTE(review): presumably 0 -> 'F' and 1 -> 'M' (encoder classes in sorted order) - confirm against the sex coding
le = preprocessing.LabelEncoder()
le.fit(["F", "M"])
y_train_label=le.inverse_transform(y_train.ravel())
y_test_label=le.inverse_transform(y_test.ravel())

fig, ax = plt.subplots(figsize=(4,3))
fig.suptitle('FS86 - SVM Linear Kernel with C=0.1', fontsize=12)
#'xlabel'/'ylabel' are placeholder texts; these calls fix the axis-label font size
#NOTE(review): presumably the visualizer replaces the label text when it draws - confirm
ax.set_xlabel('xlabel', fontsize=12)
ax.set_ylabel('ylabel', fontsize=12)

#NOTE(review): classes is given as ['M','F'] - confirm this order produces the intended
#row/column labelling with the installed yellowbrick version
cm = ConfusionMatrix(model, classes=['M','F'], cmap='Blues', fontsize=12, ax=ax, title=' ')


#fit the visualizer on the scaled training data, using the string labels
#NOTE(review): this may refit `model` on the string labels, depending on the yellowbrick version - confirm
cm.fit(x_train_t, y_train_label)

#creates confusion matrix from the scaled test data and its string labels
cm.score(x_test_t, y_test_label)

#plot confusion matrix
cm.poof()

#save confusion matrix image
fig.savefig('cm_fs86.svg')

## Generate ROC Curve

In [None]:
#plot ROC curve for classifier on the held-out test set and save it as an SVG
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

#set classes
#NOTE(review): this variable is never used below - the visualizer is given classes=['M','F'] instead
classes=['female', 'male']

fig, ax = plt.subplots(figsize=(4,3))
ax.set_frame_on(False)
fig.suptitle('FS86 - SVM Linear Kernel with C=0.1', fontsize=12)
#'xlabel'/'ylabel' are placeholder texts; these calls fix the axis-label font size
#NOTE(review): presumably the visualizer replaces the label text when it draws - confirm
ax.set_xlabel('xlabel', fontsize=12)
ax.tick_params(axis='both', which='major', labelsize=10)
ax.set_ylabel('ylabel', fontsize=12)

#ax.grid(False)

#macro, micro and per-class curves are all switched off
#NOTE(review): fit/score below receive the numeric labels - confirm that ['M','F'] is the intended class order
visualizer = ROCAUC(model, classes=['M','F'], macro=False, micro=False, per_class=False, 
                    fontsize=16, ax=ax, title=' ')

#Fit the training data to the visualizer
visualizer.fit(x_train_t, y_train.ravel())

#Evaluate the model on the test data
visualizer.score(x_test_t, y_test.ravel()) 

#Draw/show/poof the data
visualizer.poof()

#save ROC curve image
fig.savefig('roc_fs86.svg')

## Plot Feature Importance

In [None]:
#import labels file pertaining to atlas being used for training/testing
#the first row of the file is used as the header, and the result is kept as a DataFrame
labels = pd.read_csv('fs86_labels.csv', header=0)

#define feature importances graph
def f_importances(coef, names, top=-1):
    """Plot the largest feature importances as a horizontal bar chart.

    Parameters
    ----------
    coef : sequence of numbers
        Importance value of each feature.
    names : iterable
        Feature labels, paired with ``coef`` by position.
    top : int, optional
        Number of highest-ranked features to draw. ``-1`` (the default), any
        other negative value, or a value larger than the number of available
        (importance, name) pairs draws all of them.
    """
    # Rank (importance, name) pairs from the largest to the smallest importance.
    ranked = sorted(zip(coef, names), reverse=True)

    # Clamp the request so the bars and the tick labels always have the same
    # length; an empty input simply yields an empty chart.
    if top < 0 or top > len(ranked):
        top = len(ranked)
    shown = ranked[:top]
    values = [value for value, _ in shown]
    tick_labels = [name for _, name in shown]

    plt.figure(figsize=(5,3))
    plt.barh(range(top), values, align='center')
    plt.yticks(range(top), tick_labels)
    plt.show()


# Importance of each feature = squared coefficient of the fitted linear model.
# np.abs, or the raw coefficients, can be used instead to keep positive and negative values apart.
# NOTE(review): `labels` is a DataFrame, and iterating a DataFrame yields its column labels,
# not its rows - confirm that fs86_labels.csv keeps the ROI names in its header row.
importance = np.square(model.coef_[0])
# Draw the 10 features with the largest importance.
f_importances(importance, labels, top=10)

#export the fitted model's coefficients to a csv file, one csv row per row of model.coef_
import csv
with open('featimp_tt.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(model.coef_)