In [1]:
import numpy as np
import pandas as pd
import random

# Getting Data (only healthy)

## For Split

In [2]:
# Load the MRI table and remove every row whose 'SDx' code equals 3
# (NOTE(review): presumably 3 marks the non-healthy group -- confirm against the data dictionary).
df = pd.read_csv(r'C:\Users\caron\OneDrive - University of North Carolina at Chapel Hill\Honors Project\CSV Files\MRI Data.csv')
# A boolean mask replaces the former loop over a hard-coded range(0, 1911):
# it works for any number of rows and does not re-copy the frame on each drop.
df = df[df['SDx'] != 3]
# Keep the label column plus the contiguous block of feature columns,
# then discard any row that has a missing value in one of them.
temp = pd.concat([df.loc[:, ['Sex']], df.loc[:, 'L_LatVent':'R_insula_surfavg']], axis=1, sort=False)
temp = temp.dropna(how='any')
df1 = temp.loc[:, ['Sex']]                          # labels (single-column frame)
df2 = temp.loc[:, 'L_LatVent':'R_insula_surfavg']   # features

## For Cross Val

In [3]:
# Build a shuffled copy of the complete-case label + feature table and cut it
# into a train part (first 523 rows) and a test part (the remainder).
# random_state pins the shuffle so the partition is the same on every run;
# without it each kernel restart produced a different train/test split.
temp2 = (
    pd.concat([df.loc[:, ['Sex']], df.loc[:, 'L_LatVent':'R_insula_surfavg']], axis=1, sort=False)
    .dropna(how='any')
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
train = temp2[:523]
test = temp2[523:]
df1_train = train.loc[:, ['Sex']]
df2_train = train.loc[:, 'L_LatVent':'R_insula_surfavg']
df1_test = test.loc[:, ['Sex']]
df2_test = test.loc[:, 'L_LatVent':'R_insula_surfavg']

## To numpy array

In [4]:
# Convert the frames to numpy arrays: feature frames stay 2-D,
# the single-column label frames are flattened to 1-D vectors.

# Full data set.
x, y = np.array(df2), np.array(df1).ravel()

# Train / test partitions.
x_train, y_train = np.array(df2_train), np.array(df1_train).ravel()
x_test, y_test = np.array(df2_test), np.array(df1_test).ravel()

# Data Preprocessing and Normalization Models

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer

In [6]:
# One default-configured instance of every preprocessing step under comparison.
minMax, robust, standard = MinMaxScaler(), RobustScaler(), StandardScaler()
quantile, normal, power = QuantileTransformer(), Normalizer(), PowerTransformer()

# SVC Models

In [7]:
from sklearn.svm import SVC, LinearSVC

In [8]:
# Classifier instances shared by the pipelines:
# a linear-only SVM with default settings, and a kernel SVM that is also
# asked to fit probability estimates (probability=True).
lsvc = LinearSVC()
svc = SVC(probability=True)

# Pipelines

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
def _scaled_svc(step_name, transformer):
    """Return a two-step pipeline: the given transformer followed by the shared `svc`."""
    return Pipeline([(step_name, transformer), ('svc', svc)])


pl1 = _scaled_svc('minMax', minMax)
# pl1b: min-max scaling followed by LinearSVC instead of SVC
pl1b = Pipeline([('minMax', minMax), ('lsvc', lsvc)])
# pl1c: same steps as pl1 (svc/minMax w/ linear kernel -- the kernel itself is not set here)
pl1c = _scaled_svc('minMax', minMax)
pl2 = _scaled_svc('robust', robust)
pl3 = _scaled_svc('standard', standard)
pl4 = _scaled_svc('quantile', quantile)
pl5 = _scaled_svc('normal', normal)
pl6 = _scaled_svc('power', power)
# plall: every preprocessing step chained in sequence ahead of the classifier
plall = Pipeline([
    ('minMax', minMax),
    ('robust', robust),
    ('standard', standard),
    ('quantile', quantile),
    ('normal', normal),
    ('power', power),
    ('svc', svc),
])

# Grid Search Cross Validation

In [11]:
from sklearn.model_selection import GridSearchCV

## pl1 (minMax)

In [None]:
# Grid-search C and gamma for the min-max + SVC pipeline using 10-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters1 = {'svc__C': (.9, 1, 1.1, 2, 3, 4, 5), 'svc__gamma': ('auto', 'scale')}
cv1 = GridSearchCV(pl1, parameters1, cv=10)
cv1.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv1.best_score_)
print('\nbest params:', cv1.best_params_)
print('\nbest index:', cv1.best_index_)
#print('\nbest estimator:')
#print(cv1.best_estimator_)
#print(pd.DataFrame(data=cv1.cv_results_))

## pl1b (minMax w/ lsvc)

* Transform features by scaling each feature to a given range.
* Apparently, LinearSVC is better suited to large data sets (tens of thousands of samples)

In [13]:
# Grid-search C for the min-max + LinearSVC pipeline using 10-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters1b = {'lsvc__C': (.07, .08, .09, .1, 1.1, 1.2, 1.3)}
cv1b = GridSearchCV(pl1b, parameters1b, cv=10)
cv1b.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv1b.best_score_)
print('\nbest params:', cv1b.best_params_)
print('\nbest index:', cv1b.best_index_)
#print('\nbest estimator:')
#print(cv1b.best_estimator_)
#print(pd.DataFrame(data=cv1b.cv_results_))



best score: 0.7394047619047619

best params: {'lsvc__C': 0.1}

best index: 3




## pl1c (minMax w/ linear kernel only)

* Sigmoid and linear for kernel: will always use linear because sigmoid is worse
* For surface model

In [None]:
# Grid-search kernel (sigmoid vs. linear) and C for the min-max + SVC pipeline
# using 5-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters1c = {'svc__kernel': ('sigmoid', 'linear'), 'svc__C': (1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8)}
cv1c = GridSearchCV(pl1c, parameters1c, cv=5)
cv1c.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv1c.best_score_)
print('\nbest params:', cv1c.best_params_)
print('\nbest index:', cv1c.best_index_)
#print('\nbest estimator:')
#print(cv1c.best_estimator_)
#print(pd.DataFrame(data=cv1c.cv_results_))

## pl2 (robust)

* Scale features using statistics that are robust to outliers.
* use pl1 to narrow down to rbf for all other normalizers/processors
* made decision based on scores to use rbf for the rest

In [None]:
# Grid-search kernel, C and gamma for the robust-scaler + SVC pipeline
# using 5-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters2 = {'svc__kernel': ('rbf', 'linear'), 'svc__C': (.1, 1, 10, 100), 'svc__gamma': ('auto', 'scale')}
cv2 = GridSearchCV(pl2, parameters2, cv=5)
cv2.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv2.best_score_)
print('\nbest params:', cv2.best_params_)
print('\nbest index:', cv2.best_index_)
#print('\nbest estimator:')
#print(cv2.best_estimator_)
#print(pd.DataFrame(data=cv2.cv_results_))

## pl3 (standard)

* Standardize features by removing the mean and scaling to unit variance.
* use pl1 to narrow down to rbf

In [None]:
# Grid-search kernel, C and gamma for the standard-scaler + SVC pipeline
# using 5-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters3 = {'svc__kernel': ('rbf', 'linear'), 'svc__C': (.1, 1, 10, 100), 'svc__gamma': ('auto', 'scale')}
cv3 = GridSearchCV(pl3, parameters3, cv=5)
cv3.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv3.best_score_)
print('\nbest params:', cv3.best_params_)
print('\nbest index:', cv3.best_index_)
#print('\nbest estimator:')
#print(cv3.best_estimator_)
#print(pd.DataFrame(data=cv3.cv_results_))

## pl4 (quantile)

* Transform features using quantiles information.
* use pl1 to narrow down to rbf

In [None]:
# Grid-search kernel, C and gamma for the quantile-transformer + SVC pipeline
# using 5-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters4 = {'svc__kernel': ('rbf', 'linear'), 'svc__C': (.1, 1, 10, 100), 'svc__gamma': ('auto', 'scale')}
cv4 = GridSearchCV(pl4, parameters4, cv=5)
cv4.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv4.best_score_)
print('\nbest params:', cv4.best_params_)
print('\nbest index:', cv4.best_index_)
#print('\nbest estimator:')
#print(cv4.best_estimator_)
#print(pd.DataFrame(data=cv4.cv_results_))

## pl5 (normal)

* Normalize samples individually to unit norm.
* use pl1 to narrow down to rbf

In [None]:
# Grid-search kernel, C and gamma for the normalizer + SVC pipeline
# using 5-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters5 = {'svc__kernel': ('rbf', 'linear'), 'svc__C': (.1, 1, 10, 100), 'svc__gamma': ('auto', 'scale')}
cv5 = GridSearchCV(pl5, parameters5, cv=5)
cv5.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv5.best_score_)
print('\nbest params:', cv5.best_params_)
print('\nbest index:', cv5.best_index_)
#print('\nbest estimator:')
#print(cv5.best_estimator_)
#print(pd.DataFrame(data=cv5.cv_results_))

## pl6 (power)

* Apply a power transform featurewise to make data more Gaussian-like.
* use pl1 to narrow down to rbf

In [None]:
# Grid-search kernel, C and gamma for the power-transformer + SVC pipeline
# using 5-fold cross-validation.
# The former `iid=False` argument is dropped: `iid` was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Fold scores are averaged
# unweighted in those versions, which is what iid=False requested.
parameters6 = {'svc__kernel': ('rbf', 'linear'), 'svc__C': (.1, 1, 10, 100), 'svc__gamma': ('auto', 'scale')}
cv6 = GridSearchCV(pl6, parameters6, cv=5)
cv6.fit(x, y)

# Report the winning configuration (same text layout as before).
print('best score:', cv6.best_score_)
print('\nbest params:', cv6.best_params_)
print('\nbest index:', cv6.best_index_)
#print('\nbest estimator:')
#print(cv6.best_estimator_)
#print(pd.DataFrame(data=cv6.cv_results_))

# Test Parameter Notes

minMax: best score: 0.7717460317460318 {'svc__C': 1, 'svc__gamma': 'scale'}

lin minMax: best score: 0.7394047619047619 {'lsvc__C': 0.1}

minMax SVC w/ linear kernel:

robust:

standard:

quantile:

normal:

power: