In [1]:
import inspect
import random

import numpy as np
import pandas as pd

# Getting Data

In [2]:
# Location of the raw MRI table, kept in one named constant so it is easy to repoint.
DATA_PATH = r'C:\Users\caron\OneDrive - University of North Carolina at Chapel Hill\Honors Project\CSV Files\MRI Data.csv'
df = pd.read_csv(DATA_PATH)

# Keep only healthy subjects by excluding every row whose SDx equals 3
# (assumes SDx == 3 marks the non-healthy group -- TODO confirm against the data dictionary).
# A boolean mask replaces the hard-coded range(0, 1911) loop, so the cell works for any
# file length; the index is renumbered so it stays contiguous after the rows are removed.
df = df[df['SDx'] != 3].reset_index(drop=True)

avg_age = df['Age'].mean()  # average age of the retained subjects

# Binarise Age into the two class labels used downstream:
# 4 = older than the average, 0 = otherwise (a missing Age compares False and gets 0).
# The previous per-row df.loc[i] loop over range(df.shape[0]) raised KeyError on labels
# that had been dropped and never reached labels >= the new row count.
df['Age'] = np.where(df['Age'] > avg_age, 4, 0)

## For Cross Val

In [3]:
# Label column (kept as a one-column frame) and the contiguous run of MRI feature columns.
age_part = df.loc[0:, ['Age']]
feature_part = df.loc[0:, 'L_LatVent':'R_insula_surfavg']

# Place them side by side and discard every row that has at least one missing value,
# so the labels and the features stay row-aligned.
temp = pd.concat([age_part, feature_part], axis=1, sort=False).dropna(how='any')

# Split the cleaned table back into labels (df1) and features (df2) for cross-validation.
df1 = temp.loc[0:, ['Age']]
df2 = temp.loc[0:, 'L_LatVent':'R_insula_surfavg']

## For Split

In [4]:
SPLIT_SEED = 0   # fixed seed: without it every run shuffles differently and the split cannot be reproduced
N_TRAIN = 523    # number of shuffled rows assigned to the training set; the remainder is the test set

# Same label + feature table as the cross-validation cell, with incomplete rows removed,
# then shuffled (frac=1 keeps every row) and renumbered 0..n-1.
temp2 = (
    pd.concat([df.loc[0:, ['Age']], df.loc[0:, 'L_LatVent':'R_insula_surfavg']], axis=1, sort=False)
    .dropna(how='any')
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

train = temp2[:N_TRAIN]
test = temp2[N_TRAIN:]

# Labels (df1_*) and features (df2_*) for each side of the split.
df1_train = train[['Age']]
df2_train = train.loc[:, 'L_LatVent':'R_insula_surfavg']
df1_test = test[['Age']]
df2_test = test.loc[:, 'L_LatVent':'R_insula_surfavg']

## To numpy array

In [5]:
# Feature frames become 2-D arrays as they are.
x, x_train, x_test = (np.array(frame) for frame in (df2, df2_train, df2_test))

# The one-column label frames are flattened to 1-D vectors.
y, y_train, y_test = (np.array(frame).reshape(-1) for frame in (df1, df1_train, df1_test))

# Data Preprocessing and Normalization Models

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer

In [7]:
# One instance of each preprocessing transformer, all with their default settings.
# These objects are reused as steps in the pipelines defined further down.
minMax, robust, standard = MinMaxScaler(), RobustScaler(), StandardScaler()
quantile, normal, power = QuantileTransformer(), Normalizer(), PowerTransformer()

# SVC Models

In [8]:
from sklearn.svm import SVC, LinearSVC

In [9]:
# Kernel SVC with probability estimates switched on and gamma set to 'scale'.
svc = SVC(
    gamma='scale',
    probability=True,
)

# Linear SVC with every setting left at its default.
lsvc = LinearSVC()

# Pipelines

In [10]:
from sklearn.pipeline import Pipeline

In [11]:
def _svc_pipeline(step_name, transformer):
    """Build a two-step pipeline: one named preprocessing step followed by the shared `svc`."""
    return Pipeline([(step_name, transformer), ('svc', svc)])


# NOTE: every pipeline below holds the same transformer/classifier objects (not copies).
pl1 = _svc_pipeline('minMax', minMax)
# Min-max scaling in front of LinearSVC instead of SVC.
pl1b = Pipeline([('minMax', minMax), ('lsvc', lsvc)])
# Same steps as pl1; per the original note this one is meant for the svc/minMax linear-kernel search.
pl1c = _svc_pipeline('minMax', minMax)
pl2 = _svc_pipeline('robust', robust)
pl3 = _svc_pipeline('standard', standard)
pl4 = _svc_pipeline('quantile', quantile)
pl5 = _svc_pipeline('normal', normal)
pl6 = _svc_pipeline('power', power)

# All six preprocessing steps chained in sequence ahead of the SVC.
plall = Pipeline([
    ('minMax', minMax),
    ('robust', robust),
    ('standard', standard),
    ('quantile', quantile),
    ('normal', normal),
    ('power', power),
    ('svc', svc),
])

# Grid Search Cross Validation

In [12]:
from sklearn.model_selection import GridSearchCV

## pl1 (minMax)

In [13]:
# Candidate C values for the SVC step of pl1 (min-max scaling + SVC).
parameters1 = {'svc__C': (0.8, 0.9, 1)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv1 = GridSearchCV(pl1, parameters1, cv=10, **iid_kwargs)
cv1.fit(x, y)

print('best score:', cv1.best_score_)
print('\nbest params:', cv1.best_params_)
print('\nbest index:', cv1.best_index_)

best score: 0.7153834115805946

best params: {'svc__C': 0.9}

best index: 1


## pl1b (minMax w/ lsvc)

* Transform features by scaling each feature to a given range.
* LinearSVC would be a better fit for large data sets (tens of thousands of samples)

In [14]:
# Candidate C values for the LinearSVC step of pl1b (min-max scaling + LinearSVC).
parameters1b = {'lsvc__C': (0.03, 0.04, 0.05)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv1b = GridSearchCV(pl1b, parameters1b, cv=10, **iid_kwargs)
cv1b.fit(x, y)

print('best score:', cv1b.best_score_)
print('\nbest params:', cv1b.best_params_)
print('\nbest index:', cv1b.best_index_)

best score: 0.7293298680974738

best params: {'lsvc__C': 0.04}

best index: 1


## pl1c (minMax w/ linear kernel only)

* Sigmoid and linear for kernel: will always use linear because sigmoid is worse
* For surface model

In [15]:
# Kernel and C candidates for the SVC step of pl1c (min-max scaling + SVC).
parameters1c = {'svc__kernel': ('sigmoid', 'linear'), 'svc__C': (.09, .1, .2)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv1c = GridSearchCV(pl1c, parameters1c, cv=10, **iid_kwargs)
cv1c.fit(x, y)

print('best score:', cv1c.best_score_)
print('\nbest params:', cv1c.best_params_)
print('\nbest index:', cv1c.best_index_)

best score: 0.7222071316789627

best params: {'svc__C': 0.1, 'svc__kernel': 'linear'}

best index: 3


## pl2 (robust)

* Scale features using statistics that are robust to outliers.
* use pl1 to narrow down to rbf for all other normalizers/processors
* made decision based on scores to use rbf for the rest

In [16]:
# Kernel and C candidates for the SVC step of pl2 (robust scaling + SVC).
parameters2 = {'svc__kernel': ('sigmoid', 'linear'), 'svc__C': (.01, .02)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv2 = GridSearchCV(pl2, parameters2, cv=10, **iid_kwargs)
cv2.fit(x, y)

print('best score:', cv2.best_score_)
print('\nbest params:', cv2.best_params_)
print('\nbest index:', cv2.best_index_)

best score: 0.7322864967583278

best params: {'svc__C': 0.01, 'svc__kernel': 'linear'}

best index: 1


## pl3 (standard)

* Standardize features by removing the mean and scaling to unit variance
* use pl1 to narrow down to rbf

In [17]:
# Candidate C values for the SVC step of pl3 (standard scaling + SVC).
parameters3 = {'svc__C': (.9, 1, 1.1)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv3 = GridSearchCV(pl3, parameters3, cv=10, **iid_kwargs)
cv3.fit(x, y)

print('best score:', cv3.best_score_)
print('\nbest params:', cv3.best_params_)
print('\nbest index:', cv3.best_index_)

best score: 0.7224463447350771

best params: {'svc__C': 1}

best index: 1


## pl4 (quantile)

* Transform features using quantiles information.
* use pl1 to narrow down to rbf

In [18]:
# Kernel and C candidates for the SVC step of pl4 (quantile transform + SVC).
parameters4 = {'svc__kernel': ('sigmoid', 'linear'), 'svc__C': (.08, .09, .1)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv4 = GridSearchCV(pl4, parameters4, cv=10, **iid_kwargs)
cv4.fit(x, y)

print('best score:', cv4.best_score_)
print('\nbest params:', cv4.best_params_)
print('\nbest index:', cv4.best_index_)

  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantile

  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))


best score: 0.7337161860049184

best params: {'svc__C': 0.09, 'svc__kernel': 'linear'}

best index: 3


## pl5 (normal)

* Normalize samples individually to unit norm.
* use pl1 to narrow down to rbf

In [19]:
# Candidate C values for the SVC step of pl5 (per-sample normalisation + SVC).
parameters5 = {'svc__C': (4, 5, 6)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv5 = GridSearchCV(pl5, parameters5, cv=10, **iid_kwargs)
cv5.fit(x, y)

print('best score:', cv5.best_score_)
print('\nbest params:', cv5.best_params_)
print('\nbest index:', cv5.best_index_)

best score: 0.6827246814218645

best params: {'svc__C': 5}

best index: 1


## pl6 (power)

* Apply a power transform featurewise to make data more Gaussian-like.
* use pl1 to narrow down to rbf

In [20]:
# Candidate C values for the SVC step of pl6 (power transform + SVC).
parameters6 = {'svc__C': (.9, 1, 1.1)}

# GridSearchCV's `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises TypeError. Forward iid=False only when the installed version
# still accepts it; newer versions always average the per-fold scores, which is the
# behaviour iid=False requested.
iid_kwargs = {'iid': False} if 'iid' in inspect.signature(GridSearchCV).parameters else {}

# Exhaustive search over the grid with 10-fold cross-validation on the full (x, y) data.
cv6 = GridSearchCV(pl6, parameters6, cv=10, **iid_kwargs)
cv6.fit(x, y)

print('best score:', cv6.best_score_)
print('\nbest params:', cv6.best_params_)
print('\nbest index:', cv6.best_index_)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())


best score: 0.7182986809747373

best params: {'svc__C': 1}

best index: 1


# Test Parameter Notes

minMax: best score: 0.7153834115805946 {'svc__C': 0.9, gamma='scale'}

lin minMax: best score: 0.7293298680974738 {'lsvc__C': 0.04}

minMax SVC w/ linear kernel: best score: 0.7222071316789627 {'svc__C': 0.1, 'svc__kernel': 'linear'}

robust: best score: 0.7322864967583278 {'svc__C': 0.01, 'svc__kernel': 'linear'}

standard: best score: 0.7224463447350771 {'svc__C': 1, gamma='scale'}

quantile: best score: 0.7337161860049184 {'svc__C': 0.09, 'svc__kernel': 'linear'}

normal: best score: 0.6827246814218645 {'svc__C': 5, gamma='scale'}

power: best score: 0.7182986809747373 {'svc__C': 1, gamma='scale'}
