### Multi-Class Classification 
* See [Alzheimer's Prediction](https://github.com/grantgasser/Alzheimers-Prediction) for more detail

In [1]:
import numpy as np
import sklearn as sk
import pandas as pd
import os

In [154]:
#Read data
dat = pd.read_csv('AD_Challenge_Training_Data_Clinical_Updated_7.22.2014/ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv')

#Check NA values, 1 NA for imputed_genotype
#dat.isnull().sum()

In [155]:
#Peek at data
dat.head(3)

Unnamed: 0,directory.id,Subject,RID,Image.Data.ID,Modality,Visit,Acq.Date,DX.bl,EXAMDATE,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype,Dx Codes for Submission
0,4702245ea294ce5d4e9b8a87027dfdf4,011_S_0003,3,32237,MRI,1,9/1/2005,AD,9/12/2005,81.3,Male,18,Not Hisp/Latino,White,1,20,True,34,AD
1,2e89e352af743597b2368c412e0f6de2,022_S_0004,4,64631,MRI,1,9/22/2005,LMCI,11/8/2005,67.5,Male,10,Hisp/Latino,White,0,27,False,33,MCI
2,90419199306997753de8042f1fd55e38,011_S_0005,5,32246,MRI,1,9/2/2005,CN,9/7/2005,73.7,Male,16,Not Hisp/Latino,White,0,29,True,33,CN


In [156]:
#Train (75%), Test (25%) Split
num_ex = dat.shape[0]
num_features = dat.shape[1]

m_train = int(num_ex*0.75)
m_test = num_ex - m_train

print('# of training examples:', m_train, '\n# of test examples:', m_test)

# of training examples: 471 
# of test examples: 157


In [157]:
X = dat
Y = dat['DX.bl']

#Remove unnecessary columns (features), remove first 9 columns and 'Dx codes for submission'
remove_columns = list(X.columns)[0:9]
remove_columns.append('Dx Codes for Submission')
print('Removing columns:', remove_columns)

X = X.drop(remove_columns, 1)

features = list(X.columns)
X.head(5)

Removing columns: ['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality', 'Visit', 'Acq.Date', 'DX.bl', 'EXAMDATE', 'Dx Codes for Submission']


Unnamed: 0,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype
0,81.3,Male,18,Not Hisp/Latino,White,1,20,True,34
1,67.5,Male,10,Hisp/Latino,White,0,27,False,33
2,73.7,Male,16,Not Hisp/Latino,White,0,29,True,33
3,80.4,Female,13,Not Hisp/Latino,White,0,25,True,33
4,73.9,Female,12,Not Hisp/Latino,White,1,24,True,34


In [158]:
#Sklearn needs categorical features to be 1-hot encoded
numerical_vars = ['AGE', 'MMSE', 'PTEDUCAT']
cat_vars = list(set(features) - set(numerical_vars))

#for each categorical var, convert to 1-hot encoding
for var in cat_vars:
    print('Converting', var, 'to 1-hot encoding')
    
    #get 1-hot and replace original column with the >= 2 categories as columns
    one_hot_df = pd.get_dummies(X[var])
    X = pd.concat([X, one_hot_df], axis=1)
    X = X.drop(var, 1)
    
X.head(4)

Converting PTGENDER to 1-hot encoding
Converting PTETHCAT to 1-hot encoding
Converting APOE4 to 1-hot encoding
Converting PTRACCAT to 1-hot encoding
Converting APOE Genotype to 1-hot encoding
Converting imputed_genotype to 1-hot encoding


Unnamed: 0,AGE,PTEDUCAT,MMSE,Female,Male,Hisp/Latino,Not Hisp/Latino,Unknown,0,1,...,Black,White,"2,2","2,3","2,4","3,3","3,4","4,4",False,True
0,81.3,18,20,0,1,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,1
1,67.5,10,27,0,1,1,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
2,73.7,16,29,0,1,0,1,0,1,0,...,0,1,0,0,0,1,0,0,0,1
3,80.4,13,25,1,0,0,1,0,1,0,...,0,1,0,0,0,1,0,0,0,1


### Remember:
* PTGENDER: (Male, Female)
* PTETHCAT: (Hisp/Latino, Not Hisp/Latino, Unknown)
* PTRACCAT: (White, Black, Asian)
* APOE4: (1, 0)
* APOE4 Genotype: (22, 23, 24, 33, 34, 44)
* imputed_genotype (True, False)

In [159]:
#Convert to numpy array
X = np.array(X)
sanity_check = 0

#Normalize numerical variables to speed up convergence
for i in range(3):
    mean = np.mean(X[:, i])
    sd = np.std(X[:, i])
    print('\nNormalizing', numerical_vars[i], 'with mean=', format(mean, '.2f'), 'and sd=', format(sd, '.2f'))
    
    X[:, i] = (X[:, i] - mean) / sd
    sanity_check += np.mean(X[:, i])
    
print('\nSanity Check. Sum of all the means should be near 0:', sanity_check)


Normalizing AGE with mean= 75.17 and sd= 6.68

Normalizing MMSE with mean= 15.62 and sd= 2.98

Normalizing PTEDUCAT with mean= 26.92 and sd= 2.56

Sanity Check. Sum of all the means should be near 0: 9.871791989660626e-16


In [160]:
#Split data
X_train = X[0:m_train]
X_test = X[m_train:num_ex]

Y_train = Y[0:m_train]
Y_test = Y[m_train:num_ex]

print('Shape of training data X_train:', X_train.shape, 'and Y_train:', Y_train.shape)
print('Shape of test data X_test:', X_test.shape, 'and Y_test:', Y_test.shape)

Shape of training data X_train: (471, 22) and Y_train: (471,)
Shape of test data X_test: (157, 22) and Y_test: (157,)


## Anymore Preprocessing?
* Looking at distribution of features
* Model selection

## SVM Classifier
* See [here](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC) for more details

In [161]:
from sklearn.svm import LinearSVC

### SVM Setup
* SVM uses Hinge Loss
* Hinge Loss: $l_h = max(0, 1 - y * \mathbf{w^Tx}$)
* Regularization: $\lambda||\mathbf{w}||_2^2$

In [162]:
#Train classifier
clf = LinearSVC(penalty='l2', loss='hinge', multi_class='ovr', max_iter = 100000)  #ovr: 'One v. Rest/All'
clf.fit(X_train, Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=100000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [163]:
#Make predictions on test data
clf_predictions = clf.predict(X_test)
accuracy_vect = clf_predictions == Y_test

In [164]:
#Evaluate accuracy
print('Accuracy: ', format(np.sum(accuracy_vect)/m_test, '.2%'))

Accuracy:  56.05%


In [165]:
#56% Pretty bad, 33% is random,, circle back