### Multi-Class Classification 
* See [Alzheimer's Prediction](https://github.com/grantgasser/Alzheimers-Prediction) for more detail

In [9]:
import numpy as np
import sklearn as sk
import pandas as pd
import os

In [10]:
#Read data
dat = pd.read_csv('AD_Challenge_Training_Data_Clinical_Updated_7.22.2014/ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv')
dat.shape[0]

#Check NA values, 1 NA for imputed_genotype
dat.isnull().sum()

directory.id               0
Subject                    0
RID                        0
Image.Data.ID              0
Modality                   0
Visit                      0
Acq.Date                   0
DX.bl                      0
EXAMDATE                   0
AGE                        0
PTGENDER                   0
PTEDUCAT                   0
PTETHCAT                   0
PTRACCAT                   0
APOE4                      0
MMSE                       0
imputed_genotype           1
APOE Genotype              0
Dx Codes for Submission    0
dtype: int64

In [66]:
#Peek at data
dat.head(3)

Unnamed: 0,directory.id,Subject,RID,Image.Data.ID,Modality,Visit,Acq.Date,DX.bl,EXAMDATE,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype,Dx Codes for Submission
0,4702245ea294ce5d4e9b8a87027dfdf4,011_S_0003,3,32237,MRI,1,9/1/2005,AD,9/12/2005,81.3,Male,18,Not Hisp/Latino,White,1,20,True,34,AD
1,2e89e352af743597b2368c412e0f6de2,022_S_0004,4,64631,MRI,1,9/22/2005,LMCI,11/8/2005,67.5,Male,10,Hisp/Latino,White,0,27,False,33,MCI
2,90419199306997753de8042f1fd55e38,011_S_0005,5,32246,MRI,1,9/2/2005,CN,9/7/2005,73.7,Male,16,Not Hisp/Latino,White,0,29,True,33,CN


In [11]:
#Train (75%), Test (25%) Split
num_ex = dat.shape[0]
num_features = dat.shape[1]

m_train = int(num_ex*0.75)
m_test = num_ex - m_train

print('# of training examples:', m_train, '\n# of test examples:', m_test)

# of training examples: 471 
# of test examples: 157


In [134]:
from sklearn import datasets
iris = datasets.load_iris()
X_iris, y_iris = iris.data, iris.target
X_iris[1:3]

array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [85]:
X = dat
Y = dat['DX.bl']

#Remove unnecessary columns (features), remove first 9 columns and 'Dx codes for submission'
remove_columns = list(X.columns)[0:9]
remove_columns.append('Dx Codes for Submission')
print('Removing columns:', remove_columns)

X = X.drop(remove_columns, 1)

features = list(X.columns)
X.head(5)

Removing columns: ['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality', 'Visit', 'Acq.Date', 'DX.bl', 'EXAMDATE', 'Dx Codes for Submission']


Unnamed: 0,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype
0,81.3,Male,18,Not Hisp/Latino,White,1,20,True,34
1,67.5,Male,10,Hisp/Latino,White,0,27,False,33
2,73.7,Male,16,Not Hisp/Latino,White,0,29,True,33
3,80.4,Female,13,Not Hisp/Latino,White,0,25,True,33
4,73.9,Female,12,Not Hisp/Latino,White,1,24,True,34


In [125]:
#Sklearn needs categorical features to be 1-hot encoded
numerical_vars = ['AGE', 'MMSE', 'PTEDUCAT']
cat_vars = list(set(features) - set(numerical_vars))

#for each categorical var, convert to 1-hot encoding
for var in cat_vars:
    print('Converting', var, 'to 1-hot encoding')
    print(pd.get_dummies(X[var]).head(3))
    


Converting imputed_genotype to 1-hot encoding
   False  True 
0      0      1
1      1      0
2      0      1
Converting APOE4 to 1-hot encoding
   0  1  2
0  0  1  0
1  1  0  0
2  1  0  0
Converting APOE Genotype to 1-hot encoding
   2,2  2,3  2,4  3,3  3,4  4,4
0    0    0    0    0    1    0
1    0    0    0    1    0    0
2    0    0    0    1    0    0
Converting PTGENDER to 1-hot encoding
   Female  Male
0       0     1
1       0     1
2       0     1
Converting PTETHCAT to 1-hot encoding
   Hisp/Latino  Not Hisp/Latino  Unknown
0            0                1        0
1            1                0        0
2            0                1        0
Converting PTRACCAT to 1-hot encoding
   Asian  Black  White
0      0      0      1
1      0      0      1
2      0      0      1


In [104]:
#Split data
X_train = X[0:m_train]
X_test = X[m_train:num_ex]

Y_train = Y[0:m_train]
Y_test = Y[m_train:num_ex]

#print('Shape of training data X_train:', X_train.shape, 'and Y_train:', Y_train.shape)
#print('Shape of test data X_test:', X_test.shape, 'and Y_test:', Y_test.shape)

Shape of training data X_train: (471, 9) and Y_train: (471,)
Shape of test data X_test: (157, 9) and Y_test: (157,)


## Anymore Preprocessing?
* Feature Normalization (for numerical features)
* Looking at distribution of features
* Model selection

## Stochastic Gradient Descent Classifier
* See [here](https://scikit-learn.org/stable/modules/sgd.html#classification) for more details

In [105]:
from sklearn.linear_model import SGDClassifier

### SGD Setup (SVM)
* SVM uses Hinge Loss
* Hinge Loss: $l_h = max(0, 1 - y * \mathbf{w^Tx}$)
* Regularization: $\lambda||\mathbf{w}||_2^2$

In [107]:
#Train classifier
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X_train, Y_train)



ValueError: could not convert string to float: '3,4'