In [9]:
import numpy as np
import matplotlib.pylab as plt

<h3> Alzheimer's Disease Classification </h3>

In [12]:
# Read the classification dataset from two plain-text files: a 2-D feature
# matrix and a label vector. The summary below treats rows as subjects and
# columns as features.
features = np.loadtxt('data/features_ad_classification.txt')
labels = np.loadtxt('data/labels_ad_classification.txt')

# Dataset dimensions, taken straight from the feature matrix.
dims = features.shape
print("Number of subjects: {}".format(dims[0]))
print("Number of features: {}".format(dims[1]))

# Class counts are obtained by summing the labels and their complement.
# NOTE(review): this presumes labels are coded 1 = AD and 0 = CN - confirm
# against the data description.
ad_total = labels.sum()
cn_total = (1 - labels).sum()
print("Number of AD cases: {}".format(ad_total))
print("Number of CN cases: {}".format(cn_total))

Number of subjects: 290
Number of features: 10242
Number of AD cases: 145.0
Number of CN cases: 145.0


In [15]:
# scikit-learn pieces used in this cell: the accuracy metric, the SVM
# implementation and the stratified K-fold splitter (each fold keeps the
# class ratio of the full dataset).
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import StratifiedKFold

# Support vector classifier with a linear kernel; the same object is re-fitted
# for the training-set check and for every cross-validation fold.
svml = svm.SVC(kernel='linear')

# Training-set accuracy: fit on all subjects and score on those same subjects.
svml.fit(features, labels)
preds = svml.predict(features)
train_accuracy = accuracy_score(labels, preds)
print("Prediction accuracy on the training set: {}\n".format(train_accuracy))

# 5-fold stratified cross-validation.
numFolds = 5
skf = StratifiedKFold(n_splits=numFolds)

# One held-out accuracy per fold.
acc_vec = np.zeros(numFolds)
for fold, (train_idx, test_idx) in enumerate(skf.split(features, labels)):
    # Fit on the training partition of this fold only.
    svml.fit(features[train_idx, :], labels[train_idx])

    # Predict the held-out partition and store its accuracy.
    preds_cv = svml.predict(features[test_idx, :])
    acc_vec[fold] = accuracy_score(labels[test_idx], preds_cv)

# Per-fold accuracies followed by their mean, used here as the estimate of
# generalization accuracy.
report_lines = (
    "Accuracies at different folds:",
    "=============================",
    "Linear SVM: {}".format(acc_vec),
    "\n",
    "Generalization accuracy estimates:",
    "=============================",
    "Linear SVM: {}".format(acc_vec.mean()),
)
for line in report_lines:
    print(line)


Prediction accuracy on the training set: 1.0

Accuracies at different folds:
Linear SVM: [ 0.86206897  0.93103448  0.9137931   0.75862069  0.84482759]


Generalization accuracy estimates:
Linear SVM: 0.8620689655172414


<h3> Age Regression </h3>

In [23]:
# Read the regression dataset from comma-separated files with np.loadtxt.
# The feature table is transposed after loading; the summary below then
# treats rows as subjects and columns as features.
feature_table = np.loadtxt('data/features_age_regression.csv', delimiter=',')
features = feature_table.T
# The targets (ages, per the printed summary) are read as a plain vector.
labels = np.loadtxt('data/labels_age_regression.csv', delimiter=',')

# Short description of the dataset: dimensions and age statistics.
dims = features.shape
print("Number of subjects: {}".format(dims[0]))
print("Number of features: {}".format(dims[1]))
print("Mean age in the dataset: {}".format(labels.mean()))
youngest = labels.min()
oldest = labels.max()
print("Min / Max age in the dataset: {}/{}".format(youngest, oldest))

Number of subjects: 335
Number of features: 45
Mean age in the dataset: 43.86865671641791
Min / Max age in the dataset: 18.0/94.0


In [27]:
# scikit-learn pieces used in this cell: the mean-squared-error metric, the
# linear models module (for LASSO) and the plain K-fold splitter.
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold

# LASSO regressor with scikit-learn's default settings; the same object is
# re-fitted for the training-set check and for every cross-validation fold.
lasso = linear_model.Lasso()

# Training-set fit quality: fit on all subjects and evaluate on those same
# subjects with RMSE and Pearson's correlation coefficient.
lasso.fit(features, labels)
preds = lasso.predict(features)
train_rmse = np.sqrt(mean_squared_error(labels, preds))
train_r = np.corrcoef(labels, preds)[0, 1]
print("RMSE on the training set: {}\n".format(train_rmse))
print("Pearson's correlation coefficient on the training set: {}\n".format(train_r))

# 5-fold cross-validation (no stratification: the target is continuous).
numFolds = 5
skf = KFold(n_splits=numFolds)

# One RMSE and one correlation coefficient per held-out fold.
rmse_vec = np.zeros(numFolds)
r_vec = np.zeros(numFolds)
for fold, (train_idx, test_idx) in enumerate(skf.split(features, labels)):
    # Fit on the training partition of this fold only.
    lasso.fit(features[train_idx, :], labels[train_idx])

    # Predict the held-out partition and score the predictions.
    preds_cv = lasso.predict(features[test_idx, :])
    held_out = labels[test_idx]
    rmse_vec[fold] = np.sqrt(mean_squared_error(held_out, preds_cv))
    r_vec[fold] = np.corrcoef(held_out, preds_cv)[0, 1]

# Per-fold metrics followed by their means, used here as the estimates of
# generalization performance.
report_lines = (
    "Accuracies at different folds:",
    "=============================",
    "LASSO - RMSE: {}".format(rmse_vec),
    "LASSO - r: {}".format(r_vec),
    "\n",
    "Generalization accuracy estimates:",
    "=============================",
    "LASSO - RMSE: {}".format(rmse_vec.mean()),
    "LASSO - r: {}".format(r_vec.mean()),
)
for line in report_lines:
    print(line)


RMSE on the training set: 13.274767541908702

Pearson's correlation coefficient on the training set: 0.8297023861280494

Accuracies at different folds:
LASSO - RMSE: [ 12.45859544  13.56866578  14.68075349  15.8606269   11.86957188]
LASSO - r: [ 0.83871396  0.82190906  0.82123691  0.75776068  0.84400263]


Generalization accuracy estimates:
LASSO - RMSE: 13.687642695718148
LASSO - r: 0.8167246479485119
