In [2]:
#Donald Dunagan
#811-648-053
#28 July 2018
#CSCI 6380 Term Project
#Extending the tracking of Parkinson’s symptoms 
#through telemonitored speech samples via advanced statistical methods

# Set up environment

In [3]:
#import libraries

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from scipy import stats

In [4]:
#import Parkinson's Dataset and visualize

PDDF =  pd.read_csv("C:\\Users\\gray\\Desktop\\CSCI6380\\Term_Paper\\Parkinson's_Data.csv")
print ('The shape of the dataframe is:',PDDF.shape)
PDDF.head()

The shape of the dataframe is: (5875, 22)


Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


#### Initial shuffling and splitting

In [5]:
#shuffle the data
PDDF = shuffle(PDDF)

#isolate features
features_to_drop = ['subject#','age','sex','test_time','motor_UPDRS','total_UPDRS']
just_features = PDDF.drop(features_to_drop,axis=1)
    
#isolate motor_UPDRS
just_motor = PDDF.motor_UPDRS
   
#isolate total_UPDRS
just_total = PDDF.total_UPDRS

#split into training and testing data
training_features = just_features[:5287]
testing_features = just_features[5287:]
    
training_motor = just_motor[:5287]
testing_motor = just_motor[5287:]
    
training_total = just_total[:5287]
testing_total = just_total[5287:] 

#### A function for shuffling and splitting the data

In [6]:
def shuffleSplit():
    '''
    Shuffles and splits the DataFrame     
    '''
    global PDDF
    
    #shuffle the data
    PDDF = shuffle(PDDF)

    #isolate features
    features_to_drop = ['subject#','age','sex','test_time','motor_UPDRS','total_UPDRS']
    global just_features
    just_features = PDDF.drop(features_to_drop,axis=1)
    
    #isolate motor_UPDRS
    global just_motor
    just_motor = PDDF.motor_UPDRS
   
    #isolate total_UPDRS
    global just_total
    just_total = PDDF.total_UPDRS

    #split into training and testing data
    global training_features
    training_features = just_features[:5287]
    global testing_features
    testing_features = just_features[5287:]
    
    global training_motor
    training_motor = just_motor[:5287]
    global testing_motor
    testing_motor = just_motor[5287:]
    
    global training_total
    training_total = just_total[:5287]
    global testing_total
    testing_total = just_total[5287:]  
    
    return

In [7]:
#cross validation needs to return the mean absolute error (MAE) for each fold
#so i have to make a custom scoring object

#create scorer object
myScorer = make_scorer(mean_absolute_error)

# CARTS

## First, get baselines for untuned trees

### 10-fold cross validated training MAE of an untuned CART targeting motor_UPDRS

In [7]:
#10-fold cross validated untuned regression tree targeting motor_UPDRS

tree = DecisionTreeRegressor()

#10-fold CV
vals = cross_val_score(tree,training_features,training_motor,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 6.601175357581009


### 10-fold cross validated training MAE of an untuned CART targeting total_UPDRS

In [8]:
#10-fold cross validated untuned regression tree targeting total_UPDRS

tree = DecisionTreeRegressor()

#10-fold CV
vals = cross_val_score(tree,training_features,training_total,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 8.68000020244386


### Tuneable parameters

In [9]:
#regression tree parameters which can be tuned
tree = DecisionTreeRegressor()
print("Tunable parameters: ")
tree.get_params()

Tunable parameters: 


{'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

## Tuning CART targeting motor_UPDRS

In [10]:
#instantiate a generic CART and use Grid Search to approximate optimal number of leaf nodes
paramsToTune = {
    'max_leaf_nodes': [80, 85, 90, 95, 100],
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_leaf_nodes': 80}


In [11]:
#instantiate a generic CART and use Grid Search to approximate optimal max depth
paramsToTune = {
    'max_depth': np.linspace(1,10,10, endpoint=True)
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_depth': 6.0}


In [12]:
#instantiate a generic CART and use Grid Search to approximate optimal minimum split
paramsToTune = {
    'min_samples_split':np.linspace(0.1, 1.0, 10, endpoint=True)
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'min_samples_split': 0.1}


In [13]:
#instantiate a generic CART and use Grid Search to approximate optimal minimum leaf size
paramsToTune = {
    'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'min_samples_leaf': 0.1}


In [14]:
#instantiate a generic CART and use Grid Search to approximate optimal max number of features
paramsToTune = {
        'max_features': [10,11,12,13,14,15,16]
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_features': 15}


In [15]:
#instantiate a generic CART and use Grid Search to find the optimal parameters
paramsToTune = {
    'max_depth': [2,3,4,5,6,7],
    'max_leaf_nodes':[130,140,150,160,170,180,190],
    'max_features': [10,11,12,13,14,15,16]
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print (GS.best_params_)

{'max_depth': 5, 'max_features': 11, 'max_leaf_nodes': 140}


### 10-fold cross validated training MAE of a tuned CART targeting motor_UPDRS

In [31]:
#train a CART with these tuned hyperparameters

tree = DecisionTreeRegressor(max_depth=5,max_features=11,max_leaf_nodes=140)

#10-fold CV
vals = cross_val_score(tree,training_features,training_motor,cv=10,scoring=myScorer)
print("The tuned CART has a 10-fold CV'd training motor_UPDRS MAE of {}".format(vals.mean()))

The tuned CART has a 10-fold CV'd training motor_UPDRS MAE of 6.067455367559535


## Tuning CART targeting total_UPDRS

In [16]:
#instantiate a generic CART and use Grid Search to approximate optimal number of leaf nodes
paramsToTune = {
    'max_leaf_nodes': [80, 85, 90, 95, 100],
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_leaf_nodes': 80}


In [17]:
#instantiate a generic CART and use Grid Search to approximate optimal max depth
paramsToTune = {
    'max_depth': np.linspace(1,10,10, endpoint=True)
}

tree = DecisionTreeRegressor()
GS= GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_depth': 6.0}


In [18]:
#instantiate a generic CART and use Grid Search to approximate optimal minimum split
paramsToTune = {
    'min_samples_split':np.linspace(0.1, 1.0, 10, endpoint=True)
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'min_samples_split': 0.1}


In [19]:
#instantiate a generic CART and use Grid Search to approximate optimal minimum leaf size
paramsToTune = {
    'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'min_samples_leaf': 0.1}


In [20]:
#instantiate a generic CART and use Grid Search to approximate optimal max number of features
paramsToTune = {
        'max_features': [10,11,12,13,14,15,16]
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_features': 11}


In [21]:
#instantiate a generic CART and use Grid Search to find the optimal parameters
paramsToTune = {
    'max_depth': [4,5,6,7,8,9,10,11],
    'max_leaf_nodes':[20,30,40,50,60],
    'max_features': [12,13,14,15,16]
}

tree = DecisionTreeRegressor()
GS = GridSearchCV(estimator=tree, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print (GS.best_params_)

{'max_depth': 7, 'max_features': 14, 'max_leaf_nodes': 40}


### 10-fold cross validated training MAE of a tuned CART targeting total_UPDRS

In [22]:
#train a CART with these tuned hyperparameters

tree = DecisionTreeRegressor(max_depth=7,max_features=14,max_leaf_nodes=40)

#10-fold CV
vals = cross_val_score(tree,training_features,training_total,cv=10,scoring=myScorer)
print("The tuned CART has a 10-fold CV'd training training_total MAE of {}".format(vals.mean()))

The tuned CART has a 10-fold CV'd training training_total MAE of 7.673857197177156


## Model testing
- Shuffle the data and split into training and testing (90%-10%)
- Fit the tuned CART to the training data and then calculate testing MAE on the test data
- Repeat 1,000 times, logging the MAE for each repetition

### Tuned CART targeting motor_UPDRS

In [32]:
motorCartErrs = []

for i in range(1000):
    
    #shuffle the data
    shuffleSplit()
    
    #train a CART tree with the tuned hyperparameters
    tree = DecisionTreeRegressor(max_depth=5,max_features=11,max_leaf_nodes=140)
    tree.fit(training_features,training_motor)
    #make predictions
    predicts = tree.predict(testing_features)

    #calculate MAE and add to list of errors
    motorCartErrs.append(mean_absolute_error(predicts,testing_motor))

In [1]:
confInterval = stats.norm.interval(0.65, loc=np.mean(motorCartErrs), scale=np.std(motorCartErrs))
val = np.mean(motorCartErrs)-confInterval[0]
print("After 1,000  runs, the test MAE for motor_UPDRS is {} ± {} points".format(np.mean(motorCartErrs),val))

NameError: name 'stats' is not defined

### Tuned CART targeting total_UPDRS

In [26]:
totalCartErrs = []

for i in range(1000):
    
    #shuffle the data
    shuffleSplit()
    
    #train a CART tree with the tuned hyperparameters
    tree = DecisionTreeRegressor(max_depth=7,max_features=14,max_leaf_nodes=40)
    tree.fit(training_features,training_total)
    #make predictions
    predicts = tree.predict(testing_features)

    #calculate MAE and add to list of errors
    totalCartErrs.append(mean_absolute_error(predicts,testing_total))

In [41]:
confInterval = stats.norm.interval(0.65, loc=np.mean(totalCartErrs), scale=np.std(totalCartErrs))
val = np.mean(totalCartErrs)-confInterval[0]
print("After 1,000 CV runs, the test MAE for total_UPDRS is {} ± {} points".format(np.mean(totalCartErrs),val))

After 1,000 CV runs, the test MAE for total_UPDRS is 7.628095453003876 ± 0.2236975975263613 points


# Random Forests

## First, get baselines for untuned RFs

### 10-fold cross validated training MAE of an untuned RF targeting motor_UPDRS

In [7]:
#10-fold cross validated untuned RF targeting motor_UPDRS

forest = RandomForestRegressor()

#10-fold CV
vals = cross_val_score(forest,training_features,training_motor,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 5.351676342026837


### 10-fold cross validated training MAE of an untuned RF targeting total_UPDRS

In [8]:
#10-fold cross validated untuned RF targeting motor_UPDRS

forest = RandomForestRegressor()

#10-fold CV
vals = cross_val_score(forest,training_features,training_total,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 6.845199657966693


### Tuneable parameters

In [9]:
#RF parameters which can be tuned
forest = RandomForestRegressor()
print("Tunable parameters: ")
forest.get_params()

Tunable parameters: 


{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Tuning RF targeting motor_UPDRS

In [10]:
#instantiate a generic RF and use Grid Search to approximate optimal max depth
paramsToTune = {
    'max_depth': [80, 85, 90, 95, 100],
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_depth': 100}


In [11]:
#instantiate a generic RF and use Grid Search to approximate optimal max number of leaf nodes
paramsToTune = {
    'max_leaf_nodes': [220,230,240,250,260],
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_leaf_nodes': 260}


In [12]:
#instantiate a generic RF and use Grid Search to approximate optimal max number of features
paramsToTune = {
    'max_features': [11,12,13,14,15,16],
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_features': 14}


In [13]:
#instantiate a generic RF and use Grid Search to approximate optimal max number of estimators in the forest
paramsToTune = {
    'n_estimators': [10,50,100],
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'n_estimators': 100}


In [12]:
#instantiate a generic RF and use Grid Search to find the optimal parameters
paramsToTune = {
    'max_depth': [45,50,55,60,65,70,75],
    'max_leaf_nodes':[230,240,250,260,270,280,290],
    'max_features': [12,13,14,15,16]
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print (GS.best_params_)

{'max_depth': 65, 'max_features': 16, 'max_leaf_nodes': 250}


### 10-fold cross validated training MAE of a tuned RF targeting motor_UPDRS

In [14]:
#train an RF with these tuned hyperparameters

forest = RandomForestRegressor(max_depth=65,max_features=16,max_leaf_nodes=250,n_estimators=10)

#10-fold CV
vals = cross_val_score(forest,training_features,training_motor,cv=10,scoring=myScorer)
print("The tuned RF has a 10-fold CV'd training motor_UPDRS MAE of {}".format(vals.mean()))

The tuned RF has a 10-fold CV'd training motor_UPDRS MAE of 5.33584851115302


## Tuning RF targeting total_UPDRS

In [19]:
#instantiate a generic RF and use Grid Search to approximate optimal max depth
paramsToTune = {
    'max_depth': [80,85,90,95],
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_depth': 95}


In [20]:
#instantiate a generic RF and use Grid Search to approximate optimal max number of leaf nodes
paramsToTune = {
    'max_leaf_nodes': [780,790,800,810,820]
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_leaf_nodes': 790}


In [21]:
#instantiate a generic RF and use Grid Search to approximate optimal max number of features
paramsToTune = {
    'max_features': [11,12,13,14,15,16],
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'max_features': 14}


In [13]:
#instantiate a generic RF and use Grid Search to find the optimal parameters
paramsToTune = {
    'max_depth': [10,15,20,25,30,35,40],
    'max_leaf_nodes':[630,640,650,660,670,680,690],
    'max_features': [12,13,14,15,16]
}

forest = RandomForestRegressor()
GS = GridSearchCV(estimator=forest, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print (GS.best_params_)

{'max_depth': 20, 'max_features': 15, 'max_leaf_nodes': 650}


### 10-fold cross validated training MAE of a tuned RF targeting motor_UPDRS

In [15]:
#train an RF with these tuned hyperparameters

forest = RandomForestRegressor(max_depth=20,max_features=15,max_leaf_nodes=650,n_estimators=10)

#10-fold CV
vals = cross_val_score(forest,training_features,training_total,cv=10,scoring=myScorer)
print("The tuned RF has a 10-fold CV'd training total_UPDRS MAE of {}".format(vals.mean()))

The tuned RF has a 10-fold CV'd training total_UPDRS MAE of 6.767816367379036


## Model testing
- Shuffle the data and split into training and testing (90%-10%)
- Fit the tuned RF to the training data and then calculate testing MAE on the test data
- Repeat 100 times, logging the MAE for each repetition

### Tuned RF targeting motor_UPDRS

In [16]:
motorRfErrs = []

for i in range(100):
    
    #shuffle the data
    shuffleSplit()
    
    #train an RF with the tuned hyperparameters
    forest = RandomForestRegressor(max_depth=65,max_features=16,max_leaf_nodes=250,n_estimators=100)
    forest.fit(training_features,training_motor)
    #make predictions
    predicts = forest.predict(testing_features)

    #calculate MAE and add to list of errors
    motorRfErrs.append(mean_absolute_error(predicts,testing_motor))

In [17]:
confInterval = stats.norm.interval(0.65, loc=np.mean(motorRfErrs), scale=np.std(motorRfErrs))
val = np.mean(motorRfErrs)-confInterval[0]
print("After 100 CV runs, the test MAE for motor_UPDRS is {} ± {} points".format(np.mean(motorRfErrs),val))

After 100 CV runs, the test MAE for motor_UPDRS is 5.17460477239424 ± 0.12504720567955552 points


### Tuned RF targeting total_UPDRS

In [12]:
totalRfErrs = []

for i in range(100):
    
    #shuffle the data
    shuffleSplit()
    
    #train an RF with the tuned hyperparameters
    forest = RandomForestRegressor(max_depth=20,max_features=15,max_leaf_nodes=650,n_estimators=100)
    forest.fit(training_features,training_total)
    #make predictions
    predicts = forest.predict(testing_features)

    #calculate MAE and add to list of errors
    totalRfErrs.append(mean_absolute_error(predicts,testing_total))

In [16]:
confInterval = stats.norm.interval(0.65, loc=np.mean(totalRfErrs), scale=np.std(totalRfErrs))
val = np.mean(totalRfErrs)-confInterval[0]
print("After 100 CV runs, the test MAE for total_UPDRS is {} ± {} points".format(np.mean(totalRfErrs),val))

After 100 CV runs, the test MAE for total_UPDRS is 6.466393946774503 ± 0.20589660842942692 points


# AdaBoosted Decision Trees

## First, get baselines for untuned ABDTs

### 10-fold cross validated training MAE of an untuned ABDT targeting motor_UPDRS

In [29]:
#10-fold cross validated untuned ABDT targeting motor_UPDRS

abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())

#10-fold CV
vals = cross_val_score(abr,training_features,training_motor,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 4.682722270407288


### 10-fold cross validated training MAE of an untuned ABDT targeting total_UPDRS

In [30]:
#10-fold cross validated untuned ABDT targeting motor_UPDRS

abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())

#10-fold CV
vals = cross_val_score(abr,training_features,training_total,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 5.960161880012317


### Tuneable parameters

In [31]:
#ABDT parameters which can be tuned
abr = AdaBoostRegressor()
print("Tunable parameters: ")
abr.get_params()

Tunable parameters: 


{'base_estimator': None,
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': None}

## Tuning ABDT targeting motor_UPDRS

In [32]:
#instantiate ABDT and use Grid Search to approximate optimal base estimator
paramsToTune = {
    'base_estimator': [DecisionTreeRegressor(max_depth=1),DecisionTreeRegressor(max_depth=2),DecisionTreeRegressor(max_depth=3),DecisionTreeRegressor(max_depth=4)],
}

abr = AdaBoostRegressor()
GS = GridSearchCV(estimator=abr, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'base_estimator': DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')}


In [33]:
#instantiate ABDT and use Grid Search to approximate optimal learning rate
paramsToTune = {
    'learning_rate': [1,2,3,4,5],
}

abr = AdaBoostRegressor()
GS = GridSearchCV(estimator=abr, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'learning_rate': 1}


In [34]:
#instantiate ABDT and use Grid Search to approximate optimal learning rate
paramsToTune = {
    'n_estimators': [50,100,150],
}

abr = AdaBoostRegressor()
GS = GridSearchCV(estimator=abr, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'n_estimators': 50}


In [35]:
#instantiate ABDT and use Grid Search to find the optimal parameters
paramsToTune = {
    'base_estimator': [DecisionTreeRegressor(max_depth=1),DecisionTreeRegressor(max_depth=2),DecisionTreeRegressor(max_depth=3),DecisionTreeRegressor(max_depth=4)],
    'learning_rate': [1,2,3],
    'n_estimators': [15,20,30]
}

abr = AdaBoostRegressor()
GS = GridSearchCV(estimator=abr, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'base_estimator': DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'), 'learning_rate': 1, 'n_estimators': 15}


### 10-fold cross validated training MAE of an ABDT targeting motor_UPDRS

In [36]:
#train an ABDT

abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())

#10-fold CV
vals = cross_val_score(abr,training_features,training_motor,cv=10,scoring=myScorer)
print("The ABDT has a 10-fold CV'd training motor_UPDRS MAE of {}".format(vals.mean()))

The ABDT has a 10-fold CV'd training motor_UPDRS MAE of 4.700517960810849


### 10-fold cross validated training MAE of an ABDT targeting total_UPDRS

In [None]:
#train an ABDT

abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())

#10-fold CV
vals = cross_val_score(abr,training_features,training_total,cv=10,scoring=myScorer)
print("The ABDT has a 10-fold CV'd training total_UPDRS MAE of {}".format(vals.mean()))

## Model testing
- Shuffle the data and split into training and testing (90%-10%)
- Fit the tuned ABDT to the training data and then calculate testing MAE on the test data
- Repeat 100 times, logging the MAE for each repetition

In [8]:
motorAbdtErrs = []

for i in range(100):
    
    #shuffle the data
    shuffleSplit()
    
    #train an ABDT
    abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())
    abr.fit(training_features,training_motor)
    #make predictions
    predicts = abr.predict(testing_features)

    #calculate MAE and add to list of errors
    motorAbdtErrs.append(mean_absolute_error(predicts,testing_motor))

In [13]:
confInterval = stats.norm.interval(0.65, loc=np.mean(motorAbdtErrs), scale=np.std(motorAbdtErrs))
val = np.mean(motorAbdtErrs)-confInterval[0]
print("After 100 CV runs, the test MAE for motor_UPDRS is {} ± {} points".format(np.mean(motorAbdtErrs),val))

After 100 CV runs, the test MAE for motor_UPDRS is 4.7014933667656225 ± 0.15777313384950986 points


In [9]:
totalAbdtErrs = []

for i in range(100):
    
    #shuffle the data
    shuffleSplit()
    
    #train an ABDT
    abr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())
    abr.fit(training_features,training_total)
    #make predictions
    predicts = abr.predict(testing_features)

    #calculate MAE and add to list of errors
    totalAbdtErrs.append(mean_absolute_error(predicts,testing_total))

In [14]:
confInterval = stats.norm.interval(0.65, loc=np.mean(totalAbdtErrs), scale=np.std(totalAbdtErrs))
val = np.mean(totalAbdtErrs)-confInterval[0]
print("After 100 CV runs, the test MAE for total_UPDRS is {} ± {} points".format(np.mean(totalAbdtErrs),val))

After 100 CV runs, the test MAE for motor_UPDRS is 5.9492480235322125 ± 0.2122657693193828 points


# K-Nearest Neighbors

## First, get baselines for untuned KNNS

### 10-fold cross validated training MAE of an untuned KNN targeting motor_UPDRS

In [7]:
#10-fold cross validated untuned KNN targeting motor_UPDRS

KNN = KNeighborsRegressor()

#10-fold CV
vals = cross_val_score(KNN,training_features,training_motor,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 6.549629210459988


### 10-fold cross validated training MAE of an untuned KNN targeting total_UPDRS

In [8]:
#10-fold cross validated untuned KNN targeting motor_UPDRS

KNN = KNeighborsRegressor()

#10-fold CV
vals = cross_val_score(KNN,training_features,training_total,cv=10, scoring=myScorer)
print("MAE: {}".format(vals.mean()))

MAE: 8.36904170352151


### Tuneable parameters

In [9]:
#KNN parameters which can be tuned
KNN = KNeighborsRegressor()
print("Tunable parameters: ")
KNN.get_params()

Tunable parameters: 


{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

## Tuning KNN targeting motor_UPDRS

In [10]:
#instantiate KNN and use Grid Search to approximate optimal base algorithm
paramsToTune = {
    'algorithm': ['auto','ball_tree','kd_tree','brute'],
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'algorithm': 'auto'}


In [11]:
#instantiate KNN and use Grid Search to approximate optimal weighting
paramsToTune = {
    'weights': ['uniform','distance'],
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'weights': 'uniform'}


In [12]:
#instantiate KNN and use Grid Search to approximate optimal num_neighbors
paramsToTune = {
    'n_neighbors': [18,19,20,21,22]
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'n_neighbors': 18}


In [15]:
#instantiate KNN and use Grid Search to find the optimal parameters
paramsToTune = {
    'n_neighbors': [18,19,20,21,22,],
    'weights': ['uniform','distance'],
    'algorithm': ['auto','ball_tree','kd_tree','brute']
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_motor)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'algorithm': 'auto', 'n_neighbors': 21, 'weights': 'distance'}


### 10-fold cross validated training MAE of tuned KNN targeting motor_UPDRS

In [24]:
#train KNN

KNN = KNeighborsRegressor(n_neighbors=21,weights='distance')

#10-fold CV
vals = cross_val_score(KNN,training_features,training_motor,cv=10,scoring=myScorer)
print("The KNN has a 10-fold CV'd training motor_UPDRS MAE of {}".format(vals.mean()))

The KNN has a 10-fold CV'd training motor_UPDRS MAE of 6.419689539033969


## Tuning KNN targeting total_UPDRS

In [17]:
#instantiate KNN and use Grid Search to approximate optimal base algorithm
paramsToTune = {
    'algorithm': ['auto','ball_tree','kd_tree','brute'],
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'algorithm': 'auto'}


In [18]:
#instantiate KNN and use Grid Search to approximate optimal weighting
paramsToTune = {
    'weights': ['uniform','distance'],
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'weights': 'distance'}


In [19]:
#instantiate KNN and use Grid Search to approximate optimal num_neighbors
paramsToTune = {
    'n_neighbors': [18,19,20,21,22]
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'n_neighbors': 19}


In [20]:
#instantiate KNN and use Grid Search to find the optimal parameters
paramsToTune = {
    'n_neighbors': [18,19,20,21,22,],
    'weights': ['uniform','distance'],
    'algorithm': ['auto','ball_tree','kd_tree','brute']
}

KNN = KNeighborsRegressor()
GS = GridSearchCV(estimator=KNN, param_grid=paramsToTune, cv=10)
GS.fit(training_features,training_total)
print ('Approximate optimum: {}'.format(GS.best_params_))

Approximate optimum: {'algorithm': 'auto', 'n_neighbors': 19, 'weights': 'distance'}


### 10-fold cross validated training MAE of tuned KNN targeting motor_UPDRS

In [21]:
#train KNN

KNN = KNeighborsRegressor(n_neighbors=19,weights='distance')

#10-fold CV
vals = cross_val_score(KNN,training_features,training_total,cv=10,scoring=myScorer)
print("The KNN has a 10-fold CV'd training total_UPDRS MAE of {}".format(vals.mean()))

The KNN has a 10-fold CV'd training total_UPDRS MAE of 8.100647739120332


## Model testing
- Shuffle the data and split into training and testing (90%-10%)
- Fit the tuned KNN to the training data and then calculate testing MAE on the test data
- Repeat 1000 times, logging the MAE for each repetition

In [8]:
motorKnnErrs = []

for i in range(1000):
    
    #shuffle the data
    shuffleSplit()
    
    #train KNN
    KNN = KNeighborsRegressor(n_neighbors=21,weights='distance')
    KNN.fit(training_features,training_motor)
    #make predictions
    predicts = KNN.predict(testing_features)

    #calculate MAE and add to list of errors
    motorKnnErrs.append(mean_absolute_error(predicts,testing_motor))

In [9]:
confInterval = stats.norm.interval(0.65, loc=np.mean(motorKnnErrs), scale=np.std(motorKnnErrs))
val = np.mean(motorKnnErrs)-confInterval[0]
print("After 1,000 CV runs, the test MAE for motor_UPDRS is {} ± {} points".format(np.mean(motorKnnErrs),val))

After 1,000 CV runs, the test MAE for motor_UPDRS is 6.380328352912919 ± 0.15703171816412986 points


In [10]:
totalKnnErrs = []

for i in range(1000):
    
    #shuffle the data
    shuffleSplit()
    
    #train KNN
    KNN = KNeighborsRegressor(n_neighbors=19,weights='distance')
    KNN.fit(training_features,training_total)
    #make predictions
    predicts = KNN.predict(testing_features)

    #calculate MAE and add to list of errors
    totalKnnErrs.append(mean_absolute_error(predicts,testing_total))

In [11]:
confInterval = stats.norm.interval(0.65, loc=np.mean(totalKnnErrs), scale=np.std(totalKnnErrs))
val = np.mean(totalKnnErrs)-confInterval[0]
print("After 1,000 CV runs, the test MAE for total_UPDRS is {} ± {} points".format(np.mean(totalKnnErrs),val))

After 1,000 CV runs, the test MAE for motor_UPDRS is 8.061919189090547 ± 0.2129532679595556 points
