In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import csv
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit


In [196]:
# Model-specific imports:
from sklearn.tree  import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [115]:
inputData = pd.read_excel('../Volumetric_features.xlsx', engine='openpyxl').sample(frac = 1)
inputData.head()

Unnamed: 0,S.No,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus,Left-Caudate,Left-Putamen,Left-Pallidum,3rd-Ventricle,...,rh_supramarginal_thickness,rh_frontalpole_thickness,rh_temporalpole_thickness,rh_transversetemporal_thickness,rh_insula_thickness,rh_MeanThickness_thickness,BrainSegVolNotVent.2,eTIV.1,Age,dataset
3085,3086,10348.5,497.7,15116.0,63338.9,8737.1,4031.6,6053.7,1942.7,799.3,...,2.687,2.844,3.36,2.285,2.752,2.5941,1274852,1589753.137,21,5
1542,1543,29241.1,871.6,12983.8,61041.1,6683.4,4054.2,3900.9,2046.4,2830.8,...,2.604,2.709,3.648,2.353,2.732,2.4263,1048077,1622833.977,73,2
3549,3550,2731.8,177.4,15674.8,56194.0,8622.8,3142.0,5210.1,2125.6,922.9,...,2.504,3.082,3.762,2.302,3.189,2.54743,1138002,1480006.928,23,8
1775,1776,5647.7,443.5,19173.5,57051.3,8768.7,3027.8,4700.5,1898.0,934.0,...,2.545,2.73,3.814,2.574,2.938,2.4998,1227474,1475485.77,41,4
3204,3205,10389.0,731.1,12616.9,43590.4,6296.3,3406.4,4582.9,1685.4,1235.8,...,2.359,2.978,3.776,1.98,2.87,2.44898,959136,1224484.476,58,8


In [138]:
# select the input brain volume vector (X) and the output age vector (y)
X = inputData.drop(columns=['S.No', 'Age', 'dataset'])
y = inputData['Age']

# split the X and y vectors into training and testing dataset
# we're using 80% for training and 20% for testing, but this can change
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [187]:
# instantiate a decision tree model using sklearn:
dt_model = DecisionTreeRegressor(criterion='mse',max_depth=6, min_samples_leaf=4, min_samples_split=2)


# fit the model to the training data
dt_model.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [189]:
# check the model's performance on unseen, test data
dt_predictions = dt_model.predict(X_test)

# compare the predictions to the known values (y_test)
print('r square value:', metrics.r2_score(y_test, dt_predictions))
print('mean absolute error (years):', metrics.mean_absolute_error(y_test, dt_predictions))
print('mean square error:', metrics.mean_squared_error(y_test, dt_predictions))
print('root mean sqaure error (years):', np.sqrt(metrics.mean_squared_error(y_test, dt_predictions)))

r square value: 0.7343210234507795
mean absolute error (years): 7.185590045424659
mean square error: 105.79155466897721
root mean sqaure error (years): 10.285502159300595


In [186]:
param_dict={"criterion" : ['friedman_mse', 'mse', 'mae'],
            "max_depth" : range(1,10),
            "min_samples_split" : range(2,10), 
            "min_samples_leaf" : range(2,5)
           }
grid = GridSearchCV(dt_model, param_grid=param_dict, verbose=1, n_jobs=1)
grid.fit(X_train, y_train)
print(grid.best_params_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   20.7s finished


{'criterion': 'mse'}


### ADA Boost Regressor (based on DT)

In [205]:
# instantiate a voting regressor model using sklearn:
abr_model = AdaBoostRegressor(loss='linear', learning_rate= 1, n_estimators= 200)


# fit the model to the training data
abr_model.fit(X_train, y_train)


# check the model's performance on unseen, test data
abr_predictions = abr_model.predict(X_test)

# compare the predictions to the known values (y_test)
print('r square value:', metrics.r2_score(y_test, abr_predictions))
print('mean absolute error (years):', metrics.mean_absolute_error(y_test, abr_predictions))
print('mean square error:', metrics.mean_squared_error(y_test, abr_predictions))
print('root mean sqaure error (years):', np.sqrt(metrics.mean_squared_error(y_test, abr_predictions)))

r square value: 0.8333063763999193
mean absolute error (years): 6.557319592741623
mean square error: 66.3762628985842
root mean sqaure error (years): 8.147162874190267


In [203]:
param_dict={"loss" : ['linear', 'square', 'exponential'],
            "n_estimators" : [10, 50, 100],
            "learning_rate" : [0.5, 1, 5]
           }
grid = GridSearchCV(vr_model, param_grid=param_dict, verbose=1, n_jobs=1)
grid.fit(X_train, y_train)
print(grid.best_params_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  2.0min finished


{'learning_rate': 1, 'n_estimators': 100}


### Neural Network

In [None]:
!pip install tensorflow
import tensorflow as tf
import csv
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit

In [23]:
dataSet = inputData

X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)


maxEpoch = 200
epochs = 200
verbNum = 1


base_ann = tf.keras.models.Sequential([
            tf.keras.layers.Dense(units=20, activation='relu'),
            tf.keras.layers.Dense(units=40, activation='relu'),
            tf.keras.layers.Dense(units=60, activation='relu'),
            tf.keras.layers.Dense(units=1, activation='linear')
        ])

# use convolution layers
# drop outs, max pooling

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
base_ann.compile(loss="mse", optimizer=optimizer)
base_ann.fit(X_train, y_train, epochs=maxEpoch, batch_size=64, verbose=verbNum)
predictions = base_ann.predict(X_test)


print("rscore: ", metrics.r2_score(y_test, predictions))
print("MAE: ", metrics.mean_absolute_error(y_test, predictions))
print("MSE: ", np.sqrt(metrics.mean_squared_error(y_test, predictions)))


Train on 3380 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200

Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
rscore:  -7.884063327257378
MAE:  30.824984858613945
MSE:  60.43555346529464
