# Perform Classification on the ADNI dataset using Decision Trees:

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import numpy as np
import time
try:
    import cPickle as thepickle
except ImportError:
    import _pickle as thepickle
from IPython.display import display
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from scipy.interpolate import interp1d
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score


import sys
import os
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = ('ignore::UserWarning,ignore::ConvergenceWarning,ignore::RuntimeWarning')



### Read the data:

Instead of reading the whole database, we read only the data that's useful to us. That is, we read only specific columns of data, and we take only the row containing the first scan for each person. 

In "ADNI Regressional Analysis.ipynb" we have done that exactly, as well as performed linear regression transformation to the imaging data, in order to remove any age, sex, and DLICV_baseline effect. 

Furthermore, in "ADNI OPNMF.ipynb" we have performed dimensionality reduction through the OPNMF method, reducing the number of the ROIs from 145 to just 18. (Hasn't been done, so this does not apply)

Additionally, in "ADNI DeepCCA initial.ipynb" we have transformed the imaging and the genetic data using Deep Canonical Correlation Analysis to dimensionally reduced, maximally linearly correlated data. 

The data is located at "./DATA/ADNI_dataset.csv"
The transformed through LR data is located at "./DATA/Linearly_Transformed_Unique_Dataset.pkl"
The further transormed (through DCCA) data is located at "./DATA/ADNI_initial_DCCA_features.pkl"

(Need to run the RA and DCCA code if data is not found)

In [32]:
# Read the original data:
data = pd.read_csv("DATA/ADNI_dataset.csv", low_memory=False) # Need the low_memory or dtypes warning
data.replace({'Sex':{'F':1, 'M':0}}, inplace=True)

# The columns that interest us are the sex and age related, the ROIs, as well as the genetic data:
columns_of_interest = ['PTID',
                       'Date',
                       'Age', 
                       'Sex',
                       'DLICV_baseline',
                       'APOE4_Alleles',
                       'APOE_Genotype',
                       'Diagnosis_nearest_2.0']
c = list(data.columns)
MRI_columns = c[c.index("MUSE_Volume_4"):c.index("MUSE_Volume_207")+1]
genetic_columns = c[c.index("rs4575098"):c.index("rs429358")+1]

columns_of_interest += MRI_columns + genetic_columns

# Need the dropna because some first PTIDs have no MRI
data_of_interest = data[columns_of_interest].dropna(subset=['MUSE_Volume_4', 'DLICV_baseline'])


unique = data_of_interest.drop_duplicates(subset=['PTID'], keep='first')
u = unique.dropna() # only 2 values in Diagnosis_nearest_2.0' have NaN, easier to drop them:
unique = u
unique['Diagnosis_nearest_2.0'] = unique['Diagnosis_nearest_2.0'].astype('category')
unique['Diagnosis_nearest_2.0_cat'] = unique['Diagnosis_nearest_2.0'].cat.codes
print(unique.shape)
unique.head(15)

(1567, 208)


Unnamed: 0,PTID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,MUSE_Volume_4,MUSE_Volume_11,...,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358,Diagnosis_nearest_2.0_cat
0,002_S_0295,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,1873.124153,1586.249283,...,1,1,1,0,0,0,0,0,1,0
9,002_S_0413,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,2131.516933,1505.034469,...,0,1,1,0,1,0,0,0,0,0
24,002_S_0559,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,2366.71768,3157.732947,...,0,0,0,0,1,0,0,0,0,0
31,002_S_0619,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,5124.734093,2981.605944,...,0,0,0,1,1,0,0,0,2,1
36,002_S_0685,2006-07-06,89.561644,1,1372862.125,0.0,E3/E3,CN,2941.520445,1693.826402,...,1,1,1,0,0,0,0,0,0,0
45,002_S_0729,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,966.09517,1921.643449,...,0,0,0,1,1,0,0,0,1,2
64,002_S_0816,2006-08-30,70.767123,0,1444128.125,2.0,E4/E4,Dementia,1427.160586,1604.163157,...,0,0,0,0,1,0,0,0,2,1
69,002_S_0938,2006-10-05,82.167123,1,1309685.0,0.0,E3/E3,Dementia,1931.131939,1136.952611,...,0,1,1,0,1,0,0,0,0,1
74,002_S_0954,2006-10-10,69.19863,1,1075661.5,1.0,E3/E4,MCI,707.696352,2621.956978,...,2,1,1,0,1,0,0,0,1,2
81,002_S_0955,2006-10-11,78.161644,1,1363607.0,1.0,E3/E4,Dementia,2681.014413,1374.257191,...,1,0,0,0,1,0,0,0,1,1


In [33]:
# Read the data transformed through the Regressional Analysis:
lr_data = pd.read_pickle("./DATA/Linearly_Transformed_Unique_Dataset.pkl")
print(lr_data.shape)
lr_data.head(15)

(1302, 209)


Unnamed: 0,PTID,MRID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,MUSE_Volume_4,...,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358,Diagnosis_nearest_2.0_cat
0,002_S_0295,002_S_0295_2006-04-18,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,-401.428503,...,1,1,1,0,0,0,0,0,1,0
9,002_S_0413,002_S_0413_2006-05-02,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,596.355045,...,0,1,1,0,1,0,0,0,0,0
24,002_S_0559,002_S_0559_2006-05-23,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,224.87456,...,0,0,0,0,1,0,0,0,0,0
31,002_S_0619,002_S_0619_2006-06-01,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,2633.277779,...,0,0,0,1,1,0,0,0,2,1
45,002_S_0729,002_S_0729_2006-07-17,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,256.289641,...,0,0,0,1,1,0,0,0,1,2
64,002_S_0816,002_S_0816_2006-08-30,2006-08-30,70.767123,0,1444128.125,2.0,E4/E4,Dementia,-126.260419,...,0,0,0,0,1,0,0,0,2,1
69,002_S_0938,002_S_0938_2006-10-05,2006-10-05,82.167123,1,1309685.0,0.0,E3/E3,Dementia,200.102369,...,0,1,1,0,1,0,0,0,0,1
74,002_S_0954,002_S_0954_2006-10-10,2006-10-10,69.19863,1,1075661.5,1.0,E3/E4,MCI,-60.539913,...,2,1,1,0,1,0,0,0,1,2
81,002_S_0955,002_S_0955_2006-10-11,2006-10-11,78.161644,1,1363607.0,1.0,E3/E4,Dementia,1058.028132,...,1,0,0,0,1,0,0,0,1,1
84,002_S_1018,002_S_1018_2006-11-29,2006-11-29,70.658904,1,1355603.0,0.0,E3/E3,Dementia,-485.048304,...,1,1,1,0,0,0,0,0,0,1


In [34]:
# Create a new dataset and drop the imaging and genetic data:
c = list(lr_data.columns)
MRI_columns = c[c.index("MUSE_Volume_4"):c.index("MUSE_Volume_207")+1]
genetic_columns = c[c.index("rs4575098"):c.index("rs429358")+1]
columns_to_drop = MRI_columns + genetic_columns
dcca_data = lr_data.drop(labels = columns_to_drop, axis=1)

# Read the data transformed through DCCA:
with open("./DATA/ADNI_initial_DCCA_features_300_4.pkl", 'rb') as f:
    dcca_transformed_data_file = pickle.load(f)
transformed_imaging_data = dcca_transformed_data_file[0]
transformed_genetic_data = dcca_transformed_data_file[1]
print("Transformed imaging data dimensions: \n" , transformed_imaging_data.shape)
print("Transformed genetic data dimensions: \n" , transformed_genetic_data.shape)

# Embed them into the new dataset:
imaging_labels = ["imaging_component_"+str(x+1) for x in range(transformed_imaging_data.shape[1])] 
genetic_labels = ["genetic_component_"+str(x+1) for x in range(transformed_genetic_data.shape[1])] 
dcca_data[genetic_labels] = transformed_genetic_data
dcca_data[imaging_labels] = transformed_imaging_data
print("DCCA Data Dimensions: \n",dcca_data.shape)
dcca_data.head(15)

Transformed imaging data dimensions: 
 (1302, 300)
Transformed genetic data dimensions: 
 (1302, 300)
DCCA Data Dimensions: 
 (1302, 610)


Unnamed: 0,PTID,MRID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,Diagnosis_nearest_2.0_cat,...,imaging_component_291,imaging_component_292,imaging_component_293,imaging_component_294,imaging_component_295,imaging_component_296,imaging_component_297,imaging_component_298,imaging_component_299,imaging_component_300
0,002_S_0295,002_S_0295_2006-04-18,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,0,...,-0.059044,1.083234,-0.430268,1.09149,0.159566,-2.281677,-1.381328,-0.790197,2.888934,2.863482
9,002_S_0413,002_S_0413_2006-05-02,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,0,...,2.031326,-1.779694,-0.857111,0.321846,-0.289536,-1.963561,-1.312737,0.904873,1.56282,4.308747
24,002_S_0559,002_S_0559_2006-05-23,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,0,...,2.248068,-0.112495,-0.292899,0.915463,0.627108,-1.011987,-0.791272,-1.478453,2.00791,3.44824
31,002_S_0619,002_S_0619_2006-06-01,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,1,...,-1.189468,-1.365852,1.741578,0.149422,1.127712,0.439175,2.208321,1.431495,-1.001742,-3.381265
45,002_S_0729,002_S_0729_2006-07-17,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,2,...,1.135018,0.565758,-1.231374,2.53093,-0.140982,1.115291,0.911285,0.161683,4.860322,1.851795
64,002_S_0816,002_S_0816_2006-08-30,2006-08-30,70.767123,0,1444128.125,2.0,E4/E4,Dementia,1,...,1.709237,-0.197034,-2.35414,0.458232,-0.784625,-2.768122,-0.952438,1.013662,2.402303,1.102402
69,002_S_0938,002_S_0938_2006-10-05,2006-10-05,82.167123,1,1309685.0,0.0,E3/E3,Dementia,1,...,0.484417,0.221897,-0.454551,2.132559,0.266509,-0.421897,-1.680727,0.075354,2.091559,2.240144
74,002_S_0954,002_S_0954_2006-10-10,2006-10-10,69.19863,1,1075661.5,1.0,E3/E4,MCI,2,...,4.188686,-0.134003,-0.009666,1.683453,-0.314767,-1.111206,-1.99926,0.37567,4.115875,1.451842
81,002_S_0955,002_S_0955_2006-10-11,2006-10-11,78.161644,1,1363607.0,1.0,E3/E4,Dementia,1,...,0.825982,0.231535,-2.114397,3.01193,-0.534751,0.321073,1.041867,3.164325,4.477453,3.07091
84,002_S_1018,002_S_1018_2006-11-29,2006-11-29,70.658904,1,1355603.0,0.0,E3/E3,Dementia,1,...,2.190969,-0.2504,-1.055071,1.670613,-2.721373,0.470994,-1.087574,-0.159993,3.650605,0.85122


### Perform Classification using Decision Trees:

In [35]:
def perform_GridSearchCV(model, params, X, Y):
    clf = GridSearchCV(model, params, n_jobs=-1, cv=10)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, shuffle=False)
    clf.fit(X_train,Y_train)
    best_param = clf.best_params_
    pred =  clf.predict(X_test)
    acc = accuracy_score(Y_test, pred)
    f1 = f1_score(Y_test, pred, average='weighted')
    bal_acc = balanced_accuracy_score(Y_test, pred)
    cm = confusion_matrix(Y_test, pred, normalize='true')
    return acc, f1, bal_acc, cm, best_param

In [36]:
lr_data.head()

Unnamed: 0,PTID,MRID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,MUSE_Volume_4,...,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358,Diagnosis_nearest_2.0_cat
0,002_S_0295,002_S_0295_2006-04-18,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,-401.428503,...,1,1,1,0,0,0,0,0,1,0
9,002_S_0413,002_S_0413_2006-05-02,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,596.355045,...,0,1,1,0,1,0,0,0,0,0
24,002_S_0559,002_S_0559_2006-05-23,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,224.87456,...,0,0,0,0,1,0,0,0,0,0
31,002_S_0619,002_S_0619_2006-06-01,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,2633.277779,...,0,0,0,1,1,0,0,0,2,1
45,002_S_0729,002_S_0729_2006-07-17,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,256.289641,...,0,0,0,1,1,0,0,0,1,2


In [37]:
dcca_data.head()

Unnamed: 0,PTID,MRID,Date,Age,Sex,DLICV_baseline,APOE4_Alleles,APOE_Genotype,Diagnosis_nearest_2.0,Diagnosis_nearest_2.0_cat,...,imaging_component_291,imaging_component_292,imaging_component_293,imaging_component_294,imaging_component_295,imaging_component_296,imaging_component_297,imaging_component_298,imaging_component_299,imaging_component_300
0,002_S_0295,002_S_0295_2006-04-18,2006-04-18,84.742466,0,1485405.375,1.0,E3/E4,CN,0,...,-0.059044,1.083234,-0.430268,1.09149,0.159566,-2.281677,-1.381328,-0.790197,2.888934,2.863482
9,002_S_0413,002_S_0413_2006-05-02,2006-05-02,76.283562,1,1364116.0,0.0,E3/E3,CN,0,...,2.031326,-1.779694,-0.857111,0.321846,-0.289536,-1.963561,-1.312737,0.904873,1.56282,4.308747
24,002_S_0559,002_S_0559_2006-05-23,2006-05-23,79.223288,0,1570479.625,1.0,E3/E4,CN,0,...,2.248068,-0.112495,-0.292899,0.915463,0.627108,-1.011987,-0.791272,-1.478453,2.00791,3.44824
31,002_S_0619,002_S_0619_2006-06-01,2006-06-01,77.447945,0,1859348.25,2.0,E4/E4,Dementia,1,...,-1.189468,-1.365852,1.741578,0.149422,1.127712,0.439175,2.208321,1.431495,-1.001742,-3.381265
45,002_S_0729,002_S_0729_2006-07-17,2006-07-17,65.056164,1,1166961.75,1.0,E3/E4,MCI,2,...,1.135018,0.565758,-1.231374,2.53093,-0.140982,1.115291,0.911285,0.161683,4.860322,1.851795


In [38]:
params = {'criterion':['gini', 'entropy'],
          'class_weight':[None,'balanced'],
          'ccp_alpha':[0,0.001,0.01,0.1,1,10]}

print("######################################################################## Both:")
X , Y = lr_data[MRI_columns + genetic_columns], lr_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_u, f1_u, bal_acc_u, cm_u, b_p_u = perform_GridSearchCV(model, params, X, Y)

X , Y = dcca_data[imaging_labels + genetic_labels], dcca_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_t, f1_t, bal_acc_t, cm_t, b_p_t = perform_GridSearchCV(model, params, X, Y)

print("no-DCCA trained Accuracy: ", round(np.mean(acc_u)*100,3))
print("DCCA Trained Accuracy:   ", round(np.mean(acc_t)*100,3))
print("no-DCCA trained F1 Score: ", round(np.mean(f1_u)*100,3))
print("DCCA Trained F1 Score:   ", round(np.mean(f1_t)*100,3))
print("no-DCCA trained Balanced Accuracy Score: ", round(np.mean(bal_acc_u)*100,3))
print("DCCA Trained Balanced Accuracy Score:   ", round(np.mean(bal_acc_t)*100,3))
print("Best Parameters for untrained data:",b_p_u)
print("Best Parameters for trained data:",b_p_t)
print("######################################################################## Imaging only:")
X , Y = lr_data[MRI_columns], lr_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_u, f1_u, bal_acc_u, cm_u, b_p_u = perform_GridSearchCV(model, params, X, Y)

X , Y = dcca_data[imaging_labels], dcca_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_t, f1_t, bal_acc_t, cm_t, b_p_t = perform_GridSearchCV(model, params, X, Y)

print("no-DCCA trained Accuracy: ", round(np.mean(acc_u)*100,3))
print("DCCA Trained Accuracy:   ", round(np.mean(acc_t)*100,3))
print("no-DCCA trained F1 Score: ", round(np.mean(f1_u)*100,3))
print("DCCA Trained F1 Score:   ", round(np.mean(f1_t)*100,3))
print("no-DCCA trained Balanced Accuracy Score: ", round(np.mean(bal_acc_u)*100,3))
print("DCCA Trained Balanced Accuracy Score:   ", round(np.mean(bal_acc_t)*100,3))
print("Best Parameters for untrained data:",b_p_u)
print("Best Parameters for trained data:",b_p_t)
print("######################################################################### Genetic only:")
X , Y = lr_data[genetic_columns], lr_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_u, f1_u, bal_acc_u, cm_u, b_p_u = perform_GridSearchCV(model, params, X, Y)

X , Y = dcca_data[genetic_labels], dcca_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_t, f1_t, bal_acc_t, cm_t, b_p_t = perform_GridSearchCV(model, params, X, Y)

print("no-DCCA trained Accuracy: ", round(np.mean(acc_u)*100,3))
print("DCCA Trained Accuracy:   ", round(np.mean(acc_t)*100,3))
print("no-DCCA trained F1 Score: ", round(np.mean(f1_u)*100,3))
print("DCCA Trained F1 Score:   ", round(np.mean(f1_t)*100,3))
print("no-DCCA trained Balanced Accuracy Score: ", round(np.mean(bal_acc_u)*100,3))
print("DCCA Trained Balanced Accuracy Score:   ", round(np.mean(bal_acc_t)*100,3))
print("Best Parameters for untrained data:",b_p_u)
print("Best Parameters for trained data:",b_p_t)

######################################################################## Both:
no-DCCA trained Accuracy:  51.341
DCCA Trained Accuracy:    51.341
no-DCCA trained F1 Score:  49.596
DCCA Trained F1 Score:    34.834
no-DCCA trained Balanced Accuracy Score:  47.382
DCCA Trained Balanced Accuracy Score:    33.333
Best Parameters for untrained data: {'ccp_alpha': 0.01, 'class_weight': None, 'criterion': 'gini'}
Best Parameters for trained data: {'ccp_alpha': 0.1, 'class_weight': None, 'criterion': 'gini'}
######################################################################## Imaging only:
no-DCCA trained Accuracy:  51.341
DCCA Trained Accuracy:    51.341
no-DCCA trained F1 Score:  49.596
DCCA Trained F1 Score:    34.834
no-DCCA trained Balanced Accuracy Score:  47.382
DCCA Trained Balanced Accuracy Score:    33.333
Best Parameters for untrained data: {'ccp_alpha': 0.01, 'class_weight': None, 'criterion': 'gini'}
Best Parameters for trained data: {'ccp_alpha': 0.1, 'class_weight': None, 'cr

## Try with balancing & scaling:

In [43]:
# Read data:
lr_data = pd.read_pickle("./DATA/Linearly_Transformed_Unique_Dataset.pkl")
c = list(lr_data.columns)
MRI_columns = c[c.index("MUSE_Volume_4"):c.index("MUSE_Volume_207")+1]
genetic_columns = c[c.index("rs4575098"):c.index("rs429358")+1]
columns_to_drop = MRI_columns + genetic_columns
dcca_data = lr_data.drop(labels = columns_to_drop, axis=1)
with open("./DATA/ADNI_initial_DCCA_features_300_4.pkl", 'rb') as f:
    dcca_transformed_data_file = pickle.load(f)
transformed_imaging_data = dcca_transformed_data_file[0]
transformed_genetic_data = dcca_transformed_data_file[1]
imaging_labels = ["imaging_component_"+str(x+1) for x in range(transformed_imaging_data.shape[1])] 
genetic_labels = ["genetic_component_"+str(x+1) for x in range(transformed_genetic_data.shape[1])] 
dcca_data[genetic_labels] = transformed_genetic_data
dcca_data[imaging_labels] = transformed_imaging_data

# Scale the data:
MRI_columns = c[c.index("MUSE_Volume_4"):c.index("MUSE_Volume_207")+1]
genetic_columns = c[c.index("rs4575098"):c.index("rs429358")+1]
scaler = preprocessing.StandardScaler()
lr_data_scaled = scaler.fit_transform(lr_data[MRI_columns])
lr_data[MRI_columns] = lr_data_scaled
scaler = preprocessing.StandardScaler()
dcca_data_scaled = scaler.fit_transform(dcca_data[imaging_labels])
dcca_data[imaging_columns] = dcca_data_scaled

# Balance the data through Undersampling:
def undersample(df, ylabel='Diagnosis_nearest_2.0'):
    classes = df[ylabel].value_counts().to_dict() # specific to this dataset
    least_class_amount = min(classes.values())
    classes_list = []
    for key in classes:
        classes_list.append(df[df[ylabel] == key]) 
    classes_sample = []
    for i in range(0,len(classes_list)-1):
        classes_sample.append(classes_list[i].sample(least_class_amount))
    df_maybe = pd.concat(classes_sample)
    final_df = pd.concat([df_maybe,classes_list[-1]], axis=0)
    final_df = final_df.reset_index(drop=True)
    return final_df

lr_data = undersample(lr_data)
dcca_data = undersample(dcca_data)
print(lr_data.shape)
print(dcca_data.shape)

(729, 209)
(729, 610)


In [44]:
params = {'criterion':['gini', 'entropy'],
          'class_weight':[None,'balanced'],
          'ccp_alpha':[0,0.001,0.01,0.1,1,10]}

print("######################################################################## Both:")
X , Y = lr_data[MRI_columns + genetic_columns], lr_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_u, f1_u, bal_acc_u, cm_u, b_p_u = perform_GridSearchCV(model, params, X, Y)

X , Y = dcca_data[imaging_labels + genetic_labels], dcca_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_t, f1_t, bal_acc_t, cm_t, b_p_t = perform_GridSearchCV(model, params, X, Y)

print("no-DCCA trained Accuracy: ", round(np.mean(acc_u)*100,3))
print("DCCA Trained Accuracy:   ", round(np.mean(acc_t)*100,3))
print("no-DCCA trained F1 Score: ", round(np.mean(f1_u)*100,3))
print("DCCA Trained F1 Score:   ", round(np.mean(f1_t)*100,3))
print("no-DCCA trained Balanced Accuracy Score: ", round(np.mean(bal_acc_u)*100,3))
print("DCCA Trained Balanced Accuracy Score:   ", round(np.mean(bal_acc_t)*100,3))
print("Best Parameters for untrained data:",b_p_u)
print("Best Parameters for trained data:",b_p_t)
print("######################################################################## Imaging only:")
X , Y = lr_data[MRI_columns], lr_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_u, f1_u, bal_acc_u, cm_u, b_p_u = perform_GridSearchCV(model, params, X, Y)

X , Y = dcca_data[imaging_labels], dcca_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_t, f1_t, bal_acc_t, cm_t, b_p_t = perform_GridSearchCV(model, params, X, Y)

print("no-DCCA trained Accuracy: ", round(np.mean(acc_u)*100,3))
print("DCCA Trained Accuracy:   ", round(np.mean(acc_t)*100,3))
print("no-DCCA trained F1 Score: ", round(np.mean(f1_u)*100,3))
print("DCCA Trained F1 Score:   ", round(np.mean(f1_t)*100,3))
print("no-DCCA trained Balanced Accuracy Score: ", round(np.mean(bal_acc_u)*100,3))
print("DCCA Trained Balanced Accuracy Score:   ", round(np.mean(bal_acc_t)*100,3))
print("Best Parameters for untrained data:",b_p_u)
print("Best Parameters for trained data:",b_p_t)
print("######################################################################### Genetic only:")
X , Y = lr_data[genetic_columns], lr_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_u, f1_u, bal_acc_u, cm_u, b_p_u = perform_GridSearchCV(model, params, X, Y)

X , Y = dcca_data[genetic_labels], dcca_data["Diagnosis_nearest_2.0_cat"]
model = DecisionTreeClassifier()
acc_t, f1_t, bal_acc_t, cm_t, b_p_t = perform_GridSearchCV(model, params, X, Y)

print("no-DCCA trained Accuracy: ", round(np.mean(acc_u)*100,3))
print("DCCA Trained Accuracy:   ", round(np.mean(acc_t)*100,3))
print("no-DCCA trained F1 Score: ", round(np.mean(f1_u)*100,3))
print("DCCA Trained F1 Score:   ", round(np.mean(f1_t)*100,3))
print("no-DCCA trained Balanced Accuracy Score: ", round(np.mean(bal_acc_u)*100,3))
print("DCCA Trained Balanced Accuracy Score:   ", round(np.mean(bal_acc_t)*100,3))
print("Best Parameters for untrained data:",b_p_u)
print("Best Parameters for trained data:",b_p_t)

######################################################################## Both:
no-DCCA trained Accuracy:  48.63
DCCA Trained Accuracy:    23.288
no-DCCA trained F1 Score:  65.438
DCCA Trained F1 Score:    37.778
no-DCCA trained Balanced Accuracy Score:  48.63
DCCA Trained Balanced Accuracy Score:    23.288
Best Parameters for untrained data: {'ccp_alpha': 0.01, 'class_weight': 'balanced', 'criterion': 'gini'}
Best Parameters for trained data: {'ccp_alpha': 0, 'class_weight': 'balanced', 'criterion': 'entropy'}
######################################################################## Imaging only:
no-DCCA trained Accuracy:  51.37
DCCA Trained Accuracy:    13.699
no-DCCA trained F1 Score:  67.873
DCCA Trained F1 Score:    24.096
no-DCCA trained Balanced Accuracy Score:  51.37
DCCA Trained Balanced Accuracy Score:    13.699
Best Parameters for untrained data: {'ccp_alpha': 0.01, 'class_weight': 'balanced', 'criterion': 'gini'}
Best Parameters for trained data: {'ccp_alpha': 0.01, 'class_we