# Predict Composite Cognitive Score 
Predict the composite cognitive score of a set of ADNI patients using Random Forrest and SVM methods. 

We are using the four ADSP-PHC composite scores for *Memory, Executive Function, Language and Visuospatial Ability*. The methods for deriving these are described in 'ADSP Phenotype Harmonization Consortium – Derivation of Cognitive Composite Scores' by Mukherjee et al (https://ida.loni.usc.edu/download/files/study/083f5b49-98d1-494a-aaf1-3310a9a8e62c/file/adni/ADNI_Cognition_Methods_Psychometric_Analyses_Oct2022.pdf).

In [1]:
import pandas as pd
import sys
import os
import numpy as np
from scipy.io import loadmat

# scikit-learn modules
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.metrics import mean_squared_error # for calculating the cost function
from sklearn.ensemble import RandomForestRegressor # for building the model

import matplotlib.pyplot as plt 

## Processing Data

Match up the composite cognitive scores and functional connectivity data, then split into test + training sets

In [2]:
ADSP_DATA_PATH = "data/ADSP_PHC_COGN_Dec2023_FILTERED.csv"
FC_DATA_PATH = "../FMRI_ADNI_DATA/fc/"

In [57]:
# Process the ADSP Data

adsp_df = pd.read_csv(ADSP_DATA_PATH)
adsp_df = adsp_df.drop(columns=adsp_df.columns[0])
adsp_df.head()

Unnamed: 0,RID,SUBJID,PHASE,VISCODE,VISCODE2,EXAMDATE,PHC_Visit,PHC_Sex,PHC_Education,PHC_Ethnicity,...,PHC_MEM_PreciseFilter,PHC_EXF,PHC_EXF_SE,PHC_EXF_PreciseFilter,PHC_LAN,PHC_LAN_SE,PHC_LAN_PreciseFilter,PHC_VSP,PHC_VSP_SE,PHC_VSP_PreciseFilter
0,21,ADNI_011_S_0021,ADNI1,bl,bl,2005-10-24,1,2.0,18.0,2.0,...,1,0.295,0.335,1.0,0.816,0.304,1.0,0.264,0.547,1.0
1,21,ADNI_011_S_0021,ADNI1,m06,m06,2006-04-24,2,2.0,18.0,2.0,...,1,0.374,0.346,1.0,1.372,0.384,1.0,-0.333,0.464,1.0
2,21,ADNI_011_S_0021,ADNI1,m12,m12,2006-11-01,3,2.0,18.0,2.0,...,1,0.451,0.388,1.0,1.813,0.368,1.0,0.264,0.547,1.0
3,21,ADNI_011_S_0021,ADNI1,m24,m24,2007-10-31,4,2.0,18.0,2.0,...,1,0.534,0.351,1.0,1.17,0.316,1.0,0.264,0.547,1.0
4,21,ADNI_011_S_0021,ADNI1,m36,m36,2008-10-22,5,2.0,18.0,2.0,...,1,0.669,0.424,1.0,1.274,0.342,1.0,0.963,0.658,0.0


In [58]:
adsp_df = adsp_df.drop(columns=[
    'SUBJID', 'PHASE', 'VISCODE', 'EXAMDATE', 'PHC_Visit', 'PHC_Sex', 'PHC_Education', 'PHC_Ethnicity', 'PHC_Race', 'PHC_Age_Cognition', 
    'PHC_MEM_SE', 'PHC_MEM_PreciseFilter', 'PHC_EXF_SE', 'PHC_EXF_PreciseFilter', 'PHC_LAN_SE', 'PHC_LAN_PreciseFilter', 'PHC_VSP_SE',
    'PHC_VSP_PreciseFilter'
])
adsp_df.head()

Unnamed: 0,RID,VISCODE2,PHC_Diagnosis,PHC_MEM,PHC_EXF,PHC_LAN,PHC_VSP
0,21,bl,1.0,1.481,0.295,0.816,0.264
1,21,m06,1.0,1.464,0.374,1.372,-0.333
2,21,m12,1.0,1.647,0.451,1.813,0.264
3,21,m24,1.0,1.309,0.534,1.17,0.264
4,21,m36,1.0,1.945,0.669,1.274,0.963


In [59]:
def replace_viscode(str):
    if str == 'BL' or str == 'SC':
        return adsp_df['VISCODE2'].replace(str, 'M000')
    else:
        vis = str[1:]
        vis = vis.zfill(3)
        vis = 'M' + vis
        return adsp_df['VISCODE2'].replace(str, vis)

adsp_df['VISCODE2'] = adsp_df['VISCODE2'].str.upper()

# Pad the visit codes
for val in adsp_df['VISCODE2'].unique():
    adsp_df['VISCODE2'] = replace_viscode(val)

# Pad the RID values
adsp_df['RID'] = adsp_df['RID'].apply(lambda x: str(x).zfill(4))

adsp_df.head()

Unnamed: 0,RID,VISCODE2,PHC_Diagnosis,PHC_MEM,PHC_EXF,PHC_LAN,PHC_VSP
0,21,M000,1.0,1.481,0.295,0.816,0.264
1,21,M006,1.0,1.464,0.374,1.372,-0.333
2,21,M012,1.0,1.647,0.451,1.813,0.264
3,21,M024,1.0,1.309,0.534,1.17,0.264
4,21,M036,1.0,1.945,0.669,1.274,0.963


Get the FC data and add

In [60]:
import re

def get_rid_viscode(filename):
    pattern = r'sub-ADNI\d+S(\d{4})_ses-(M\d{3})'
    match = re.search(pattern, filename)

    if match:
        rid = match.group(1)
        viscode = match.group(2)
        return rid, viscode        
    else:
        print("Pattern not found in the filename.")
        return None


In [61]:
adsp_df['FC_DATA'] = None

fc_dir = os.listdir(FC_DATA_PATH)

fc_files = [os.path.join(FC_DATA_PATH, file) for file in fc_dir if file.endswith('.mat')]
len(fc_files)

1478

In [62]:
adsp_df.shape

(4074, 8)

In [63]:
for fc in fc_files:
    rid, viscode = get_rid_viscode(fc)
    adsp_df.loc[(adsp_df['RID'] == rid) & (adsp_df['VISCODE2'] == viscode), 'FC_DATA'] = fc

In [73]:
adsp_df_filtered = adsp_df[adsp_df['FC_DATA'].notna()]
adsp_df_filtered.shape

(1353, 8)

In [74]:
adsp_df_filtered = adsp_df_filtered.drop(adsp_df_filtered[adsp_df_filtered['VISCODE2'] == 'M162'].index)
adsp_df_filtered = adsp_df_filtered.drop(adsp_df_filtered[adsp_df_filtered['VISCODE2'] == 'M174'].index)
adsp_df_filtered = adsp_df_filtered.drop(adsp_df_filtered[adsp_df_filtered['VISCODE2'] == 'M180'].index)
adsp_df_filtered = adsp_df_filtered.drop(adsp_df_filtered[adsp_df_filtered['VISCODE2'] == 'M186'].index)
adsp_df_filtered = adsp_df_filtered.drop(adsp_df_filtered[adsp_df_filtered['VISCODE2'] == 'M192'].index)
adsp_df_filtered.shape

(1343, 8)

In [44]:
# Save the adsp_df_filtered dataframe as a file
# adsp_df_filtered.to_csv('data/ADSP_PHC_COGN_Dec2023_FILTERED_wfiles.csv')

In [76]:
# Get the FC data as numpy arrays
dim_x = len(adsp_df_filtered['FC_DATA'])
features = np.zeros(shape=(dim_x, 100, 200)) # get the first 100 regions

for i, file in enumerate(adsp_df_filtered['FC_DATA'].values):
    arr = loadmat(file)['ROI_activity'][:100, :] # get the first 100 regions
    if arr.shape[1] != 200:
        # add padding to get a constant shape
        diff = 200 - arr.shape[1]
        if diff < 0:
            arr = arr[:, :200]
        else:
            pad_width = ((0, 0), (0, diff))  
            padded_array = np.pad(arr, pad_width, mode='constant', constant_values=0)
    features[i] = padded_array
features.shape

(1343, 100, 200)

In [77]:
y = adsp_df_filtered[['PHC_MEM', 'PHC_EXF', 'PHC_LAN', 'PHC_VSP']]
y.head()

Unnamed: 0,PHC_MEM,PHC_EXF,PHC_LAN,PHC_VSP
10,1.377,-0.092,0.666,0.963
91,0.902,0.579,0.757,0.264
119,0.645,0.525,0.448,-0.041
132,1.134,0.149,1.011,0.264
133,1.138,0.501,0.71,0.963


In [78]:
# split into test + training (80% train, 20% test)
features_2d = features.reshape(features.shape[0], -1)
x_train, x_test, y_train, y_test = train_test_split(features_2d, y, test_size = 0.2, random_state = 28)

## Random Forrest Method
Prediction not differentiable wrt to input - need a model per composite (memory, executive function, language and visuospatial)

In [80]:
# split targets into the different composites

# y_train_mem, y_train_exf, y_train_lan, y_train_vsp = y_train['PHC_MEM'], y_train['PHC_EXF'], y_train['PHC_LAN'], y_train['PHC_VSP']
# y_test_mem, y_test_exf, y_test_lan, y_test_vsp = y_test['PHC_MEM'], y_test['PHC_EXF'], y_test['PHC_LAN'], y_test['PHC_VSP']

#### Memory Model

In [81]:
# MEMORY MODEL

# Remove NaNs in target
y_train_mem = y_train['PHC_MEM'].reset_index(drop=True)
y_test_mem =y_test['PHC_MEM'].reset_index(drop=True)

nan_indices = y_train_mem.index[y_train_mem.isna()]
y_train_mem = y_train_mem.drop(nan_indices)
x_train_mem = np.delete(x_train, nan_indices, axis = 0)
# print(nan_indices)

nan_indices_test = y_test_mem.index[y_test_mem.isna()]
y_test_mem = y_test_mem.drop(nan_indices_test)
x_test_mem = np.delete(x_test, nan_indices_test, axis = 0)

In [83]:
# Initializing the Random Forest Regression model with 10 decision trees
base_model_mem = RandomForestRegressor(n_estimators = 10, random_state = 5)

# Fitting the Random Forest Regression model to the data
base_model_mem.fit(x_train_mem, y_train_mem)

In [85]:
# x_test_mem.shape
# Predicting the target values of the test set
base_y_pred_mem = base_model_mem.predict(x_test_mem)

In [86]:
from sklearn.metrics import r2_score

base_r2_mem = r2_score(y_test_mem, base_y_pred_mem)
print("Baseline R2 (MEM): ", base_r2_mem)

Baseline R2 (MEM):  -0.03279929952682026


In [88]:
print(base_model_mem.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': 5, 'verbose': 0, 'warm_start': False}


#### Random Search for best Hyperparameters for Memory Predictor

In [91]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, r2_score

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 100)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 119, 138, 157, 176, 195, 215, 234, 253, 272, 291, 311, 330, 349, 368, 387, 407, 426, 445, 464, 483, 503, 522, 541, 560, 579, 598, 618, 637, 656, 675, 694, 714, 733, 752, 771, 790, 810, 829, 848, 867, 886, 906, 925, 944, 963, 982, 1002, 1021, 1040, 1059, 1078, 1097, 1117, 1136, 1155, 1174, 1193, 1213, 1232, 1251, 1270, 1289, 1309, 1328, 1347, 1366, 1385, 1405, 1424, 1443, 1462, 1481, 1501, 1520, 1539, 1558, 1577, 1596, 1616, 1635, 1654, 1673, 1692, 1712, 1731, 1750, 1769, 1788, 1808, 1827, 1846, 1865, 1884, 1904, 1923, 1942, 1961, 1980, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# Evaluation metric
r2_scorer = make_scorer(r2_score)

# Random search of parameters, using 5 fold cross validation, 

# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator = rf, param_distributions = random_grid, scoring=r2_scorer,
    n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(x_train_mem, y_train_mem)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Get the best parameter set
best_params = rf_random.best_params_

print("Best parameters:", best_params)