# Predict Composite Cognitive Score 
Predict the composite cognitive scores of a set of ADNI patients using Random Forest and SVM methods. 

We are using the four ADSP-PHC composite scores for *Memory, Executive Function, Language and Visuospatial Ability*. The methods for deriving these are described in 'ADSP Phenotype Harmonization Consortium – Derivation of Cognitive Composite Scores' by Mukherjee et al (https://ida.loni.usc.edu/download/files/study/083f5b49-98d1-494a-aaf1-3310a9a8e62c/file/adni/ADNI_Cognition_Methods_Psychometric_Analyses_Oct2022.pdf).

In [None]:
import pandas as pd
import sys
import os
import numpy as np
from scipy.io import loadmat

# scikit-learn modules
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.ensemble import RandomForestRegressor # for building the model

import matplotlib.pyplot as plt 

## Processing Data

Match up the composite cognitive scores and functional connectivity data, then split into test + training sets

In [None]:
# CSV file holding the ADSP-PHC composite cognitive scores (read with pandas below).
ADSP_DATA_PATH = "data/ADSP_PHC_COGN_Dec2023_FILTERED.csv"
# Directory that is scanned for the functional connectivity (FC) `.mat` files.
FC_DATA_PATH = "../FMRI_ADNI_DATA/fc/"

In [None]:
# Process the ADSP Data
# Read the composite score table, then discard its first column
# (presumably an index column written out with the CSV — TODO confirm).
adsp_df = pd.read_csv(ADSP_DATA_PATH)
adsp_df = adsp_df.drop(labels=[adsp_df.columns[0]], axis="columns")
adsp_df.head()

In [None]:
# Remove every column that is not used further on in this notebook.
adsp_df = adsp_df.drop(
    labels=[
        # subject / visit bookkeeping
        'SUBJID', 'PHASE', 'VISCODE', 'EXAMDATE', 'PHC_Visit',
        # demographics
        'PHC_Sex', 'PHC_Education', 'PHC_Ethnicity', 'PHC_Race', 'PHC_Age_Cognition',
        # per-composite `*_SE` and `*_PreciseFilter` columns
        'PHC_MEM_SE', 'PHC_MEM_PreciseFilter',
        'PHC_EXF_SE', 'PHC_EXF_PreciseFilter',
        'PHC_LAN_SE', 'PHC_LAN_PreciseFilter',
        'PHC_VSP_SE', 'PHC_VSP_PreciseFilter',
    ],
    axis='columns',
)
adsp_df.head()

In [None]:
def replace_viscode(str):
    """Return the ``VISCODE2`` column with one visit code rewritten as ``Mxxx``.

    ``'BL'`` and ``'SC'`` are mapped to ``'M000'``. Any other code has its
    first character dropped and the remainder left-padded with zeros to three
    characters, prefixed with ``'M'`` (e.g. ``'M6'`` -> ``'M006'``).

    Args:
        str: The visit code to rewrite. The name shadows the builtin ``str``
            inside this function only; it is kept unchanged so existing
            callers keep working.

    Returns:
        A new Series based on the module-level ``adsp_df['VISCODE2']`` in
        which every entry equal to the given code has been replaced.
    """
    code = str
    if code in ('BL', 'SC'):
        padded = 'M000'
    else:
        padded = 'M' + code[1:].zfill(3)
    return adsp_df['VISCODE2'].replace(code, padded)


adsp_df['VISCODE2'] = adsp_df['VISCODE2'].str.upper()

# Pad the visit codes.
# Missing codes (NaN) are skipped: they are not strings, so slicing them
# inside replace_viscode would raise a TypeError.
for val in adsp_df['VISCODE2'].dropna().unique():
    adsp_df['VISCODE2'] = replace_viscode(val)

# Pad the RID values to four characters so they can be compared with the
# four-digit IDs parsed from the FC file names.
adsp_df['RID'] = adsp_df['RID'].apply(lambda x: str(x).zfill(4))

adsp_df.head()

Get the FC data files and add their paths to the dataframe

In [None]:
import re

def get_rid_viscode(filename):
    """Extract the subject RID and the visit code from an FC file name.

    The name is searched for ``sub-ADNI<digits>S<4-digit RID>_ses-<M + 3 digits>``;
    the match may occur anywhere in *filename*, so a full path works too.

    Returns:
        A ``(rid, viscode)`` tuple of strings when the pattern is found.
        Otherwise a notice is printed and ``None`` is returned.
    """
    found = re.search(r'sub-ADNI\d+S(\d{4})_ses-(M\d{3})', filename)

    # Guard clause: nothing to extract from a name that does not follow the scheme.
    if found is None:
        print("Pattern not found in the filename.")
        return None

    return found.group(1), found.group(2)



In [None]:
# Start with an empty FC_DATA column; entries stay None until a file is matched.
adsp_df['FC_DATA'] = None

# Collect the path of every `.mat` file found directly inside FC_DATA_PATH.
fc_dir = os.listdir(FC_DATA_PATH)
fc_files = []
for entry in fc_dir:
    if entry.endswith('.mat'):
        fc_files.append(os.path.join(FC_DATA_PATH, entry))
len(fc_files)

In [None]:
# (rows, columns) of the score table at this point, for comparison with the filtered table.
adsp_df.shape

In [None]:
# Attach each FC file path to the row whose RID and VISCODE2 match the IDs
# encoded in the file name.
for fc in fc_files:
    parsed = get_rid_viscode(fc)
    if parsed is None:
        # get_rid_viscode returns None (after printing a notice) for names that
        # do not follow the expected scheme; unpacking None would raise a
        # TypeError and abort the whole loop, so such files are skipped.
        continue
    rid, viscode = parsed
    adsp_df.loc[(adsp_df['RID'] == rid) & (adsp_df['VISCODE2'] == viscode), 'FC_DATA'] = fc

In [None]:
# Keep only the rows for which an FC file path was recorded.
adsp_df_filtered = adsp_df.loc[adsp_df['FC_DATA'].notna(), :]
adsp_df_filtered.shape

In [None]:
# Remove all rows belonging to the listed visit codes, one code at a time.
for excluded_code in ('M162', 'M174', 'M180', 'M186', 'M192'):
    rows_to_drop = adsp_df_filtered.index[adsp_df_filtered['VISCODE2'] == excluded_code]
    adsp_df_filtered = adsp_df_filtered.drop(rows_to_drop)
adsp_df_filtered.shape

In [None]:
# Save the adsp_df_filtered dataframe as a file
# adsp_df_filtered.to_csv('data/ADSP_PHC_COGN_Dec2023_FILTERED_wfiles.csv')

In [None]:
# Get the FC data as numpy arrays
# Each file's 'ROI_activity' matrix is cut to its first 100 rows (regions) and
# then truncated or zero-padded along axis 1 to exactly 200 columns
# (presumably time points — TODO confirm), so every sample has the same shape.
dim_x = len(adsp_df_filtered['FC_DATA'])
features = np.zeros(shape=(dim_x, 100, 200))

for i, file in enumerate(adsp_df_filtered['FC_DATA'].values):
    arr = loadmat(file)['ROI_activity'][:100, :]  # get the first 100 regions
    diff = 200 - arr.shape[1]
    if diff < 0:
        # too many columns: keep the first 200
        arr = arr[:, :200]
    elif diff > 0:
        # too few columns: append zeros on the right to get a constant shape
        arr = np.pad(arr, ((0, 0), (0, diff)), mode='constant', constant_values=0)
    # Always store the array belonging to *this* file. Storing a variable that
    # is only set in the padding branch would reuse the previous subject's
    # data (or raise NameError) whenever a file needs truncation or no change.
    features[i] = arr
features.shape

In [None]:
# Regression targets: the memory, executive function, language and
# visuospatial composite score columns.
y = adsp_df_filtered.loc[:, ['PHC_MEM', 'PHC_EXF', 'PHC_LAN', 'PHC_VSP']]
y.head()

In [None]:
# Flatten every sample's 2-D array into a single feature vector, then hold
# out 20% of the samples as the test set (80% train) with a fixed seed.
features_2d = features.reshape((len(features), -1))
x_train, x_test, y_train, y_test = train_test_split(
    features_2d,
    y,
    test_size=0.2,
    random_state=28,
)

## Random Forest Method
The prediction is not differentiable with respect to the input, so we need one model per composite (memory, executive function, language and visuospatial).

In [None]:
# split targets into the different composites

# y_train_mem, y_train_exf, y_train_lan, y_train_vsp = y_train['PHC_MEM'], y_train['PHC_EXF'], y_train['PHC_LAN'], y_train['PHC_VSP']
# y_test_mem, y_test_exf, y_test_lan, y_test_vsp = y_test['PHC_MEM'], y_test['PHC_EXF'], y_test['PHC_LAN'], y_test['PHC_VSP']

#### Memory Model

In [None]:
# MEMORY MODEL
# Build train/test sets for the memory composite without NaN targets.
# The targets get a fresh 0..n-1 index so that the index labels of the NaN
# entries can be used directly as row positions in x_train / x_test.

# --- training set ---
y_train_mem = y_train['PHC_MEM'].reset_index(drop=True)
nan_indices = y_train_mem[y_train_mem.isna()].index
x_train_mem = np.delete(x_train, nan_indices, axis=0)
y_train_mem = y_train_mem.drop(index=nan_indices)

# --- test set ---
y_test_mem = y_test['PHC_MEM'].reset_index(drop=True)
nan_indices_test = y_test_mem[y_test_mem.isna()].index
x_test_mem = np.delete(x_test, nan_indices_test, axis=0)
y_test_mem = y_test_mem.drop(index=nan_indices_test)

In [None]:
# Baseline: a Random Forest regressor with 10 decision trees and a fixed seed.
base_model_mem = RandomForestRegressor(random_state=5, n_estimators=10)

# Train the baseline on the NaN-free memory training data.
base_model_mem.fit(X=x_train_mem, y=y_train_mem)

In [None]:
# x_test_mem.shape
# Predicting the target values of the test set
# (uses the test features whose NaN-target rows were removed for the memory composite)
base_y_pred_mem = base_model_mem.predict(x_test_mem)

In [None]:
from sklearn.metrics import r2_score

# R2 score of the baseline memory model's predictions against the test targets.
base_r2_mem = r2_score(y_true=y_test_mem, y_pred=base_y_pred_mem)
print("Baseline R2 (MEM): ", base_r2_mem)

In [None]:
# Print the parameter settings of the baseline model as returned by get_params().
print(base_model_mem.get_params())

#### Random Search for best Hyperparameters

In [35]:
# Composite to work on; selects the `PHC_<type>` target column.
PREDICTOR_TYPE = 'EXF'

# The targets get a fresh 0..n-1 index so that the index labels of the NaN
# entries can be used directly as row positions in x_train / x_test.

# --- training set: drop samples whose target is NaN ---
y_train_cleaned = y_train[f'PHC_{PREDICTOR_TYPE}'].reset_index(drop=True)
nan_indices = y_train_cleaned[y_train_cleaned.isna()].index
x_train_cleaned = np.delete(x_train, nan_indices, axis=0)
y_train_cleaned = y_train_cleaned.drop(index=nan_indices)

# --- test set: same treatment ---
y_test_cleaned = y_test[f'PHC_{PREDICTOR_TYPE}'].reset_index(drop=True)
nan_indices_test = y_test_cleaned[y_test_cleaned.isna()].index
x_test_cleaned = np.delete(x_test, nan_indices_test, axis=0)
y_test_cleaned = y_test_cleaned.drop(index=nan_indices_test)

In [36]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, r2_score

# Candidate values for each Random Forest hyperparameter explored by the
# randomized search.

# Number of trees: 100 evenly spaced values from 100 to 2000, truncated to ints
n_estimators = np.linspace(start=100, stop=2000, num=100).astype(int).tolist()

# Number of features to consider at every split
max_features = ['log2', 'sqrt']

# Maximum number of levels in a tree: 10, 20, ..., 110, plus None
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Whether bootstrap samples are used when building each tree
bootstrap = [True, False]

# Create the random grid
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

{'n_estimators': [100, 119, 138, 157, 176, 195, 215, 234, 253, 272, 291, 311, 330, 349, 368, 387, 407, 426, 445, 464, 483, 503, 522, 541, 560, 579, 598, 618, 637, 656, 675, 694, 714, 733, 752, 771, 790, 810, 829, 848, 867, 886, 906, 925, 944, 963, 982, 1002, 1021, 1040, 1059, 1078, 1097, 1117, 1136, 1155, 1174, 1193, 1213, 1232, 1251, 1270, 1289, 1309, 1328, 1347, 1366, 1385, 1405, 1424, 1443, 1462, 1481, 1501, 1520, 1539, 1558, 1577, 1596, 1616, 1635, 1654, 1673, 1692, 1712, 1731, 1750, 1769, 1788, 1808, 1827, 1846, 1865, 1884, 1904, 1923, 1942, 1961, 1980, 2000], 'max_features': ['log2', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
import json
from joblib import dump

# Run one randomized hyperparameter search per composite score and save the
# best parameters (JSON) and the best fitted model (joblib) for each.

# Evaluation metric; identical for every composite, so it is built once.
r2_scorer = make_scorer(r2_score)

for PREDICTOR_TYPE in ['MEM', 'EXF', 'LAN', 'VSP']:

    # clean the data
    # Targets get a fresh 0..n-1 index so the labels of the NaN entries can be
    # used as row positions when deleting from the feature arrays.
    y_train_cleaned = y_train[f'PHC_{PREDICTOR_TYPE}'].reset_index(drop=True)
    y_test_cleaned = y_test[f'PHC_{PREDICTOR_TYPE}'].reset_index(drop=True)

    nan_indices = y_train_cleaned.index[y_train_cleaned.isna()]
    y_train_cleaned = y_train_cleaned.drop(nan_indices)
    x_train_cleaned = np.delete(x_train, nan_indices, axis = 0)

    nan_indices_test = y_test_cleaned.index[y_test_cleaned.isna()]
    y_test_cleaned = y_test_cleaned.drop(nan_indices_test)
    x_test_cleaned = np.delete(x_test, nan_indices_test, axis = 0)
    # ==================================================================

    # Use the random grid to search for best hyperparameters

    # First create the base model to tune
    rf = RandomForestRegressor()

    # Random search of parameters, using 5 fold cross validation,
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(
        estimator = rf, param_distributions = random_grid, scoring=r2_scorer,
        n_iter = 100, cv = 5, verbose=4, random_state=42, n_jobs = -1)

    # Fit the random search model on the data cleaned for THIS composite.
    # Fitting on the memory-model arrays here would make all four saved
    # models memory models, whatever PREDICTOR_TYPE says.
    rf_random.fit(x_train_cleaned, y_train_cleaned)

    # ======================================================================

    # Get the best parameter set
    best_params = rf_random.best_params_
    PARAM_FILE = f'{PREDICTOR_TYPE}_best_params.json'

    # Write data to a JSON file
    with open(PARAM_FILE, 'w') as json_file:
        json.dump(best_params, json_file)

    print("\n The best estimator across ALL searched params:\n", rf_random.best_estimator_)
    print("\n The best score across ALL searched params:\n", rf_random.best_score_)
    print("\n The best parameters across ALL searched params:\n", rf_random.best_params_)

    # save the model
    dump(rf_random.best_estimator_, f'best_model_{PREDICTOR_TYPE}.joblib')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
import json
from joblib import dump

# Persist the outcome of the most recent search held in `rf_random`, using
# the current value of PREDICTOR_TYPE in the output file names.
best_params = rf_random.best_params_
PARAM_FILE = f'{PREDICTOR_TYPE}_best_params.json'

# Serialise the winning hyperparameters to JSON
with open(PARAM_FILE, 'w') as json_file:
    json.dump(best_params, json_file)

# Report the winning estimator, its score and its hyperparameters
print("\n The best estimator across ALL searched params:\n", rf_random.best_estimator_)
print("\n The best score across ALL searched params:\n", rf_random.best_score_)
print("\n The best parameters across ALL searched params:\n", best_params)

# Store the refitted best estimator on disk
dump(rf_random.best_estimator_, f'best_model_{PREDICTOR_TYPE}.joblib')