# Random Forrest Predictor for Cognitive Score

In [1]:
import pandas as pd
import numpy as np
from scipy.io import loadmat

# scikit-learn modules
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.ensemble import RandomForestRegressor # for building the model

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, NuSVR
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import json
from joblib import dump

In [2]:
def normalise_correlate_fc(fc):
    fc_emp = fc / np.max(fc)
    fc_emp = np.corrcoef(fc_emp)
    return fc_emp

In [3]:
# evaluate 
def eval(model, x_test_scaled, y_test):
    y_pred = model.predict(x_test_scaled)
    r2_score = round(metrics.r2_score(y_test, y_pred),2)

    print(f'r2: {r2_score}')

In [4]:
ADSP_DATA = '../data/ADSP_PHC_COGN_Dec2023_FILTERED_wfiles.csv'

df_mem = pd.read_csv(ADSP_DATA)
df_exf = pd.read_csv(ADSP_DATA)
df_lan = pd.read_csv(ADSP_DATA)
df_vsp = pd.read_csv(ADSP_DATA)

df_mem = df_mem.drop(columns=['RID', 'VISCODE2', 'PHC_Diagnosis', 'PHC_EXF', 'PHC_LAN', 'PHC_VSP'])
df_exf = df_exf.drop(columns=['RID', 'VISCODE2', 'PHC_Diagnosis', 'PHC_MEM', 'PHC_LAN', 'PHC_VSP'])
df_lan = df_lan.drop(columns=['RID', 'VISCODE2', 'PHC_Diagnosis', 'PHC_EXF', 'PHC_MEM', 'PHC_VSP'])
df_vsp = df_vsp.drop(columns=['RID', 'VISCODE2', 'PHC_Diagnosis', 'PHC_EXF', 'PHC_LAN', 'PHC_MEM'])
df_mem.shape

(1343, 3)

In [5]:
df_lan.dropna(subset=['PHC_LAN'], inplace=True)
df_lan.shape

(1343, 3)

In [6]:
train, test = train_test_split(df_lan, test_size=0.2, random_state=42)

# Get targets for training + testing each predictor
y_train, y_test = train['PHC_LAN'], test['PHC_LAN']

In [7]:
# Get features for training and testing

# Get the FC data as numpy arrays
dim_x = len(train)
x_train = []
x_test = []

for i, file in enumerate(train['FC_DATA'].values):
    arr = loadmat(file)['ROI_activity'][:100, :] # get the first 100 regions
    fc = normalise_correlate_fc(arr)
    x_train.append(fc)

for i, file in enumerate(test['FC_DATA'].values):
    arr = loadmat(file)['ROI_activity'][:100, :] # get the first 100 regions
    fc = normalise_correlate_fc(arr)
    x_test.append(fc)

In [8]:
x_train = np.array(x_train)
x_test = np.array(x_test)

x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

In [9]:
# # Initializing the Random Forest Regression model with 100 decision trees
# base_model_mem = RandomForestRegressor(n_estimators = 1000, random_state = 5)

# # Fitting the Random Forest Regression model to the data
# base_model_mem.fit(x_train, y_train)
# eval(base_model_mem, x_test, y_test)

## Hyperparameter Tuning

In [10]:
# Random Search
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 100)]

# Number of features to consider at every split
max_features = ['log2', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth = [int(x) for x in np.linspace(1, 20, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 119, 138, 157, 176, 195, 215, 234, 253, 272, 291, 311, 330, 349, 368, 387, 407, 426, 445, 464, 483, 503, 522, 541, 560, 579, 598, 618, 637, 656, 675, 694, 714, 733, 752, 771, 790, 810, 829, 848, 867, 886, 906, 925, 944, 963, 982, 1002, 1021, 1040, 1059, 1078, 1097, 1117, 1136, 1155, 1174, 1193, 1213, 1232, 1251, 1270, 1289, 1309, 1328, 1347, 1366, 1385, 1405, 1424, 1443, 1462, 1481, 1501, 1520, 1539, 1558, 1577, 1596, 1616, 1635, 1654, 1673, 1692, 1712, 1731, 1750, 1769, 1788, 1808, 1827, 1846, 1865, 1884, 1904, 1923, 1942, 1961, 1980, 2000], 'max_features': ['log2', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4, 8], 'bootstrap': [True, False]}


In [11]:
# Use the random grid to search for best hyperparameters
PREDICTOR_TYPE='MEM'

# First create the base model to tune
rf = RandomForestRegressor()

# Evaluation metric
r2_scorer = metrics.make_scorer(metrics.r2_score)

# Random search of parameters, using 5 fold cross validation, 

# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator = rf, param_distributions = random_grid, scoring=r2_scorer,
    n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(x_train, y_train)

# ======================================================================

best_model = rf_random.best_estimator_

print("\n The best estimator across ALL searched params:\n", best_model)
print("\n The best score across ALL searched params:\n", rf_random.best_score_)
print("\n The best parameters across ALL searched params:\n", rf_random.best_params_)

# save the model
dump(best_model, f'best_model_{PREDICTOR_TYPE}_random_corr.joblib')

eval(best_model, x_test, y_test)

# Get the best parameter set
details = {}
PARAM_FILE = f'{PREDICTOR_TYPE}_best_params_random_corr.json'

details['params'] = rf_random.best_params_
details['score'] = rf_random.best_score_

# Write data to a JSON file
with open(PARAM_FILE, 'w') as json_file:
    json.dump(details, json_file)
        

Fitting 5 folds for each of 100 candidates, totalling 500 fits

 The best estimator across ALL searched params:
 RandomForestRegressor(bootstrap=False, max_depth=40, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=464)

 The best score across ALL searched params:
 0.06333606319634008

 The best parameters across ALL searched params:
 {'n_estimators': 464, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
r2: -0.06
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1712; total time= 3.2min
[CV] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=8, min_samples_split=2, n_estimators=694; total time=  13.3s
[CV] END bootstrap=False, max_depth=60, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1827; total time=  48.7s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=8, mi