## 2) Prediction of cognitive variables with EEG features


- Ridge and random forest regression models are generated using one EEG feature and one cognitive variable (i.e., 175 * 12 models)
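- We use repeated random train-test splits (50 repetitions, 33% of the data held out for testing); model hyperparameters are tuned by cross-validation within each training set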
- Out-of-sample prediction scores are calculated using r-squared and RMSE 
- Results are stored in .pkl files in 'results_dir'

Gordillo, da Cruz, Moreno, Garobbio, Herzog

In [None]:
import os
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.base import clone
from sklearn.linear_model import RidgeCV
from dependencies.multicorr import match_et_merge, id_group_match

In [None]:
# Project root: the directory the notebook is launched from
main_dir = os.path.abspath(os.curdir)

In [None]:
# Input data and output results live in sub-folders of the main directory
data_dir, results_dir = (os.path.join(main_dir, sub_dir) for sub_dir in ('data', 'results'))
# make the main directory the working directory
os.chdir(main_dir)

In [None]:
# cognitive (behavioral) variables to predict
beh_vars = ["Cvlt_attention_span", "Cvlt_delayed_memory", "Pts-2_subtest_3",
            "Rwt_animal_categories", "Rwt_s_words", "Tap_alertness",
            "Tap_simon_congruent", "Tap_simon_incongruent", "Tap_working_memory",
            "Tmt-A", "Tmt-B", "Vocabulary_test"]
nbeh = len(beh_vars)
# load the behavior table (first csv column is used as the index)
beh_tasks = pd.read_csv(os.path.join(data_dir, 'behavior_file.csv'), index_col=0)
# eeg features: every .csv file found in 'csv_data' is treated as one feature.
# Only names that END in '.csv' are kept (a substring test would also accept
# e.g. 'x.csv.bak'), and the list is sorted because os.listdir returns entries
# in arbitrary order -- sorting keeps the feature axis of the result arrays
# identical across runs and machines.
path_eeg_csv = os.path.join(data_dir, 'csv_data')
eeg_features = sorted(fname for fname in os.listdir(path_eeg_csv) if fname.endswith('.csv'))
neeg = len(eeg_features)
# display the number of EEG features found
neeg

In [None]:
# Regression models

# Ridge model
# Variables with zero variance are removed, and each variable is power transformed to improve normality.
# RidgeCV then picks the regularization strength among 100 log-spaced alphas between 1e-3 and 1e5.
ridge_model = Pipeline(steps = [('variance', VarianceThreshold()),
                                ('transform', PowerTransformer()),
                                ('ridge', RidgeCV(alphas=np.logspace(-3,5,100)))])

# Random forest model
max_depth = [4, 6, 8, None]
# 1.0 (= consider all features at every split) replaces the former 'auto' option:
# for regressors 'auto' meant exactly that, but it was deprecated in scikit-learn 1.1
# and removed in 1.3, where it makes the grid search fail. 1.0 works on old and new versions.
max_features = ['log2', 'sqrt', 1.0]
search_grid = {'rf__max_features' : max_features,
               'rf__max_depth' :    max_depth}

# Zero-variance variables are removed before fitting the forest (100 trees, fixed seed)
pipe_rf = Pipeline(steps = [('variance', VarianceThreshold()),
                            ('rf', RandomForestRegressor(n_estimators=100, random_state=234))])

# Random forest models are cross-validated using Grid Search and 3-fold cv
# (best combination chosen by mean absolute error, all CPU cores used)
rf_model = GridSearchCV(pipe_rf, param_grid=search_grid, cv=3, scoring='neg_mean_absolute_error', verbose=0, n_jobs = -1)

In [None]:
# Group selector: 'y' runs the analysis for younger adults, 'o' for older adults
idgroup = 'y'

# Number of times the whole train-test procedure is repeated
repeat_n = 50

# Random train-test splitter: one split per repetition, 33% of the rows held out
# for testing, fixed seed so the splits are reproducible
random_sp = ShuffleSplit(
    n_splits=repeat_n,
    test_size=0.33,
    random_state=234,
)

In [None]:
# Pre-allocate the score arrays; every array is indexed as
# [repetition, eeg feature, cognitive variable]
score_shape = (repeat_n, neeg, nbeh)

# Ridge scores
ridge_r2_train, ridge_r2_test, ridge_rmse_train, ridge_rmse_test = (
    np.zeros(score_shape) for _ in range(4)
)

# Random forest scores
rf_r2_train, rf_r2_test, rf_rmse_train, rf_rmse_test = (
    np.zeros(score_shape) for _ in range(4)
)

In [None]:
def _fit_and_score(model, x_train, y_train, x_test, y_test):
    """Fit a fresh clone of `model` on the train split and score both splits.

    The template `model` itself is never fitted, so no state leaks between
    repetitions.

    Returns
    -------
    tuple of float
        (r2_train, r2_test, rmse_train, rmse_test)
    """
    fitted = clone(model).fit(x_train, y_train)
    pred_train = fitted.predict(x_train)
    pred_test = fitted.predict(x_test)
    # RMSE is computed as sqrt(MSE): the 'squared=False' argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn versions,
    # and for a single (1-D) target both forms give the same value.
    return (r2_score(y_train, pred_train),
            r2_score(y_test, pred_test),
            np.sqrt(mean_squared_error(y_train, pred_train)),
            np.sqrt(mean_squared_error(y_test, pred_test)))


# One ridge and one random forest model per (EEG feature, cognitive variable) pair,
# refitted on every random train-test split
for ieeg, eeg_file in enumerate(tqdm(eeg_features)):

    dataeeg = pd.read_csv(os.path.join(path_eeg_csv, eeg_file), index_col=0)

    for itask, task in enumerate(beh_vars):

        # behavior scores of the selected group for this task
        group_data_beh = beh_tasks.loc[beh_tasks['Group'] == id_group_match(idgroup), [task]]
        # second return value is not used in this notebook
        matched_data, nanout = match_et_merge(group_data_beh, dataeeg)
        # matched data to fit: predictors are all columns except the
        # demographic ones and the target itself
        eeg = matched_data.drop(['Group', 'Gender', 'Age', task], axis=1)
        beh = matched_data[task]
        # convert to arrays once instead of once per split
        eeg_values = eeg.values
        beh_values = beh.values

        # Split data into train-test; c_s is the repetition index
        for c_s, (train_index, test_index) in enumerate(random_sp.split(eeg)):

            eeg_train = eeg_values[train_index, :]
            beh_train = beh_values[train_index]
            eeg_test = eeg_values[test_index, :]
            beh_test = beh_values[test_index]

            # Ridge model: fit on train data (with internal cross-validation), score train and test
            (ridge_r2_train[c_s, ieeg, itask],
             ridge_r2_test[c_s, ieeg, itask],
             ridge_rmse_train[c_s, ieeg, itask],
             ridge_rmse_test[c_s, ieeg, itask]) = _fit_and_score(
                ridge_model, eeg_train, beh_train, eeg_test, beh_test)

            # RF model: grid search on train data (with internal cross-validation), score train and test
            (rf_r2_train[c_s, ieeg, itask],
             rf_r2_test[c_s, ieeg, itask],
             rf_rmse_train[c_s, ieeg, itask],
             rf_rmse_test[c_s, ieeg, itask]) = _fit_and_score(
                rf_model, eeg_train, beh_train, eeg_test, beh_test)

In [None]:
# save data in pickle
results_2_dir = os.path.join(results_dir, '2_regression_results')
# create the output folder if needed, so the (long) computation above is not
# lost to a FileNotFoundError when the results are written
os.makedirs(results_2_dir, exist_ok=True)

# feature/task names are stored next to the scores so that the second and third
# axes of every score array ([repetition, eeg feature, task]) can be labelled
results_reg = {"eeg features": eeg_features,
               "tasks": beh_vars,
               "ridge_r2_train": ridge_r2_train,
               "ridge_r2_test": ridge_r2_test,
               "ridge_rmse_train": ridge_rmse_train,
               "ridge_rmse_test": ridge_rmse_test,
               "rf_r2_train": rf_r2_train,
               "rf_r2_test": rf_r2_test,
               "rf_rmse_train": rf_rmse_train,
               "rf_rmse_test": rf_rmse_test}

# one file per age group: '2_regression_y.pkl' or '2_regression_o.pkl'
pklfile_name = '2_regression_' + idgroup + '.pkl'
with open(os.path.join(results_2_dir, pklfile_name), "wb") as f:
    pickle.dump(results_reg, f)