In [None]:
%matplotlib inline

Between-subject SVR (Support Vector Regression) prediction of a latent memory score (computed with PCA on a battery of neuropsychological test scores), based on whole-brain, voxel-wise fMRI contrasts between task conditions (CIMAQ memory encoding task).

Trials (conditions) are classified according to:
task condition (encoding or control task)
memory performance (hit vs miss, correct vs incorrect source)
stimulus category (?)

In [None]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import nilearn
import scipy
import nibabel as nb
import sklearn
import seaborn as sns
import itertools

from numpy import nan as NaN
from matplotlib import pyplot as plt
from nilearn import image, plotting
from nilearn import masking
from nilearn import plotting
from nilearn import datasets
from nilearn.plotting import plot_stat_map, plot_roi, plot_anat, plot_img, show
from nilearn.input_data import NiftiMasker
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import MinMaxScaler


Step 1: import list of participants, and generate sublists of participants who have enough trials per category to have proper contrasts between conditions 

1. Encoding vs Control tasks contrast (all 94 participants)

3. Hit versus Miss contrast (? participants; at least 10 trials per condition)

4. Correct Source versus Wrong Source (? participants; at least 10 trials per condition)

5. Correct Source versus Miss (? participants; at least 10 trials per condition)


In [None]:
# Participant table (tab-separated). The columns read below are QC_status,
# participant_id, cognitive_status, Fac1_memory and the per-condition trial
# counts hits, miss, correct_source and wrong_source.
data_file = '/Users/mombot/Documents/Simexp/CIMAQ/Data/Participants/Splitting/Sub_list.tsv'
sub_data = pd.read_csv(data_file, sep='\t')

# Keep only the rows whose QC_status is not 'F' (participants who failed QC)
sub_data = sub_data.loc[sub_data['QC_status'] != 'F']

# Trial-count cutoff: a participant enters a contrast only when BOTH of its
# conditions have strictly more than `num` trials (i.e. at least 10 when the
# counts are integers).
num = 9

# Columns extracted for every analysis, in this order:
# subject id, diagnosis, latent memory score.
_per_subject_cols = ('participant_id', 'cognitive_status', 'Fac1_memory')

# Encoding vs Control, and Stimulus Category classifications:
# every participant who passed QC, no trial-count filter.
all_subs, all_diagnosis, all_memScore = (sub_data[col] for col in _per_subject_cols)
print(all_subs, len(all_subs), sep='\n')

# Hit versus Miss: enough hit trials AND enough miss trials
hm_data = sub_data[(sub_data['hits'] > num) & (sub_data['miss'] > num)]
hm_subs, hm_diagnosis, hm_memScore = (hm_data[col] for col in _per_subject_cols)
print(hm_subs, len(hm_subs), sep='\n')

# Correct Source versus Wrong Source: enough trials of both source outcomes
cw_data = sub_data[(sub_data['correct_source'] > num) & (sub_data['wrong_source'] > num)]
cw_subs, cw_diagnosis, cw_memScore = (cw_data[col] for col in _per_subject_cols)
print(cw_subs, len(cw_subs), sep='\n')

# Correct Source versus Miss: enough correct-source trials AND enough misses
cmiss_data = sub_data[(sub_data['correct_source'] > num) & (sub_data['miss'] > num)]
cmiss_subs, cmiss_diagnosis, cmiss_memScore = (cmiss_data[col] for col in _per_subject_cols)
print(cmiss_subs, len(cmiss_subs), sep='\n')


Step 2: For each subject list (analysis), create a group mask from individual functional mri masks.

The mask was originally meant to include only voxels present in every participant's individual functional mask (strict intersection). The mask serves to vectorize 3D beta weight maps into feature rows.
**Update: use a threshold of 0.5 (a voxel is kept when it is present in more than half of the individual masks); otherwise too many voxels are lost to signal dropout.**


In [None]:
# Anatomical template for display
anat = '/Users/mombot/Documents/Simexp/CIMAQ/Data/Templates/template_anat_stereo.nii'

# Path to directory with masks (one functional mask file per participant)
mask_dir = '/Users/mombot/Documents/Simexp/CIMAQ/Data/masks'


def _subject_mask_paths(subjects):
    """Return the functional-mask file path of every subject in ``subjects``.

    Each path is ``<mask_dir>/func_sub<id>_mask_stereonl.nii``. The files are
    neither opened nor checked for existence here.
    """
    return [os.path.join(mask_dir, 'func_sub' + str(sub_id) + '_mask_stereonl.nii')
            for sub_id in subjects]


def _group_mask(mask_paths):
    """Print how many individual masks are combined, then build the group mask.

    The masks are combined with ``masking.intersect_masks`` using
    ``threshold=0.5`` and ``connected=True``.
    """
    print(len(mask_paths))
    return masking.intersect_masks(mask_imgs=mask_paths, threshold=0.5, connected=True)


# All participants (94 participants)
all_mask_list = _subject_mask_paths(all_subs)
grp_mask_all = _group_mask(all_mask_list)

# Hit versus miss (49 participants)
hm_mask_list = _subject_mask_paths(hm_subs)
grp_mask_hm = _group_mask(hm_mask_list)

# Correct Source versus Wrong Source (49 participants)
cw_mask_list = _subject_mask_paths(cw_subs)
grp_mask_cw = _group_mask(cw_mask_list)

# Correct Source versus Miss (38 participants)
cmiss_mask_list = _subject_mask_paths(cmiss_subs)
grp_mask_cmiss = _group_mask(cmiss_mask_list)

# Optional visual check of a group mask (swap in grp_mask_hm, grp_mask_cw or
# grp_mask_cmiss as needed):
# plotting.plot_roi(roi_img=grp_mask_all, bg_img=anat, cut_coords=(0, -7, -7), cmap='Paired')
# plotting.view_img(grp_mask_all, bg_img=anat, resampling_interpolation='nearest')


Step 3: For each categorization, randomly assign and split participants into a training set and a test set.

Note: stratify to maintain comparable proportions of Cognitively Normal (Controls), Subjective Cognitive Disorder (SCD) and Mild Cognitive Impairment (MCI) participants between the testing and training sets.


In [None]:

# Settings shared by all four splits: 60%/40% split between train and test,
# with the rows shuffled before splitting.
_split_opts = dict(test_size=0.4, shuffle=True)

# Encoding vs Control Task Conditions
(enc_ctl_train, enc_ctl_test,
 y_enc_ctl_train, y_enc_ctl_test) = train_test_split(
    all_subs,                 # subjects to split
    all_memScore,             # memory scores, split in parallel with the subjects
    stratify=all_diagnosis,   # keep consistent proportions of Controls, SCDs and MCIs between sets
    random_state=123,
    **_split_opts)

print('enc_ctl training subjects:', len(enc_ctl_train),
      'enc_ctl training scores:', len(y_enc_ctl_train),
      'enc_ctl testing subjects:', len(enc_ctl_test),
      'enc_ctl testing scores:', len(y_enc_ctl_test))


# Hit vs Miss Trials
(hit_miss_train, hit_miss_test,
 y_hit_miss_train, y_hit_miss_test) = train_test_split(
    hm_subs,                  # subjects to split
    hm_memScore,              # memory scores, split in parallel with the subjects
    stratify=hm_diagnosis,    # keep consistent proportions of Controls, SCDs and MCIs between sets
    random_state=52,
    **_split_opts)

print('hit_miss training subjects:', len(hit_miss_train),
      'hit_miss training scores:', len(y_hit_miss_train),
      'hit_miss testing subjects:', len(hit_miss_test),
      'hit_miss testing scores:', len(y_hit_miss_test))


# Correct Source vs Wrong Source Trials
(cs_ws_train, cs_ws_test,
 y_cs_ws_train, y_cs_ws_test) = train_test_split(
    cw_subs,                  # subjects to split
    cw_memScore,              # memory scores, split in parallel with the subjects
    stratify=cw_diagnosis,    # keep consistent proportions of Controls, SCDs and MCIs between sets
    random_state=46,
    **_split_opts)

print('cs_ws training subjects:', len(cs_ws_train),
      'cs_ws training scores:', len(y_cs_ws_train),
      'cs_ws testing subjects:', len(cs_ws_test),
      'cs_ws testing scores:', len(y_cs_ws_test))


# Correct Source vs Miss Trials
(cs_miss_train, cs_miss_test,
 y_cs_miss_train, y_cs_miss_test) = train_test_split(
    cmiss_subs,               # subjects to split
    cmiss_memScore,           # memory scores, split in parallel with the subjects
    stratify=cmiss_diagnosis, # keep consistent proportions of Controls, SCDs and MCIs between sets
    random_state=103,
    **_split_opts)

print('cs_miss training subjects:', len(cs_miss_train),
      'cs_miss training scores:', len(y_cs_miss_train),
      'cs_miss testing subjects:', len(cs_miss_test),
      'cs_miss testing scores:', len(y_cs_miss_test))


Step 4: Build training and testing feature matrices

For each participant:
- With nilearn's NiftiMasker, vectorize the single 3D beta map of the task contrast (computed in Nistats, 1st-level model) to derive features for classification. The NiftiMasker converts 4D beta images into a 2D vectorized data matrix (each 3D beta map becomes a 1D vector; rows = trials, columns = voxels) as input for machine learning.
Here, there is a single row of features per participant.
- concatenate the participant's fmri and label data into two matrices (fmri and labels). There should be two matrices per set (train and test) per analysis.

Note: 
The NiftiMasker converts 4D beta images into a 2D vectorized data matrix (each 3D beta map becomes a 1D vector; rows = trials, columns = voxels) as input for machine learning.

Masking: using a group mask built from the intersection of normalized functional MRI data masks (outputted by NIAK), to determine which voxels to include in the final data matrix

In [None]:
# Purpose of this cell: pre-allocate the train/test feature matrices and the
# label tables for the Encoding vs Control analysis, and create the masker
# that vectorizes beta maps.

# set paths to directories of interest
# NOTE(review): beta_dir is assigned here but not read in the visible part of
# this cell.
beta_dir = '/Users/mombot/Documents/Simexp/CIMAQ/Data/Nistats/Betas'


# ENCODING VERSUS CONTROL TASK CONTRAST

#####TO FIX!!!
# NOTE(review): known problems to resolve before this cell is relied on:
#  - `label_dir` is read below but is not assigned in this cell or in any
#    cell above it, so a fresh-kernel run raises NameError at the first
#    os.path.join call.
#  - `y_enc_ctl_train` / `y_enc_ctl_test` are rebound in step 4 to empty
#    DataFrames, which replaces the memory scores that train_test_split
#    stored under the same names in the Step 3 cell.
#  - The matrices are sized per trial (one row per trial), whereas the Step 4
#    description calls for a single row of features per participant.

# For each set (training and test), create an empty numpy array to store 
# concatenated vectorized beta maps (one row per trial; size = trials * voxels).
# 1. determine the number of rows needed (sum of trials per participants in set)
numrow_train = 0
numrow_test = 0
# Each subject's label file contributes its row count to the training total.
for sub in enc_ctl_train:
    labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_enco_ctl.tsv')
    y_enco_ctl = pd.read_csv(labels_file, sep='\t')
    numrow_train = numrow_train + y_enco_ctl.shape[0]

# Same accumulation for the test subjects.
for sub in enc_ctl_test:
    labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_enco_ctl.tsv')
    y_enco_ctl = pd.read_csv(labels_file, sep='\t')
    numrow_test = numrow_test + y_enco_ctl.shape[0]

print('number of trials in the training set: ', numrow_train,
     'number of trials in the test set: ', numrow_test)

# 2. determine the number of columns needed (voxels in the vectorized group mask)
# The group mask image is itself passed through a non-standardizing masker and
# the column count of the resulting 2D array is taken as the voxel count.
masker = NiftiMasker(mask_img=grp_mask_all, standardize=False)
numvox = masker.fit_transform(grp_mask_all).shape[1]

# 3. create an empty numpy array to store the data 
# np.empty does not initialize its contents: every row has to be written
# before these arrays are used.
X_enc_ctl_train = np.empty(shape=(numrow_train, numvox))
X_enc_ctl_test = np.empty(shape=(numrow_test, numvox))
# X_enc_ctl_train = np.zeros(shape=(0, numvox))
# X_enc_ctl_test = np.zeros(shape=(0, numvox))

print(X_enc_ctl_train.shape, X_enc_ctl_test.shape)

# 4. create empty dataframes to store trial labels (one per set)
# Three columns (condition, dccid, trialnum) are inserted into a frame that has
# no rows; presumably this yields empty columns and the scalar placeholder
# values never appear -- TODO confirm.
y_enc_ctl_train = pd.DataFrame()
y_enc_ctl_train.insert(loc = 0, column = 'condition', value = 'TBD', allow_duplicates=True)
y_enc_ctl_train.insert(loc = 1, column = 'dccid', value = 'TBD', allow_duplicates=True)
y_enc_ctl_train.insert(loc = 2, column = 'trialnum', value = 'NaN', allow_duplicates=True)

# The test-set label table starts as an independent copy of the training one.
y_enc_ctl_test = y_enc_ctl_train.copy()

# 5. Create a masker object to vectorize beta maps; 
# one map per trial becomes its own row in X_data matrix
# Unlike the voxel-counting masker in step 2, this one is created with
# standardize=True on the same group mask.
enc_ctl_masker = NiftiMasker(mask_img=grp_mask_all, standardize=True)

