In [4]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

Between-subject SVM classification based on entire brain's voxels for CIMAQ memory encoding task (fMRI data).
Trials (conditions) are classified according to:
- task condition (encoding or control task)
- memory performance (hit vs miss, correct vs incorrect source)
- stimulus category (?)

In [5]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import nilearn
import scipy
import nibabel as nb
import sklearn
import seaborn as sns
import itertools

from numpy import nan as NaN
from matplotlib import pyplot as plt
from nilearn import image, plotting
from nilearn import masking
from nilearn import plotting
from nilearn import datasets
from nilearn.plotting import plot_stat_map, plot_roi, plot_anat, plot_img, show
from nilearn.input_data import NiftiMasker
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import MinMaxScaler


Step 1: import list of participants, and generate sublists of participants who have enough trials per category for each classification

1. Encoding vs Control task conditions (all 94)

2. Stimulus category (all 94)

3. Hit versus Miss (42 participants; at least 15 trials per condition)

4. Correct Source versus Wrong Source (49 participants; at least 15 trials per condition)

5. Correct Source versus Miss (38 participants; at least 15 trials per condition)


In [6]:
# Participant list: one row per participant, with QC status, cognitive status
# and the number of trials per memory outcome
data_file = '/Users/mombot/Documents/Simexp/CIMAQ/Data/Participants/Splitting/Sub_list.tsv'
sub_data = pd.read_csv(data_file, sep = '\t')

# Exclude participants who failed QC
passed_qc = sub_data['QC_status'] != 'F'
sub_data = sub_data[passed_qc]

# Minimal number of trials needed per subject to include them in an analysis:
# a participant is kept when the trial count is strictly greater than `num`
# (i.e. at least 15 trials per condition)
num = 14

# One boolean mask per trial type, reused by the three memory-based analyses
enough_hits = sub_data['hits'] > num
enough_miss = sub_data['miss'] > num
enough_correct_source = sub_data['correct_source'] > num
enough_wrong_source = sub_data['wrong_source'] > num

# Encoding vs Control, and Stimulus Category classifications (every participant who passed QC)
all_subs = sub_data['participant_id']
all_diagnosis = sub_data['cognitive_status']
print(all_subs)
print(len(all_subs))

# Hit versus Miss
hm_data = sub_data[enough_hits & enough_miss]
hm_subs = hm_data['participant_id']
hm_diagnosis = hm_data['cognitive_status']
print(hm_subs)
print(len(hm_subs))

# Correct Source versus Wrong Source
cw_data = sub_data[enough_correct_source & enough_wrong_source]
cw_subs = cw_data['participant_id']
cw_diagnosis = cw_data['cognitive_status']
print(cw_subs)
print(len(cw_subs))

# Correct Source versus Miss
cmiss_data = sub_data[enough_correct_source & enough_miss]
cmiss_subs = cmiss_data['participant_id']
cmiss_diagnosis = cmiss_data['cognitive_status']
print(cmiss_subs)
print(len(cmiss_subs))


0      108391
1      120839
2      122922
3      127228
4      139593
6      147863
7      150649
8      164965
9      175295
10     178101
11     189005
12     197192
14     199801
15     219637
16     229301
17     247659
18     254402
19     255499
20     258618
21     258912
22     267168
23     270218
24     271596
27     314409
28     326073
29     336665
30     337021
31     350555
32     370092
34     385370
        ...  
70     763590
71     778749
72     783781
73     785217
74     785245
75     804743
77     845675
78     866812
79     878354
80     884343
81     886007
83     893978
85     901551
86     906145
87     914042
88     915022
89     920577
90     932933
91     936730
92     938001
93     955548
94     956049
95     956130
96     968913
97     974246
98     979001
99     983291
100    988602
101    996599
102    998166
Name: participant_id, Length: 94, dtype: int64
94
0      108391
2      122922
4      139593
8      164965
14     199801
17     247659
19     25549

Step 2: For each subject list (analysis), create a group mask from individual functional mri masks.  

The mask should only include voxels included in all participants' individual functional masks (intersection). The mask will serve to vectorize 3D beta weight maps into feature rows. 

**Update: use a 0.5 threshold rather than a strict intersection, otherwise there is too much signal dropout**

In [8]:
# Anatomical template for display
anat = '/Users/mombot/Documents/Simexp/CIMAQ/Data/Templates/template_anat_stereo.nii'

# Path to directory with masks
mask_dir = '/Users/mombot/Documents/Simexp/CIMAQ/Data/masks'


def list_func_masks(subs):
    """Return the path to each participant's functional mask, in the order of `subs`."""
    return [os.path.join(mask_dir, 'func_sub'+str(sub)+'_mask_stereonl.nii')
            for sub in subs]


def build_group_mask(mask_list):
    """Combine individual functional masks into a single group mask.

    threshold=0.5 is a deliberate relaxation of the strict intersection
    (threshold=1; threshold=0 would be the union), chosen because the strict
    intersection lost too much signal. connected=True keeps only the largest
    connected component of the resulting mask.
    """
    return masking.intersect_masks(mask_imgs=mask_list, threshold=0.5, connected=True)


# All participants (94 participants)
all_mask_list = list_func_masks(all_subs)
print(len(all_mask_list))
grp_mask_all = build_group_mask(all_mask_list)

# Hit versus miss (42 participants)
hm_mask_list = list_func_masks(hm_subs)
print(len(hm_mask_list))
grp_mask_hm = build_group_mask(hm_mask_list)

# Correct Source versus Wrong Source (49 participants)
cw_mask_list = list_func_masks(cw_subs)
print(len(cw_mask_list))
grp_mask_cw = build_group_mask(cw_mask_list)

# Correct Source versus Miss (38 participants)
cmiss_mask_list = list_func_masks(cmiss_subs)
print(len(cmiss_mask_list))
grp_mask_cmiss = build_group_mask(cmiss_mask_list)

# Optional visual check of any group mask (replace grp_mask_all as needed):
# plotting.plot_roi(roi_img=grp_mask_all, bg_img=anat, cut_coords=(0, -7, -7), cmap='Paired')
# plotting.view_img(grp_mask_all, bg_img=anat, resampling_interpolation='nearest')



94
42
49
38


Step 3: For each categorization, randomly assign and split participants into 
a **training** set and a **test** set

Note: stratify to maintain comparable proportions of Cognitively Normal (Controls), Subjective Cognitive Disorder (SCD) and Mild Cognitive Impairment (MCI) participants between the testing and training sets 

In [10]:
# Settings shared by the four splits: 60%/40% split between train and test,
# with participants shuffled before splitting. Each split is stratified on
# cognitive status to keep consistent proportions of Controls, SCDs and MCIs
# between sets, and uses its own fixed random_state.
split_args = dict(test_size=0.4, shuffle=True)

# Encoding vs Control Task Conditions
enc_ctl_train, enc_ctl_test = train_test_split(
    all_subs, stratify=all_diagnosis, random_state=123, **split_args)
print('enc_ctl training:', len(enc_ctl_train),
      'enc_ctl testing:', len(enc_ctl_test))

# Hit vs Miss Trials
hit_miss_train, hit_miss_test = train_test_split(
    hm_subs, stratify=hm_diagnosis, random_state=52, **split_args)
print('hit_miss training:', len(hit_miss_train),
      'hit_miss testing:', len(hit_miss_test))

# Correct Source vs Wrong Source Trials
cs_ws_train, cs_ws_test = train_test_split(
    cw_subs, stratify=cw_diagnosis, random_state=46, **split_args)
print('cs_ws training:', len(cs_ws_train),
      'cs_ws testing:', len(cs_ws_test))

# Correct Source vs Miss Trials
cs_miss_train, cs_miss_test = train_test_split(
    cmiss_subs, stratify=cmiss_diagnosis, random_state=103, **split_args)
print('cs_miss training:', len(cs_miss_train),
      'cs_miss testing:', len(cs_miss_test))



enc_ctl training: 56 enc_ctl testing: 38
hit_miss training: 25 hit_miss testing: 17
cs_ws training: 29 cs_ws testing: 20
cs_miss training: 22 cs_miss testing: 16


Step 4: Build training and testing feature matrices

For each participant:
- vectorize 3D beta maps (loaded in temporal order) with nilearn's NiftiMasker to derive features for classification. The NiftiMasker converts 4D beta-images into a 2D vectorized data matrix (each 3D beta map becomes a 1D vector; rows = trials, columns = voxels) as input for machine learning.
- load the trial labels
- mask the data (fmri and labels) that correspond to trials of interest
- concatenate the participant's fmri and label data into two matrices (fmri and labels). There should be two matrices per set (train and test) per analysis.

Note: 
The NiftiMasker converts 4D beta-images into a 2D vectorized data matrix (each 3D beta map becomes a 1D vector; rows = trials, columns = voxels) as input for machine learning.

Masking: using a group mask built from the intersection of normalized functional MRI data masks (outputted by NIAK), to determine which voxels to include in the final data matrix


In [11]:
# set paths to directories of interest
# The data root defaults to the original local path; set the CIMAQ_DATA_DIR
# environment variable to run this notebook against a copy of the data
# stored elsewhere.
data_root = os.environ.get('CIMAQ_DATA_DIR', '/Users/mombot/Documents/Simexp/CIMAQ/Data')
beta_dir = os.path.join(data_root, 'Nistats', 'Betas')       # trial-wise beta maps, one sub-directory per participant
label_dir = os.path.join(data_root, 'Nistats', 'Events')     # per-participant trial label files (.tsv)
output_dir = os.path.join(data_root, 'Nilearn', 'features')

In [12]:

# ENCODING VERSUS CONTROL TASK CLASSIFICATION

# Pre-allocate, for each set (training and test), the numpy array that will hold
# the concatenated vectorized beta maps (one row per trial; size = trials * voxels).

# 1. number of rows = sum of the trials listed in each participant's label file
enc_ctl_trial_counts = {}
for set_name, set_subs in (('train', enc_ctl_train), ('test', enc_ctl_test)):
    set_rows = 0
    for sub in set_subs:
        labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_enco_ctl.tsv')
        y_enco_ctl = pd.read_csv(labels_file, sep='\t')
        set_rows += y_enco_ctl.shape[0]
    enc_ctl_trial_counts[set_name] = set_rows
numrow_train = enc_ctl_trial_counts['train']
numrow_test = enc_ctl_trial_counts['test']

print('number of trials in the training set: ', numrow_train,
      'number of trials in the test set: ', numrow_test)

# 2. number of columns = number of voxels kept by the group mask
#    (obtained by vectorizing the mask image with itself)
masker = NiftiMasker(mask_img=grp_mask_all, standardize=False)
numvox = masker.fit_transform(grp_mask_all).shape[1]

# 3. allocate the (uninitialized) arrays that the next cell fills row by row
X_enc_ctl_train = np.empty(shape=(numrow_train, numvox))
X_enc_ctl_test = np.empty(shape=(numrow_test, numvox))

print(X_enc_ctl_train.shape, X_enc_ctl_test.shape)

# 4. empty dataframes (columns only, no rows) to store trial labels, one per set
y_enc_ctl_train = pd.DataFrame()
for col_position, (col_name, placeholder) in enumerate(
        [('condition', 'TBD'), ('dccid', 'TBD'), ('trialnum', 'NaN')]):
    y_enc_ctl_train.insert(loc=col_position, column=col_name,
                           value=placeholder, allow_duplicates=True)

y_enc_ctl_test = y_enc_ctl_train.copy()

# 5. masker object used to vectorize the beta maps (standardize=True);
# one map per trial becomes its own row in the X data matrix
enc_ctl_masker = NiftiMasker(mask_img=grp_mask_all, standardize=True)


number of trials in the training set:  6526 number of trials in the test set:  4420
(6526, 70015) (4420, 70015)


In [13]:

# 6. Fill the X (beta weights per voxel) and y (trial labels) data matrices
#
# The training and testing sets go through the same loop. Per participant, the
# trial-wise beta maps are vectorized into rows of the set's pre-allocated X
# array and the matching labels are collected. Labels are concatenated once per
# set, so re-running this cell rebuilds y from scratch instead of appending to
# the result of a previous run.
#
# note: nilearn.image.load_img concatenates 3D beta maps in alphabetical order
# trial numbers must be PADDED with zeros to preserve their temporal order when alphabetized

enc_ctl_sets = [
    ('training', enc_ctl_train, X_enc_ctl_train),
    ('testing', enc_ctl_test, X_enc_ctl_test),
]
enc_ctl_label_frames = {}

for set_name, set_subs, X_set in enc_ctl_sets:
    j = 0       # index of the next empty row in X_set
    y_rows = 0  # number of label rows collected so far for this set
    sub_label_frames = []

    for sub in set_subs:
        print(sub)
        # load and vectorize the participant's beta maps (rows = trials, columns = voxels)
        betas = image.load_img(img=os.path.join(beta_dir, str(sub), 'TrialContrasts/betas_sub'+str(sub)+'*.nii'),
                               wildcards=True)
        sub_trials = enc_ctl_masker.fit_transform(betas)

        # load the participant's trial labels
        labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_enco_ctl.tsv')
        y_enco_ctl = pd.read_csv(labels_file, sep='\t')

        # rows of X and y are matched by position only: refuse to continue if
        # the number of beta maps and the number of labelled trials differ
        if sub_trials.shape[0] != y_enco_ctl.shape[0]:
            raise ValueError('sub ' + str(sub) + ': ' + str(sub_trials.shape[0])
                             + ' beta maps but ' + str(y_enco_ctl.shape[0]) + ' trial labels')

        X_set[j:(j+sub_trials.shape[0]), :] = sub_trials
        j = j + sub_trials.shape[0]
        print('number of X filled rows: ', j,
              'subject X_shape:', sub_trials.shape,
              'total X_shape:', X_set.shape)

        # tag each trial with the participant id and its 1-based trial number
        trialnum = y_enco_ctl.index
        y_enco_ctl.insert(loc = y_enco_ctl.shape[1], column = 'dccid',
                          value = sub, allow_duplicates=True)
        y_enco_ctl.insert(loc = y_enco_ctl.shape[1], column = 'trialnum',
                          value = trialnum+1, allow_duplicates=True)
        sub_label_frames.append(y_enco_ctl)
        y_rows = y_rows + y_enco_ctl.shape[0]
        print('subject y_shape:', y_enco_ctl.shape,
              'total y_shape:', (y_rows, y_enco_ctl.shape[1]))
        print(y_enco_ctl.condition.value_counts())

    # X_set was allocated with np.empty: any unfilled row would hold arbitrary values
    if j != X_set.shape[0]:
        raise ValueError(set_name + ' set: filled ' + str(j) + ' of '
                         + str(X_set.shape[0]) + ' pre-allocated X rows')

    if sub_label_frames:
        enc_ctl_label_frames[set_name] = pd.concat(sub_label_frames, ignore_index=True)
    else:
        enc_ctl_label_frames[set_name] = pd.DataFrame(columns=['condition', 'dccid', 'trialnum'])
    print('The ' + set_name + ' data set is built!')

y_enc_ctl_train = enc_ctl_label_frames['training']
y_enc_ctl_test = enc_ctl_label_frames['testing']

# 7. extract the label columns from the y data dataframes (to input model)
y_enco_ctl_labels_train = y_enc_ctl_train['condition']
y_enco_ctl_labels_test = y_enc_ctl_test['condition']
 

878354
number of X filled rows:  117 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (117, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
955548
number of X filled rows:  234 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (234, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
983291
number of X filled rows:  351 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (351, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
968913
number of X filled rows:  468 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (468, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
956049
number of X filled rows:  585 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (585, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
893978
number of X f

number of X filled rows:  5013 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (5013, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
763590
number of X filled rows:  5126 subject X_shape: (113, 70015) total X_shape: (6526, 70015)
subject y_shape: (113, 3) total y_shape: (5126, 3)
Enc    75
CTL    38
Name: condition, dtype: int64
884343
number of X filled rows:  5243 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (5243, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
652850
number of X filled rows:  5360 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (5360, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
956130
number of X filled rows:  5477 subject X_shape: (117, 70015) total X_shape: (6526, 70015)
subject y_shape: (117, 3) total y_shape: (5477, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
502616
number of 

number of X filled rows:  3377 subject X_shape: (117, 70015) total X_shape: (4420, 70015)
subject y_shape: (117, 3) total y_shape: (3377, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
229301
number of X filled rows:  3490 subject X_shape: (113, 70015) total X_shape: (4420, 70015)
subject y_shape: (113, 3) total y_shape: (3490, 3)
Enc    75
CTL    38
Name: condition, dtype: int64
932933
number of X filled rows:  3607 subject X_shape: (117, 70015) total X_shape: (4420, 70015)
subject y_shape: (117, 3) total y_shape: (3607, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
108391
number of X filled rows:  3724 subject X_shape: (117, 70015) total X_shape: (4420, 70015)
subject y_shape: (117, 3) total y_shape: (3724, 3)
Enc    78
CTL    39
Name: condition, dtype: int64
462345
number of X filled rows:  3839 subject X_shape: (115, 70015) total X_shape: (4420, 70015)
subject y_shape: (115, 3) total y_shape: (3839, 3)
Enc    76
CTL    39
Name: condition, dtype: int64
915022
number of 

In [97]:

# HITS VERSUS MISS TRIAL CLASSIFICATION

## Important: keep only the trials of interest (HIT or MISSED, not CONTROL)
## To filter trials, create a mask from the labels column (extracted from labels file)
## Apply this mask to the 2D fMRI data matrix: keep only hit or missed trials (rows)
## Apply the same mask to the labels matrix (exclude labels of no interest)

# Pre-allocate, for each set (training and test), the numpy array that will hold
# the concatenated vectorized beta maps (one row per trial; size = trials * voxels).

# 1. number of rows = sum of the hit and missed trials of each participant in the set
hit_miss_trial_counts = {}
for set_name, set_subs in (('train', hit_miss_train), ('test', hit_miss_test)):
    set_rows = 0
    for sub in set_subs:
        labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_ctl_miss_hit.tsv')
        y_hit_miss_ctl = pd.read_csv(labels_file, sep='\t')
        y_hit_miss_ctl_labels = y_hit_miss_ctl['ctl_miss_hit']
        hit_miss_mask = y_hit_miss_ctl_labels.isin(['hit', 'missed'])
        y_hit_miss = y_hit_miss_ctl[hit_miss_mask]
        set_rows += y_hit_miss.shape[0]
    hit_miss_trial_counts[set_name] = set_rows
numrow_train = hit_miss_trial_counts['train']
numrow_test = hit_miss_trial_counts['test']

print('number of trials in the training set: ', numrow_train,
      'number of trials in the test set: ', numrow_test)

# 2. number of columns = number of voxels kept by the group mask
#    (obtained by vectorizing the mask image with itself)
masker = NiftiMasker(mask_img=grp_mask_hm, standardize=False)
numvox = masker.fit_transform(grp_mask_hm).shape[1]

# 3. allocate the (uninitialized) arrays that the next cell fills row by row
X_hit_miss_train = np.empty(shape=(numrow_train, numvox))
X_hit_miss_test = np.empty(shape=(numrow_test, numvox))

print(X_hit_miss_train.shape, X_hit_miss_test.shape)

# 4. empty dataframes (columns only, no rows) to store trial labels, one per set
y_hit_miss_train = pd.DataFrame()
for col_position, (col_name, placeholder) in enumerate(
        [('ctl_miss_hit', 'TBD'), ('dccid', 'TBD'), ('trialnum', 'NaN')]):
    y_hit_miss_train.insert(loc=col_position, column=col_name,
                            value=placeholder, allow_duplicates=True)

y_hit_miss_test = y_hit_miss_train.copy()

# 5. masker object used to vectorize the beta maps (standardize=True);
# one map per trial becomes its own row in the X data matrix
hit_miss_masker = NiftiMasker(mask_img=grp_mask_hm, standardize=True)


number of trials in the training set:  1928 number of trials in the test set:  1319
(1928, 69835) (1319, 69835)


In [98]:

# 6. Fill the X (beta weights per voxel) and y (trial labels) data matrices
#
# The training and testing sets go through the same loop. Per participant, the
# labels are filtered to hit/missed trials, and the same boolean mask selects
# the matching rows of the vectorized beta maps. Labels are concatenated (and
# the label column renamed) once per set, so re-running this cell rebuilds y
# from scratch instead of appending to the result of a previous run.
#
# note: nilearn.image.load_img concatenates 3D beta maps in alphabetical order
# trial numbers must be PADDED with zeros to preserve their temporal order when alphabetized

hit_miss_sets = [
    ('training', hit_miss_train, X_hit_miss_train),
    ('testing', hit_miss_test, X_hit_miss_test),
]
hit_miss_label_frames = {}

for set_name, set_subs, X_set in hit_miss_sets:
    j = 0       # index of the next empty row in X_set
    y_rows = 0  # number of label rows collected so far for this set
    sub_label_frames = []

    for sub in set_subs:
        print(sub)
        # load labels file as pandas dataframe
        labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_ctl_miss_hit.tsv')
        y_hit_miss_ctl = pd.read_csv(labels_file, sep='\t')

        # create a boolean mask to filter out trials of no interest
        y_hit_miss_ctl_labels = y_hit_miss_ctl['ctl_miss_hit']
        hit_miss_mask = y_hit_miss_ctl_labels.isin(['hit', 'missed'])

        # keep only hit and missed trials (explicit copy: columns are added below)
        y_hit_miss = y_hit_miss_ctl[hit_miss_mask].copy()

        # tag each kept trial with the participant id and its 1-based trial
        # number (position in the unfiltered labels file + 1)
        trialnum = y_hit_miss.index
        y_hit_miss.insert(loc = y_hit_miss.shape[1], column = 'dccid',
                          value = sub, allow_duplicates=True)
        y_hit_miss.insert(loc = y_hit_miss.shape[1], column = 'trialnum',
                          value = trialnum+1, allow_duplicates=True)
        sub_label_frames.append(y_hit_miss)
        y_rows = y_rows + y_hit_miss.shape[0]
        print(y_hit_miss.ctl_miss_hit.value_counts())
        print('subject y_shape:', y_hit_miss.shape,
              'total y_shape:', (y_rows, y_hit_miss.shape[1]))

        # load and concatenate beta maps
        betas = image.load_img(img=os.path.join(beta_dir, str(sub), 'TrialContrasts/betas_sub'+str(sub)+'*.nii'),
                               wildcards=True)
        # vectorize beta maps into a 2D numpy array (all trials, before filtering)
        sub_trials = hit_miss_masker.fit_transform(betas)

        # the mask is applied by position: refuse to continue if the number of
        # beta maps and the number of labelled trials differ
        if sub_trials.shape[0] != len(hit_miss_mask):
            raise ValueError('sub ' + str(sub) + ': ' + str(sub_trials.shape[0])
                             + ' beta maps but ' + str(len(hit_miss_mask)) + ' trial labels')

        # mask array to filter out trials of no interest
        hm_sub_trials = sub_trials[hit_miss_mask.values]

        # copy filtered vectorized values into set's X data array
        X_set[j:(j+hm_sub_trials.shape[0]), :] = hm_sub_trials
        j = j + hm_sub_trials.shape[0]
        print('number of X filled rows: ', j,
              'subject X_shape:', hm_sub_trials.shape,
              'total X_shape:', X_set.shape)

    # X_set was allocated with np.empty: any unfilled row would hold arbitrary values
    if j != X_set.shape[0]:
        raise ValueError(set_name + ' set: filled ' + str(j) + ' of '
                         + str(X_set.shape[0]) + ' pre-allocated X rows')

    # 7a. concatenate the set's labels and rename the labels column
    if sub_label_frames:
        hit_miss_label_frames[set_name] = pd.concat(
            sub_label_frames, ignore_index=True).rename(columns={'ctl_miss_hit': 'miss_hit'})
    else:
        hit_miss_label_frames[set_name] = pd.DataFrame(columns=['miss_hit', 'dccid', 'trialnum'])
    print('The ' + set_name + ' data set is built!')

y_hit_miss_train = hit_miss_label_frames['training']
y_hit_miss_test = hit_miss_label_frames['testing']

# 7b. extract the labels column from the sets' y dataframes (to input model)
y_hit_miss_labels_train = y_hit_miss_train['miss_hit']
y_hit_miss_labels_test = y_hit_miss_test['miss_hit']


258912
hit       59
missed    15
Name: ctl_miss_hit, dtype: int64
subject y_shape: (74, 3) total y_shape: (74, 3)
number of X filled rows:  74 subject X_shape: (74, 69835) total X_shape: (1928, 69835)
437101
hit       55
missed    23
Name: ctl_miss_hit, dtype: int64
subject y_shape: (78, 3) total y_shape: (152, 3)
number of X filled rows:  152 subject X_shape: (78, 69835) total X_shape: (1928, 69835)
920577
missed    47
hit       31
Name: ctl_miss_hit, dtype: int64
subject y_shape: (78, 3) total y_shape: (230, 3)
number of X filled rows:  230 subject X_shape: (78, 69835) total X_shape: (1928, 69835)
413474
hit       58
missed    20
Name: ctl_miss_hit, dtype: int64
subject y_shape: (78, 3) total y_shape: (308, 3)
number of X filled rows:  308 subject X_shape: (78, 69835) total X_shape: (1928, 69835)
396250
hit       43
missed    35
Name: ctl_miss_hit, dtype: int64
subject y_shape: (78, 3) total y_shape: (386, 3)
number of X filled rows:  386 subject X_shape: (78, 69835) total X_shape: (

number of X filled rows:  1241 subject X_shape: (78, 69835) total X_shape: (1319, 69835)
484204
hit       50
missed    28
Name: ctl_miss_hit, dtype: int64
subject y_shape: (78, 3) total y_shape: (1319, 3)
number of X filled rows:  1319 subject X_shape: (78, 69835) total X_shape: (1319, 69835)
The testing data set is built!


In [99]:

# CORRECT SOURCE VERSUS WRONG SOURCE TRIAL CLASSIFICATION

## Keep only the trials of interest (correct source or wrong source, not miss or control)

# Pre-allocate, for each set (training and test), the numpy array that will hold
# the concatenated vectorized beta maps (one row per trial; size = trials * voxels).

# 1. number of rows = sum of the correct-source and wrong-source trials of each
#    participant in the set
cs_ws_trial_counts = {}
for set_name, set_subs in (('train', cs_ws_train), ('test', cs_ws_test)):
    set_rows = 0
    for sub in set_subs:
        labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_ctl_miss_ws_cs.tsv')
        y_cs_ws_miss_ctl = pd.read_csv(labels_file, sep='\t')
        y_cs_ws_miss_ctl_labels = y_cs_ws_miss_ctl['ctl_miss_ws_cs']
        cs_ws_mask = y_cs_ws_miss_ctl_labels.isin(['correctsource', 'wrongsource'])
        y_cs_ws = y_cs_ws_miss_ctl[cs_ws_mask]
        set_rows += y_cs_ws.shape[0]
    cs_ws_trial_counts[set_name] = set_rows
numrow_train = cs_ws_trial_counts['train']
numrow_test = cs_ws_trial_counts['test']

print('number of trials in the training set: ', numrow_train,
      'number of trials in the test set: ', numrow_test)

# 2. number of columns = number of voxels kept by the group mask
#    (obtained by vectorizing the mask image with itself)
masker = NiftiMasker(mask_img=grp_mask_cw, standardize=False)
numvox = masker.fit_transform(grp_mask_cw).shape[1]

# 3. allocate the (uninitialized) arrays that the next cell fills row by row
X_cs_ws_train = np.empty(shape=(numrow_train, numvox))
X_cs_ws_test = np.empty(shape=(numrow_test, numvox))

print(X_cs_ws_train.shape, X_cs_ws_test.shape)

# 4. empty dataframes (columns only, no rows) to store trial labels, one per set
y_cs_ws_train = pd.DataFrame()
for col_position, (col_name, placeholder) in enumerate(
        [('ctl_miss_ws_cs', 'TBD'), ('dccid', 'TBD'), ('trialnum', 'NaN')]):
    y_cs_ws_train.insert(loc=col_position, column=col_name,
                         value=placeholder, allow_duplicates=True)

y_cs_ws_test = y_cs_ws_train.copy()

# 5. masker object used to vectorize the beta maps (standardize=True);
# one map per trial becomes its own row in the X data matrix
cs_ws_masker = NiftiMasker(mask_img=grp_mask_cw, standardize=True)



number of trials in the training set:  1889 number of trials in the test set:  1965


In [None]:
     
# 6. Fill the X (beta weights per voxel) and y (trial labels) data matrices

# note: nilearn.image.load_img concatenates 3D beta maps in alphabetical order
# trial numbers must be PADDED with zeros to preserve their temporal order when alphabetized

# TRAINING SET
j = 0
for sub in cs_ws_train: 
    print(sub)
    # load labels file as pandas dataframe
    labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_ctl_miss_ws_cs.tsv')
    y_cs_ws_miss_ctl = pd.read_csv(labels_file, sep='\t')
    
    # create a boolean mask to filter out trials of no interest
    y_cs_ws_miss_ctl_labels = y_cs_ws_miss_ctl['ctl_miss_ws_cs']
    cs_ws_mask = y_cs_ws_miss_ctl_labels.isin(['correctsource', 'wrongsource'])

    # apply mask to labels dataframe to keep only hit and missed trials
    y_cs_ws = y_cs_ws_miss_ctl[cs_ws_mask]
    
    # process and append y data to the set's labels dataframe
    trialnum = y_cs_ws.index
    y_cs_ws.insert(loc = y_cs_ws.shape[1], column = 'dccid', 
                           value = sub, allow_duplicates=True)
    y_cs_ws.insert(loc = y_cs_ws.shape[1], column = 'trialnum', 
                           value = trialnum+1, allow_duplicates=True)
    y_cs_ws_train = y_cs_ws_train.append(y_cs_ws, ignore_index=True)
    print(y_cs_ws.ctl_miss_ws_cs.value_counts())
    print('subject y_shape:', y_cs_ws.shape,
         'total y_shape:', y_cs_ws_train.shape)
    
    # load and concatenate beta maps
    betas = image.load_img(img=os.path.join(beta_dir, str(sub), 'TrialContrasts/betas_sub'+str(sub)+'*.nii'),
                           wildcards=True)
    # vectorize beta maps into a 2D numpy array
    sub_trials = cs_ws_masker.fit_transform(betas)
    # mask array to filter out trials of no interest
    cw_sub_trials = sub_trials[cs_ws_mask]
    
    # copy filtered vectorized values into set's X data array
    X_cs_ws_train[j:(j+cw_sub_trials.shape[0]), :] = cw_sub_trials
    j = j + cw_sub_trials.shape[0]
    print('number of X filled rows: ', j,
          'subject X_shape:', cw_sub_trials.shape,
          'total X_shape:', X_cs_ws_train.shape)

print('The training data set is built!') 


# TESTING SET
# For every test participant: read the trial labels, keep only the
# correct-source / wrong-source trials, append them to y_cs_ws_test, and copy
# the matching vectorized beta maps into the preallocated X_cs_ws_test array.
j = 0  # index of the next free row in X_cs_ws_test
for sub in cs_ws_test: 
    print(sub)
    # load labels file as pandas dataframe
    labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_ctl_miss_ws_cs.tsv')
    y_cs_ws_miss_ctl = pd.read_csv(labels_file, sep='\t')
    
    # create a boolean mask to filter out trials of no interest
    y_cs_ws_miss_ctl_labels = y_cs_ws_miss_ctl['ctl_miss_ws_cs']
    cs_ws_mask = y_cs_ws_miss_ctl_labels.isin(['correctsource', 'wrongsource'])

    # apply mask to labels dataframe to keep only correct-source and
    # wrong-source trials; take an explicit copy so the insert() calls below
    # modify an independent dataframe rather than a slice of the original
    y_cs_ws = y_cs_ws_miss_ctl[cs_ws_mask].copy()
    
    # process and append y data to the set's labels dataframe
    # (the row index of the labels file + 1 is stored as the trial number)
    trialnum = y_cs_ws.index
    y_cs_ws.insert(loc = y_cs_ws.shape[1], column = 'dccid', 
                           value = sub, allow_duplicates=True)
    y_cs_ws.insert(loc = y_cs_ws.shape[1], column = 'trialnum', 
                           value = trialnum+1, allow_duplicates=True)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    y_cs_ws_test = pd.concat([y_cs_ws_test, y_cs_ws], ignore_index=True, sort=False)
    print(y_cs_ws.ctl_miss_ws_cs.value_counts())
    print('subject y_shape:', y_cs_ws.shape,
         'total y_shape:', y_cs_ws_test.shape)
    
    # load and concatenate beta maps
    betas = image.load_img(img=os.path.join(beta_dir, str(sub), 'TrialContrasts/betas_sub'+str(sub)+'*.nii'),
                           wildcards=True)
    # vectorize beta maps into a 2D numpy array
    sub_trials = cs_ws_masker.fit_transform(betas)
    # the boolean mask is applied row-wise, so there must be exactly one
    # beta map per row of the labels file
    if sub_trials.shape[0] != cs_ws_mask.shape[0]:
        raise ValueError('subject %s: %d beta maps but %d rows in the labels file'
                         % (sub, sub_trials.shape[0], cs_ws_mask.shape[0]))
    # mask array to filter out trials of no interest
    cw_sub_trials = sub_trials[cs_ws_mask.values]
    
    # copy filtered vectorized values into set's X data array
    X_cs_ws_test[j:(j+cw_sub_trials.shape[0]), :] = cw_sub_trials
    j = j + cw_sub_trials.shape[0]
    print('number of X filled rows: ', j,
          'subject X_shape:', cw_sub_trials.shape,
          'total X_shape:', X_cs_ws_test.shape)

# X_cs_ws_test is filled in place; any row that was never written would hold
# arbitrary values and would no longer line up with the labels
if j != X_cs_ws_test.shape[0]:
    raise ValueError('X_cs_ws_test has %d rows but %d were filled'
                     % (X_cs_ws_test.shape[0], j))

print('The testing data set is built!') 

# 7. Give the label column a classification-specific name in both sets, then
#    pull that column out as the target vector handed to the model.

cs_ws_column_names = {'ctl_miss_ws_cs': 'cs_ws'}
for cs_ws_labels_df in (y_cs_ws_train, y_cs_ws_test):
    # renamed in place so the existing dataframe objects are updated
    cs_ws_labels_df.rename(columns=cs_ws_column_names, inplace=True)

y_cs_ws_labels_train, y_cs_ws_labels_test = y_cs_ws_train['cs_ws'], y_cs_ws_test['cs_ws']


In [None]:

# CORRECT SOURCE VERSUS MISS TRIAL CLASSIFICATION

## Keep only the trials of interest (correct source or miss, not wrong source or control)

# For each set (training and test), create an empty numpy array to store 
# concatenated vectorized beta maps (one row per trial; size = trials * voxels).
# 1. determine the number of rows needed (sum of trials per participants in set)

def count_cs_miss_trials(subjects):
    """Return the total number of correct-source and missed trials.

    For every participant id in `subjects`, the labels file
    'sub-<id>_ctl_miss_ws_cs.tsv' is read from `label_dir` and the rows whose
    'ctl_miss_ws_cs' value is 'correctsource' or 'missed' are counted.
    """
    n_trials = 0
    for sub in subjects:
        labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_ctl_miss_ws_cs.tsv')
        trial_labels = pd.read_csv(labels_file, sep='\t')['ctl_miss_ws_cs']
        n_trials += int(trial_labels.isin(['correctsource', 'missed']).sum())
    return n_trials

# The same selection must be used for both sets: these counts size the X
# arrays that are later filled with correct-source / missed trials only.
numrow_train = count_cs_miss_trials(cs_miss_train)
numrow_test = count_cs_miss_trials(cs_miss_test)

print('number of trials in the training set: ', numrow_train,
     'number of trials in the test set: ', numrow_test)

# 2. number of columns = number of voxels obtained when the group mask is
#    vectorized through a (non-standardizing) masker built on that same mask
masker = NiftiMasker(mask_img=grp_mask_cmiss, standardize=False)
numvox = masker.fit_transform(grp_mask_cmiss).shape[1]

# 3. preallocate the X arrays (trials x voxels); np.empty does not initialize
#    the values, every row is expected to be overwritten when the sets are filled
X_cs_miss_train = np.empty((numrow_train, numvox))
X_cs_miss_test = np.empty((numrow_test, numvox))

print(X_cs_miss_train.shape, X_cs_miss_test.shape)

# 4. empty label dataframes (one per set) holding the three expected columns
y_cs_miss_train = pd.DataFrame()
cs_miss_label_columns = [('ctl_miss_ws_cs', 'TBD'), ('dccid', 'TBD'), ('trialnum', 'NaN')]
for col_position, (col_name, placeholder) in enumerate(cs_miss_label_columns):
    y_cs_miss_train.insert(loc=col_position, column=col_name,
                           value=placeholder, allow_duplicates=True)

y_cs_miss_test = y_cs_miss_train.copy()

# 5. masker used to vectorize the beta maps (standardize=True);
#    each trial's map becomes one row of the X data matrix
cs_miss_masker = NiftiMasker(mask_img=grp_mask_cmiss, standardize=True)


In [None]:

# 6. Fill the X (beta weights per voxel) and y (trial labels) data matrices

# note: nilearn.image.load_img concatenates 3D beta maps in alphabetical order
# trial numbers must be PADDED with zeros to preserve their temporal order when alphabetized

def fill_cs_miss_set(subjects, X_data, y_data):
    """Fill one set (training or test) for the correct-source vs miss classification.

    Parameters
    ----------
    subjects : iterable
        Participant ids, used to build the labels-file and beta-map paths.
    X_data : numpy.ndarray
        Preallocated 2D array (one row per kept trial); filled in place,
        subject after subject, in the order of `subjects`.
    y_data : pandas.DataFrame
        Labels dataframe to which each subject's kept trials are appended.

    Returns
    -------
    pandas.DataFrame
        `y_data` followed by the kept trials of every subject, with the
        added 'dccid' and 'trialnum' columns and a reset index.

    Raises
    ------
    ValueError
        If a subject's number of beta maps differs from the number of rows
        in its labels file, or if `X_data` is not exactly filled.
    """
    next_row = 0  # index of the next free row in X_data
    for sub in subjects:
        print(sub)
        # load labels file as pandas dataframe
        labels_file = os.path.join(label_dir, 'sub-'+str(sub)+'_ctl_miss_ws_cs.tsv')
        y_cs_ws_miss_ctl = pd.read_csv(labels_file, sep='\t')

        # boolean mask: keep only correct-source and missed trials
        cs_miss_mask = y_cs_ws_miss_ctl['ctl_miss_ws_cs'].isin(['correctsource', 'missed'])

        # explicit copy so the insert() calls below modify an independent
        # dataframe rather than a slice of the original
        y_cs_miss = y_cs_ws_miss_ctl[cs_miss_mask].copy()

        # the row index of the labels file + 1 is stored as the trial number
        trialnum = y_cs_miss.index
        y_cs_miss.insert(loc = y_cs_miss.shape[1], column = 'dccid',
                         value = sub, allow_duplicates=True)
        y_cs_miss.insert(loc = y_cs_miss.shape[1], column = 'trialnum',
                         value = trialnum+1, allow_duplicates=True)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        y_data = pd.concat([y_data, y_cs_miss], ignore_index=True, sort=False)
        print(y_cs_miss.ctl_miss_ws_cs.value_counts())
        print('subject y_shape:', y_cs_miss.shape,
             'total y_shape:', y_data.shape)

        # load and concatenate beta maps
        betas = image.load_img(img=os.path.join(beta_dir, str(sub), 'TrialContrasts/betas_sub'+str(sub)+'*.nii'),
                               wildcards=True)
        # vectorize beta maps into a 2D numpy array
        sub_trials = cs_miss_masker.fit_transform(betas)
        # the boolean mask is applied row-wise, so there must be exactly one
        # beta map per row of the labels file
        if sub_trials.shape[0] != cs_miss_mask.shape[0]:
            raise ValueError('subject %s: %d beta maps but %d rows in the labels file'
                             % (sub, sub_trials.shape[0], cs_miss_mask.shape[0]))
        # mask array to filter out trials of no interest
        cm_sub_trials = sub_trials[cs_miss_mask.values]

        # copy filtered vectorized values into set's X data array
        X_data[next_row:(next_row+cm_sub_trials.shape[0]), :] = cm_sub_trials
        next_row = next_row + cm_sub_trials.shape[0]
        print('number of X filled rows: ', next_row,
              'subject X_shape:', cm_sub_trials.shape,
              'total X_shape:', X_data.shape)

    # X_data is filled in place; any row that was never written would hold
    # arbitrary values and would no longer line up with the labels
    if next_row != X_data.shape[0]:
        raise ValueError('X array has %d rows but %d were filled'
                         % (X_data.shape[0], next_row))
    return y_data


# TRAINING SET
y_cs_miss_train = fill_cs_miss_set(cs_miss_train, X_cs_miss_train, y_cs_miss_train)
print('The training data set is built!') 

# TESTING SET
y_cs_miss_test = fill_cs_miss_set(cs_miss_test, X_cs_miss_test, y_cs_miss_test)
print('The testing data set is built!') 

# 7. rename the labels column from the sets' y dataframe and extract it (to input model)

y_cs_miss_train.rename(columns={'ctl_miss_ws_cs': 'cs_miss'}, inplace=True)
y_cs_miss_test.rename(columns={'ctl_miss_ws_cs': 'cs_miss'}, inplace=True)

y_cs_miss_labels_train = y_cs_miss_train['cs_miss']
y_cs_miss_labels_test = y_cs_miss_test['cs_miss']



Step 5. Create and train the Support Vector Classifier model!!

Support documentation:
https://nilearn.github.io/decoding/estimator_choice.html
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm
Note: LinearSVC scales better than SVC to large datasets; it is similar (but not strictly identical) to SVC with a linear kernel
https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

Neurolibre tutorial (MAIN 2018):
https://brainhack101.github.io/introML-book/01/MAIN_tutorial_machine_learning_with_nilearn

Example with Haxby dataset (within subject):
https://nilearn.github.io/auto_examples/plot_decoding_tutorial.html#retrieve-and-load-the-fmri-data-from-the-haxby-study


In [15]:
# Inputs for the Support Vector Machine model, one (X, y) pair per set.
# Swap the four names assigned below to run another classification:
#
#   classification                   training set                              testing set
#   ENCODING vs CONTROL condition    X_enc_ctl_train, y_enco_ctl_labels_train  X_enc_ctl_test, y_enco_ctl_labels_test
#   HIT vs MISS trials               X_hit_miss_train, y_hit_miss_labels_train X_hit_miss_test, y_hit_miss_labels_test
#   CORRECT vs WRONG SOURCE trials   X_cs_ws_train, y_cs_ws_labels_train       X_cs_ws_test, y_cs_ws_labels_test
#   CORRECT SOURCE vs MISS trials    X_cs_miss_train, y_cs_miss_labels_train   X_cs_miss_test, y_cs_miss_labels_test

X_train, y_train = X_enc_ctl_train, y_enco_ctl_labels_train
X_test, y_test = X_enc_ctl_test, y_enco_ctl_labels_test

# Define the model.
# class_weight='balanced' gives the categories equivalent influence, which
# matters when the number of trials differs per condition.
# Alternative definition: SVC(kernel='linear', class_weight='balanced')
trial_svc = LinearSVC(class_weight='balanced')

print(trial_svc)




LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [16]:
# Set up 7-fold stratified cross-validation to evaluate the model's
# performance over the training set.
# NOTE(review): folds are stratified by class label only, so trials from one
# participant can fall in both the fitting and the held-out part of a fold.
# For subject-wise folds, use sklearn.model_selection.GroupKFold and pass the
# participant ids as `groups` (e.g. the 'dccid' column stored in the y
# dataframes built above) -- class labels are not valid groups.
n_folds = 7
cv_splitter = sklearn.model_selection.StratifiedKFold(n_splits=n_folds)

# predict: each trial is predicted by the model fitted on the other folds
y_pred = cross_val_predict(trial_svc, X_train, y_train, cv=cv_splitter)

# scores: per-fold accuracy computed from those same predictions, so the
# model is fitted once per fold instead of twice (the splitter does not
# shuffle, hence split() yields the folds used by cross_val_predict)
y_train_values = np.asarray(y_train)
acc = np.array([accuracy_score(y_true=y_train_values[fold_idx], y_pred=y_pred[fold_idx])
                for _, fold_idx in cv_splitter.split(X_train, y_train)])

#Look at accuracy of prediction for each fold of the cross-validation
for i in range(n_folds):
    print('Fold %s -- Acc = %s'%(i, acc[i]))



KeyboardInterrupt: 

In [None]:

#look at the overall accuracy of the model (over training data set)

# Class names are read from the data (sorted) and passed explicitly to
# confusion_matrix, so rows/columns are labelled correctly whichever
# classification was selected for X_train / y_train.
class_names = sorted(set(y_train) | set(y_pred))

overall_acc = accuracy_score(y_pred = y_pred, y_true = y_train)
overall_cr = classification_report(y_pred = y_pred, y_true = y_train)
overall_cm = confusion_matrix(y_pred = y_pred, y_true = y_train, labels = class_names)
print('Accuracy: ',overall_acc)
print(overall_cr)

# rows = observed class, columns = predicted class
cmdf = pd.DataFrame(overall_cm, index = class_names, columns = class_names)
fig, ax = plt.subplots()
# annot/fmt write the integer counts in each cell with a contrasting colour
sns.heatmap(cmdf, cmap='RdBu_r', annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Observed')


