# Loosely to Very Relevant Preprocessing

In [19]:
# packages i might use
import os
from sklearn.linear_model import LogisticRegression
import numpy as np
from nilearn import image
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from scipy import stats
import pandas as pd

In [16]:
## get list of subject directories
proj_dir = '/home/jefan/sketchloop02/'
contents_dir = os.listdir(proj_dir)

# Subjects left out of the analysis; the notebook only records that there
# was an "issue" with each of them.
excluded_subjects = {'1207161_neurosketch', '1201161_neurosketch'}

# A subject folder is any entry whose second '_'-separated token is
# 'neurosketch'. Entries without an underscore are skipped explicitly
# (instead of relying on a bare `except` to swallow the IndexError, which
# would also hide unrelated errors).
sub_dirs = sorted(
    entry for entry in contents_dir
    if len(entry.split('_')) > 1
    and entry.split('_')[1] == 'neurosketch'
    and entry not in excluded_subjects
)

print(sub_dirs)
print(str(len(sub_dirs)) + ' subjects')

['0110171_neurosketch', '0110172_neurosketch', '0111171_neurosketch', '0112171_neurosketch', '0112172_neurosketch', '0112173_neurosketch', '0113171_neurosketch', '0115172_neurosketch', '0115174_neurosketch', '0117171_neurosketch', '0118171_neurosketch', '0118172_neurosketch', '0119171_neurosketch', '0119172_neurosketch', '0119173_neurosketch', '0119174_neurosketch', '0120171_neurosketch', '0120172_neurosketch', '0120173_neurosketch', '0123171_neurosketch', '0123173_neurosketch', '0124171_neurosketch', '0125171_neurosketch', '0125172_neurosketch', '1121161_neurosketch', '1130161_neurosketch', '1202161_neurosketch', '1203161_neurosketch', '1206161_neurosketch', '1206162_neurosketch', '1206163_neurosketch', '1207162_neurosketch']
32 subjects


In [17]:
# analysis helper functions i might use
def get_mask_array(mask_path):
    """Load a mask image and count the voxels it selects.

    Parameters
    ----------
    mask_path : str
        Path to the mask image (e.g. a .nii.gz file), passed to
        ``nilearn.image.load_img``.

    Returns
    -------
    mask_data : numpy.ndarray
        The mask's voxel array as stored in the image.
    num_brain_voxels : int
        Number of voxels whose value equals 1.
    """
    mask_img = image.load_img(mask_path)
    # `get_data()` is deprecated in nibabel; reading `dataobj` through
    # np.asanyarray is its documented drop-in equivalent.
    mask_data = np.asanyarray(mask_img.dataobj)
    # Single vectorised count; works for any dimensionality, unlike the
    # nested builtin sum() which only reduces a 3-D array to a scalar.
    num_brain_voxels = int(np.count_nonzero(mask_data == 1))
    return mask_data, num_brain_voxels

def load_roi_mask_combined(subj,run_num,roi):
    """Load the binarized, phase-combined functional ROI mask for a subject.

    Runs are grouped in pairs (1-2, 3-4, 5-6); the mask file for a run is the
    one whose name carries the pair suffix '12', '34' or '56'.

    Parameters
    ----------
    subj : str
        Subject directory name under ``proj_dir``.
    run_num : int
        Run number, 1 through 6.
    roi : str
        ROI name used as the mask file prefix (e.g. 'occitemp').

    Returns
    -------
    numpy.ndarray
        The mask voxel array returned by ``get_mask_array``.

    Raises
    ------
    ValueError
        If ``run_num`` is not one of 1-6 (previously this surfaced as an
        UnboundLocalError on ``phase_num``).
    """
    phase_by_run = {1: '12', 2: '12', 3: '34', 4: '34', 5: '56', 6: '56'}
    if run_num not in phase_by_run:
        raise ValueError('run_num must be one of 1-6, got {!r}'.format(run_num))
    phase_num = phase_by_run[run_num]

    mask_name = roi + '_func_combined_' + phase_num + '_binarized.nii.gz'
    # os.path.join avoids the doubled '/' that plain concatenation produced
    # when proj_dir already ends with a slash.
    mask_path = os.path.join(proj_dir, subj, 'analysis', 'firstlevel', 'rois', mask_name)
    mask_data, _ = get_mask_array(mask_path)
    return mask_data

# Collect Feature Set

For each subject, we want a 160 x num_voxels feature array (160 trials = 2 recognition runs x 80 trials each), saved as a `.npy` file and restricted to the `occitemp` ROI.

In [21]:
ROIs = ['occitemp']

# Where the per-category onset files live and where outputs are written.
TIMEPOINTS_DIR = '/home/jgunn/neurosketch/timepoints/'
MATRICES_DIR = '/home/jgunn/neurosketch/matrices/'

CATEGORIES = ['bed', 'bench', 'chair', 'table']
TR_SECONDS = 1.5  # onset times are divided by this to obtain a TR index
N_TRS = 240       # length of the per-run onset vector
TR_SHIFT = 3      # volume used is this many TRs after onset (presumably hemodynamic lag -- confirm)

for roi in ROIs:
    print(roi)

    for subject in sub_dirs:
        print(subject)
        subject_id = subject[:7]

        # Per-trial metadata is appended alongside each feature vector so the
        # columns always stay aligned, whatever the number of trials per run.
        label = []
        run_num = []
        TR_num = []
        features = []

        roi_mask = load_roi_mask_combined(subject, 1, roi)
        voxel_selector = roi_mask == 1
        for run in [1, 2]:

            # load subject's time series for this run, time axis first
            func_path = (proj_dir + subject + '/analysis/firstlevel/preproc_recognition_run_' +
                         str(run) + '.feat/filtered_func_data.nii.gz')
            # np.asanyarray(img.dataobj) is the documented replacement for
            # nibabel's deprecated get_data()
            timeseries = np.asanyarray(image.load_img(func_path).dataobj).transpose((3, 0, 1, 2))

            # category shown at each onset TR (None where no image starts);
            # if two categories map to the same TR the later one in
            # CATEGORIES wins, as before
            category_at_tr = [None] * N_TRS
            for cope in CATEGORIES:
                onset_path = TIMEPOINTS_DIR + subject_id + '_' + str(run) + '_' + cope + '.txt'
                with open(onset_path) as f:
                    for line in f:
                        # skip blank lines; iterating lines also keeps the
                        # final onset when the file lacks a trailing newline
                        if not line.strip():
                            continue
                        onset_seconds = float(line.split()[0])
                        category_at_tr[int(onset_seconds / TR_SECONDS)] = cope

            # one feature vector (masked volume) plus metadata per trial;
            # TR_num records the onset TR, not the shifted TR that was read
            for tr, cope in enumerate(category_at_tr):
                if cope is None:
                    continue
                features.append(timeseries[tr + TR_SHIFT][voxel_selector])
                label.append(cope)
                run_num.append(run)
                TR_num.append(float(tr))

        # NOTE(review): stacking on axis=1 yields (num_voxels, n_trials), i.e.
        # the transpose of the trials-by-voxels layout the notebook text asks
        # for. Left unchanged because readers of the saved .npy may depend on
        # it -- confirm which orientation is intended.
        features = np.stack(features, axis=1)
        np.save(file=MATRICES_DIR + subject_id + '_featurematrix.npy', arr=features)

        subj = [subject_id] * len(label)
        x = pd.DataFrame(
            {'subj': subj, 'label': label, 'run_num': run_num, 'TR_num': TR_num},
            columns=['subj', 'label', 'run_num', 'TR_num'],
        )
        x.to_csv(MATRICES_DIR + 'metadata_{}.csv'.format(subject_id)) # in a notebook, you can look at x by evaluating it by itself in its own cell.

occitemp
0110171_neurosketch
0110172_neurosketch
0111171_neurosketch
0112171_neurosketch
0112172_neurosketch
0112173_neurosketch
0113171_neurosketch
0115172_neurosketch
0115174_neurosketch
0117171_neurosketch
0118171_neurosketch
0118172_neurosketch
0119171_neurosketch
0119172_neurosketch
0119173_neurosketch
0119174_neurosketch
0120171_neurosketch
0120172_neurosketch
0120173_neurosketch
0123171_neurosketch
0123173_neurosketch
0124171_neurosketch
0125171_neurosketch
0125172_neurosketch
1121161_neurosketch
1130161_neurosketch
1202161_neurosketch
1203161_neurosketch
1206161_neurosketch
1206162_neurosketch
1206163_neurosketch
1207162_neurosketch


In [None]:
# Train a logistic-regression classifier on run 1 and test it on run 2,
# separately for each subject and each ROI mask. Every trial contributes a
# single masked volume, taken TR_SHIFT TRs after the trial's onset TR.
# The score for an ROI is the accuracy pooled over all subjects' run-2 trials.
ROIs = ['V1','fusiform','IT','LOC', 'occitemp']

TIMEPOINTS_DIR = '/home/jgunn/neurosketch/timepoints/'
CATEGORIES = ['bed', 'bench', 'chair', 'table']
TR_SECONDS = 1.5  # onset times are divided by this to obtain a TR index
N_TRS = 240       # length of the per-run onset vector
TR_SHIFT = 3      # volume used is this many TRs after onset (presumably hemodynamic lag -- confirm)

roiscores = []

for roi in ROIs:
    print(roi)
    actual = []
    predicted = []
    for subject in sub_dirs:
        print(subject)
        trainingX = []
        trainingY = []
        testX = []
        testY = []
        roi_mask = load_roi_mask_combined(subject, 1, roi)
        voxel_selector = roi_mask == 1
        for run in [1, 2]:

            # load subject's time series for this run, time axis first
            func_path = (proj_dir + subject + '/analysis/firstlevel/preproc_recognition_run_' +
                         str(run) + '.feat/filtered_func_data.nii.gz')
            # np.asanyarray(img.dataobj) is the documented replacement for
            # nibabel's deprecated get_data()
            timeseries = np.asanyarray(image.load_img(func_path).dataobj).transpose((3, 0, 1, 2))
            # (z-scoring the time series along axis 0 was tried here and left disabled)

            # hasImage[tr] is 0 when no image starts at that TR, otherwise the
            # 1-based index of the category in CATEGORIES
            hasImage = [0] * N_TRS
            for code, cope in enumerate(CATEGORIES, 1):
                onset_path = TIMEPOINTS_DIR + subject[:7] + '_' + str(run) + '_' + cope + '.txt'
                with open(onset_path) as f:
                    for line in f:
                        # skip blank lines; iterating lines also keeps the
                        # final onset when the file lacks a trailing newline
                        if not line.strip():
                            continue
                        onset_seconds = float(line.split()[0])
                        hasImage[int(onset_seconds / TR_SECONDS)] = code

            # wherever hasImage, take the shifted volume restricted to the ROI;
            # class labels are 0-based (code - 1)
            for i, has in enumerate(hasImage):
                if has > 0:
                    volume = timeseries[i + TR_SHIFT][voxel_selector]
                    if run == 1:
                        trainingX.append(volume)
                        trainingY.append(has - 1)
                    else:
                        testX.append(volume)
                        testY.append(has - 1)

        lin_clf = LogisticRegression()
        lin_clf.fit(trainingX, trainingY)
        predicted.extend(lin_clf.predict(testX).tolist())
        actual.extend(testY)
    roiscores.append(np.mean(np.equal(actual, predicted)))

roiscores