As a comparison to the subject-to-subject fitting, try using subject X trial 1 (for the 1000 shared images) to predict subject X trial 2, etc. (this should hopefully be similar to the split-half reliability).

In [None]:
# Note on this notebook: the within-subject regression did not work (i.e. regression performance was much lower than expected), and it is not yet clear why.
# The notebook was used instead to save out the split-half values organized by ROI.

In [9]:
import argparse
import pandas as pd
import numpy as np
import scipy.io
import nibabel.freesurfer.mghformat as mgh
import h5py
import os
from sklearn.cross_decomposition import PLSRegression
import scipy.stats as stats
import pickle

In [2]:
# Root of the NSD project tree; every path constant below is built from it.
oakzfs_stem = '/sni-storage/kalanit/Projects/Dawn/NSD/'

# Each constant ends in '/', so file names are appended by plain string concatenation.
# Local utility modules (this directory is appended to sys.path).
UTILS_PATH = oakzfs_stem + 'code/fit_pipeline/utils/'
# Organized beta .mat files (read by get_indices).
BETA_PATH = oakzfs_stem + 'local_data/processed/organized_betas/'
# Not referenced in the cells visible here.
STIM_PATH = oakzfs_stem + 'data/nsddata_stimuli/stimuli/nsd/'
# Behavioural responses and the experiment design file (read by get_indices).
NSDDATA_PATH = oakzfs_stem + 'data/nsddata/'
# Per-subject split-half .mat files and streams .mgz files.
FS_PATH = oakzfs_stem + 'local_data/freesurfer/'
# Not referenced in the cells visible here.
FEATS_PATH = oakzfs_stem + 'code/fit_pipeline/models/features/'
# Destination of the pickled ROI-organized split-half values.
RESULTS_PATH = oakzfs_stem + 'results/'

In [3]:
import sys
sys.path.append(UTILS_PATH)

from rsm_utils import get_ROI_data
import regression_utils as rutils

In [4]:
# how many times each image was shown
# (get_indices keeps only image IDs whose presentation count equals this value)
N_REPEATS = 3

# layer names for different models (not referenced in the cells visible here)
ALEXNET_LAYERS = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5', 'fc6', 'fc7']

In [5]:
# Analysis parameters.
# Subject ID string; concatenated into file paths as 'subj' + subj.
subj = "02"
# Hemisphere prefix used in per-subject file names; reassigned to "lh" near the end of the notebook.
hemi = "rh"
# The next three values are not referenced in the cells visible here.
roi = "streams_shrink10"
sub_roi = "ventral"
# presumably the label value of the ventral ROI — TODO confirm
sub_roix = 5

In [6]:
def get_indices(subj):
    """Load trial-ordering information for one subject.

    Reads three files: the organized-beta .mat file under BETA_PATH, and the
    subject's behavioural responses.tsv plus the NSD experiment-design .mat
    file, both under NSDDATA_PATH.

    Parameters
    ----------
    subj : str
        Subject ID as it appears in the file names (e.g. "02").

    Returns
    -------
    beta_order_in_73Kids : numpy.ndarray
        The '73KID' entries of responses.tsv picked out by the 'allixs'
        indices of the beta file, with 1 subtracted from the result.
    validation_mask : numpy.ndarray of bool
        One entry per sorted unique 73KID that occurs exactly N_REPEATS
        times; True where that ID is also listed in 'sharedix'.
    """
    order = scipy.io.loadmat(BETA_PATH + 'datab3nativesurface_subj' + subj)
    data = pd.read_csv(NSDDATA_PATH + 'ppdata/subj' + subj + '/behav/responses.tsv', sep='\t')
    expdesign = scipy.io.loadmat(NSDDATA_PATH + 'experiments/nsd/nsd_expdesign.mat')

    #73KIDs
    all_ids = np.array(data['73KID'])
    # Only the unique values and their counts are needed here.
    vals, count = np.unique(all_ids, return_counts=True)
    which_reps = vals[count == N_REPEATS]
    mask_3reps = np.isin(all_ids, which_reps)
    rep_vals = np.unique(all_ids[mask_3reps]) #sorted version of beta order

    #how the betas are ordered (using COCO 73K id numbers)
    # 'allixs' values are shifted by -1 before indexing, and the looked-up IDs are shifted by -1 again
    beta_order_in_73Kids = all_ids[order['allixs'][0]-1]-1 #-1 to convert from matlab to python indexing

    # shared (i.e. validation) IDS (but include all potential shared reps for the subj, not min across subjs)
    sharedix = expdesign['sharedix'][0]
    validation_mask = np.isin(rep_vals, sharedix)

    return beta_order_in_73Kids, validation_mask

In [10]:
# Unpickle the pre-computed right-hemisphere betas.
# NOTE: pickle.load can execute arbitrary code, so only point this at trusted files.
betas_file = oakzfs_stem + 'local_data/processed/' + 'rh_betas_by_repeat_by_ROI_zscore_1000.data'
with open(betas_file, 'rb') as filehandle:
    rh_betas_by_repeat_by_ROI = pickle.load(filehandle)

In [11]:
subj02_rh_betas = rh_betas_by_repeat_by_ROI[1] # entry 1 is subj02; per the original note the pickle only holds subjects 1, 2, 5 and 7

In [85]:
# Split-half .mat file for the current subject; the 'rh' prefix is hard-coded here rather than taken from `hemi`.
split_half =  scipy.io.loadmat(FS_PATH + 'subj' + subj + '/rh_split_half.mat')

In [86]:
split_half['mean'].shape

(1, 239309)

In [87]:
#let's just load and reorganize all subjects split-half calcs by ROI while we're at it
subjid = ['01', '02', '03', '04', '05', '06', '07', '08']
ROI_names = ['Unknown', 'Early', 'Midventral', 'Midlateral', 'Midparietal', 'Ventral', 'Lateral', 'Parietal']


def load_split_half_by_ROI(hemi, subjids, n_rois):
    """Load split-half values and stream labels for one hemisphere and group them by ROI.

    Parameters
    ----------
    hemi : str
        Hemisphere prefix used in the file names (e.g. "rh" or "lh").
    subjids : list of str
        Subject ID strings (e.g. "01").
    n_rois : int
        Number of ROI labels to extract; label values 1..n_rois are used.

    Returns
    -------
    split_half_by_subj : list
        Per subject, the 'mean' array from <hemi>_split_half.mat.
    streams : list
        Per subject, the [:, 0, 0] slice of the data in <hemi>.streams.mgz.
    split_half_by_ROI : list of list
        split_half_by_ROI[s][r] holds the columns of subject s's split-half
        array whose stream label equals r + 1.
    """
    # Loading and reorganizing stay in separate passes; the original note says
    # combining them ran out of memory.
    split_half_by_subj = []
    for sid in subjids:
        sh = scipy.io.loadmat(FS_PATH + 'subj' + sid + '/' + hemi + '_split_half.mat')
        split_half_by_subj.append(sh['mean'])

    streams = []
    for sid in subjids:
        # FS_PATH already ends in '/', so no extra separator is needed before 'subj'
        mgh_file = mgh.load(FS_PATH + 'subj' + sid + '/' + hemi + '.streams.mgz')
        streams.append(mgh_file.get_fdata()[:,0,0])

    #organize split-half by ROI
    split_half_by_ROI = [
        [split_half_by_subj[sidx][:,streams[sidx] == roi_idx+1] for roi_idx in range(n_rois)]
        for sidx in range(len(subjids))
    ]
    return split_half_by_subj, streams, split_half_by_ROI


# ROI_names[0] ('Unknown') is skipped, hence len(ROI_names)-1 ROIs per subject.
split_half_by_subj, streams, split_half_by_ROI = load_split_half_by_ROI(hemi, subjid, len(ROI_names)-1)


In [194]:
# save for future use
# Persist the ROI-organized split-half values (binary pickle) for later notebooks.
rh_out_file = RESULTS_PATH + 'rh_split_half_by_ROI.data'
with open(rh_out_file, 'wb') as filehandle:
    pickle.dump(split_half_by_ROI, filehandle)

In [195]:
# Keep a reference under an rh-specific name; split_half_by_ROI is rebuilt for "lh" later in the notebook.
rh_split_half_by_ROI = split_half_by_ROI

In [88]:
split_half_by_subj[1].shape

(1, 239309)

In [89]:
# Entry 1 corresponds to subjid[1] == '02'.
subj02_sh_by_ROI = split_half_by_ROI[1] #for this analysis

In [115]:
# Boolean masks of entries whose split-half value exceeds 0.1.
# List positions 4, 5, 6 hold stream labels 5, 6, 7, i.e. ROI_names 'Ventral', 'Lateral', 'Parietal'.
ventral_mask, lateral_mask, parietal_mask = [
    subj02_sh_by_ROI[roi_pos] > .1 for roi_pos in (4, 5, 6)
]

In [126]:
# Stack the three repeats of the ventral betas, keeping only the columns selected by ventral_mask.
keep_cols = ventral_mask[0]
ventral_trim = np.zeros((N_REPEATS, 1000, np.sum(keep_cols)))
for rep in (0, 1, 2):
    ventral_trim[rep,:,:] = subj02_rh_betas[4][rep][:,keep_cols]

In [127]:
ventral_trim.shape

(3, 1000, 9051)

In [128]:
#all splits for regression training
# 200 test + 800 train items add up to the 1000 entries along axis 1 of ventral_trim.
# split_index=1 presumably tells the helper to split along that axis — TODO confirm in regression_utils.
num_splits = 1 # NOTE(review): this comment used to say "average over two random splits", but only one split is requested
all_splits = rutils.get_splits(data=ventral_trim,
                               split_index=1,
                               num_splits=num_splits,
                               num_per_class_test = 200,
                               num_per_class_train = 800)

In [183]:
# Repeat pairings: pairing k uses repeat t1[k] as features and repeat t2[k] as labels.
# t1[k] == k, so the loop index itself selects the feature repeat.
t1 = [0, 1, 2]
t2 = [1, 2, 0]

all_resdict = {}
for s in range(N_REPEATS):
    print('evaluating %s' % s)

    feats = ventral_trim[s,:,:]
    targets = ventral_trim[t2[s],:,:]
    # Build the argument dict anew on every pass, as the original call did.
    # n_components=5 carries the original note "to match"; what it matches is not stated here.
    pls_kwargs = dict(n_components=5, scale=False)

    res = rutils.train_and_test_scikit_regressor(features=feats,
                                                 labels=targets,
                                                 splits=all_splits,
                                                 model_class=PLSRegression,
                                                 model_args=pls_kwargs,
                                                 feature_norm=False)
    all_resdict[s] = res

evaluating 0
evaluating 1
evaluating 2


In [184]:
# Collect the test-set 'mean_rsquared_array' of every repeat pairing, keyed by pairing index.
rsquared_array = {}
for s in range(N_REPEATS):
    test_results = all_resdict[s]['test']
    rsquared_array[s] = test_results['mean_rsquared_array']

In [187]:
np.mean(rsquared_array[2])

0.10028541319595886

In [140]:
# Shape of the label matrix for the pairing at index s (s keeps the last value from the loops above).
# t2 is the label-repeat list defined with the regression loop.
ventral_trim[t2[s],:,:].shape

(1000, 9051)

In [143]:
np.mean(subj02_sh_by_ROI[4][ventral_mask])

0.224239060532011

In [145]:
# Which repeat serves as the label set for pairing s (t2 is the label-repeat list defined with the regression loop).
t2[s]

0

In [147]:
n_vox = np.sum(ventral_mask[0])

#calculate split-half reliability
# One column per repeat pairing (t1[r], t2[r]); each entry is the Pearson r between
# the two repeats' values for a single column of ventral_trim.
corrvals = np.zeros((n_vox,3))
for r, (rep_a, rep_b) in enumerate(zip(t1, t2)):
    for vox in range(n_vox):
        corrvals[vox, r] = stats.pearsonr(ventral_trim[rep_a,:,vox],
                                          ventral_trim[rep_b,:,vox])[0]

In [151]:
# Average the pairwise correlations across the three repeat pairings (one value per row of corrvals).
subset_sh = corrvals.mean(axis=1)

In [152]:
subset_sh

array([0.15812837, 0.21005189, 0.2294373 , ..., 0.10776668, 0.0785798 ,
       0.08121873])

In [153]:
subj02_sh_by_ROI[4][ventral_mask]

array([0.17599837, 0.20001384, 0.20306887, ..., 0.11013285, 0.10445924,
       0.1058643 ])

In [154]:
stats.pearsonr(subset_sh,subj02_sh_by_ROI[4][ventral_mask])[0] #split half calc using all images vs shared 1000

0.9802131299290657

In [190]:
rsquared_array[2]

array([-0.00654457,  0.05105688,  0.02149402, ...,  0.0902879 ,
        0.10893416,  0.08424819])

In [191]:
# Show the kernel's working directory using the already-imported os module,
# so the cell does not depend on IPython's automagic `pwd`.
os.getcwd()

'/share/kalanit/biac2/kgs/projects/Dawn/NSD/code/fit_pipeline/notebooks'

In [196]:
# Rebuild the same structures for the left hemisphere; this overwrites
# split_half_by_subj, streams and split_half_by_ROI.
hemi = "lh"

# Loading and reorganizing stay in separate passes; the original note says
# combining them ran out of memory.
split_half_by_subj = [
    scipy.io.loadmat(FS_PATH + 'subj' + sid + '/' + hemi + '_split_half.mat')['mean']
    for sid in subjid
]

streams = [
    mgh.load(FS_PATH + '/subj' + sid + '/' + hemi + '.streams.mgz').get_fdata()[:,0,0]
    for sid in subjid
]

#organize split-half by ROI
# ROI_names[0] ('Unknown') is skipped: list position k holds the columns with stream label k+1.
n_rois = len(ROI_names)-1
split_half_by_ROI = [
    [split_half_by_subj[sidx][:,streams[sidx] == roi_idx+1] for roi_idx in range(n_rois)]
    for sidx in range(len(subjid))
]

In [197]:
# save for future use
# Persist the ROI-organized split-half values (binary pickle) for later notebooks.
lh_out_file = RESULTS_PATH + 'lh_split_half_by_ROI.data'
with open(lh_out_file, 'wb') as filehandle:
    pickle.dump(split_half_by_ROI, filehandle)