In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import feature_spaces
import pandas as pd
from sklearn.preprocessing import StandardScaler
from typing import List
from matplotlib import pyplot as plt
from os.path import join
import numpy as np
import encoding_utils, feature_spaces
import pickle as pkl
from collections import defaultdict
from datasets import Dataset, DatasetDict
from feature_spaces import *
# Number of voxels kept for the exported dataset: used to slice the sorted
# correlation indices and to generate one response column per voxel
NUM_VOXELS = 250

# Save dataset in standard format (csv)
- 'text': the most recent words heard up to each time point (at most 30, the default `max_running_words`), used as input text
- 'voxel_0'...'voxel_249': regression response for each of the 250 selected voxels
- 'corr_test': correlation between predicted and actual response

In [2]:
# Fetch the narrative story splits, then the word sequence of every story
story_splits = encoding_utils.get_allstories(list(range(1, 6)))
train_stories, test_stories, allstories = story_splits
wordseqs = feature_spaces.get_story_wordseqs(allstories)

In [77]:
def get_words_for_story(wordseq, max_running_words=30) -> List[str]:
    """Build the running text context that precedes each retained TR of a story.

    For every retained TR time, all words whose timestamp is at or before that
    time are collected, and the most recent `max_running_words` of them are
    joined with single spaces.

    Parameters
    ----------
    wordseq
        Object exposing `data` (the story's words), `data_times` (one timestamp
        per word) and `tr_times` (one timestamp per TR).
    max_running_words : int
        Maximum number of trailing words kept per TR. Zero yields empty strings.

    Returns
    -------
    List[str]
        One string per retained TR, in TR order.

    Raises
    ------
    ValueError
        If `max_running_words` is negative.
    """
    if max_running_words < 0:
        raise ValueError(f'max_running_words must be >= 0, got {max_running_words}')

    # The first 5 + TRIM TRs and the last TRIM TRs are dropped.
    # NOTE(review): presumably this mirrors the trimming applied to the
    # responses loaded via encoding_utils.get_response -- confirm.
    TRIM = 5
    words = np.array(wordseq.data)
    word_times = np.array(wordseq.data_times)
    tr_times = wordseq.tr_times[5 + TRIM: -TRIM]

    running_words = []
    for tr_time in tr_times:
        heard_so_far = words[word_times <= tr_time]
        # A plain `[-0:]` slice would return *every* word, so a zero-length
        # context has to be handled explicitly.
        if max_running_words:
            recent = heard_so_far[-max_running_words:]
        else:
            recent = heard_so_far[:0]
        running_words.append(' '.join(recent))
    return running_words

# Build the per-TR text contexts for every story (one list per story), then
# flatten across stories into a single list of strings per split.
# A nested comprehension is used for flattening because `sum(lists, [])`
# re-copies the accumulated list on every addition (quadratic in total length).
texts_list_train = [
    get_words_for_story(wordseqs[story_name]) for story_name in tqdm(train_stories)
]
texts_train = [text for story_texts in texts_list_train for text in story_texts]

texts_list_test = [
    get_words_for_story(wordseqs[story_name]) for story_name in tqdm(test_stories)
]
texts_test = [text for story_texts in texts_list_test for text in story_texts]

100%|██████████| 26/26 [00:00<00:00, 170.47it/s]
100%|██████████| 1/1 [00:00<00:00, 203.65it/s]


In [4]:
subj = 'UTS03'

# select top_idxs
# The results directory is derived from `subj` so the correlations cannot be
# loaded for a different subject than the one used for the responses.
results_root = '/home/chansingh/mntv1/deep-fMRI/results/encoding/bert-10__ndel=4'
save_dir = join(results_root, subj)
corrs_val = np.load(join(save_dir, 'corrs.npz'))['arr_0']
# argsort is ascending, so reverse it and keep the first NUM_VOXELS indices.
# NOTE(review): argsort places NaNs last, so after the reversal any NaN entries
# in corrs_val would be selected first -- confirm corrs.npz contains none.
top_idxs = np.argsort(corrs_val)[::-1][:NUM_VOXELS]
print(corrs_val[top_idxs][:5])

[0.76296981 0.74258237 0.72107898 0.71616266 0.71508206]


In [None]:
# Fetch the response matrix (n_time_points x n_voxels) of each split
resp_train, resp_test = (
    encoding_utils.get_response(stories, subj)
    for stories in (train_stories, test_stories)
)
print(f"{resp_train.shape=}, {resp_test.shape=}")

# Keep only the columns belonging to the top voxels
resp_train_voxel, resp_test_voxel = resp_train[:, top_idxs], resp_test[:, top_idxs]
print(f"{resp_train_voxel.shape=}, {resp_test_voxel.shape=}")

[0.76296981 0.74258237 0.72107898 0.71616266 0.71508206]
resp_train_voxel.shape=(9461, 250), resp_test_voxel.shape=(291, 250)


In [91]:
# Standardize every voxel column. A fresh scaler is fit on each split, so the
# test split is scaled with its own statistics rather than the training ones.
resp_train_voxel, resp_test_voxel = (
    StandardScaler().fit_transform(split_resp)
    for split_resp in (resp_train_voxel, resp_test_voxel)
)

In [94]:
# Column dictionaries for each split: the text first, then one response
# column per selected voxel (voxel_0 ... voxel_{NUM_VOXELS - 1})
df_train = {'text': texts_train}
df_test = {'text': texts_test}
for split_columns, split_resp in (
    (df_train, resp_train_voxel),
    (df_test, resp_test_voxel),
):
    split_columns.update(
        {f'voxel_{i}': split_resp[:, i] for i in range(NUM_VOXELS)}
    )

In [None]:
# Turn each split's column dict into a DataFrame, convert it to a Dataset,
# and bundle both under their split names
dset_train, dset_test = (
    Dataset.from_pandas(pd.DataFrame.from_dict(split_columns))
    for split_columns in (df_train, df_test)
)
ds = DatasetDict({'train': dset_train, 'test': dset_test})
print(ds)

In [None]:
# Upload the DatasetDict (train and test splits) to the Hub repo named below.
# NOTE(review): presumably requires prior Hugging Face authentication -- confirm
ds.push_to_hub('csinva/fmri_language_responses')

# Visualize top voxels and which ROIs they are in

In [5]:
import cortex
# The module name starts with a digit, so it is not a valid identifier for an
# `import` statement; load it by name through __import__ instead
viz_cortex = __import__('03_viz_cortex')

In [None]:
# visualize after masking anything that isn't in the top_idxs
# Build the masked map in a fresh array: plain assignment (`corrs = corrs_val`)
# only aliases the array, so zeroing entries would silently overwrite corrs_val
# for every other cell. Fancy indexing also replaces the per-voxel Python loop.
corrs = np.zeros_like(corrs_val)
corrs[top_idxs] = corrs_val[top_idxs]
viz_cortex.quickshow(corrs)

In [None]:
subject = "UTS03"
xfm = "UTS03_auto"

# Get the map of which voxels are inside of our ROI
index_volume, index_keys = cortex.utils.get_roi_masks(
    subject, xfm,
    roi_list=None, # Default (None) gives all available ROIs in overlays.svg
    gm_sampler='cortical-conservative', # Select only voxels mostly within cortex
    split_lr=True, # Separate left/right ROIs (this occurs anyway with index volumes)
    threshold=0.9, # convert probability values to boolean mask for each ROI
    return_dict=False # return index volume, not dict of masks
)

# Count how many of the top_voxels are in each ROI
# NOTE(review): get_roi_verts is named for vertices while top_idxs indexes the
# correlation array -- confirm both live in the same index space.
roi_counter = defaultdict(list)
for k in tqdm(index_keys):
    # Reuse `subject` instead of repeating the literal id, so this lookup
    # always matches the subject passed to get_roi_masks above
    roi_verts = cortex.get_roi_verts(subject, k)
    for val in roi_verts.values():
        # Vectorized membership test; keeps the order of top_idxs
        hits = top_idxs[np.isin(top_idxs, val)]
        # Only touch the defaultdict on a match so ROIs without any top voxel
        # stay absent from the counts
        if len(hits):
            roi_counter[k].extend(hits)
counts = {k: len(v) for k, v in roi_counter.items()}
sorted(counts.items(), key=lambda s: s[1], reverse=True)