In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import feature_spaces
import pandas as pd
from sklearn.preprocessing import StandardScaler
from typing import List
from matplotlib import pyplot as plt
from os.path import join
import numpy as np
import encoding_utils, feature_spaces
import pickle as pkl
from collections import defaultdict
from datasets import Dataset, DatasetDict
from feature_spaces import *
# Number of voxels kept for the exported dataset: used below to slice the
# correlation-ranked voxel indices and to create one response column per voxel.
NUM_VOXELS = 250

# Save dataset in standard format (csv)
- 'text': Last 30 words as input text
- 'voxel_0'...'voxel_249': regression response for each voxel
- 'corr_test': correlation between predicted and actual response

In [None]:
# Look at narrative stories
# get_allstories returns three values, unpacked here as the train stories,
# the test stories and all stories.
# NOTE(review): [1, 2, 3, 4, 5] is presumably a list of session numbers — confirm
# against encoding_utils.get_allstories.
train_stories, test_stories, allstories = encoding_utils.get_allstories([1, 2, 3, 4, 5])
# wordseqs is indexed by story name further down; each entry is read through its
# .data, .data_times and .tr_times attributes.
wordseqs = feature_spaces.get_story_wordseqs(allstories)

In [None]:
def get_words_for_story(wordseq, num_words: int = 30, trim: int = 5, skip_start: int = 5) -> List[str]:
    """Build one running-text string per retained TR of a story.

    For every retained TR, the text is made of the last ``num_words`` words whose
    time stamp is at or before a reference time. The reference time is the time
    of the *previous* retained TR (the first retained TR uses its own time), so
    each text lags its TR by one step.

    Parameters
    ----------
    wordseq
        Object exposing ``data`` (the words), ``data_times`` (one time stamp per
        word) and ``tr_times`` (one time stamp per TR).
    num_words
        Maximum number of most recent words joined into each text.
    trim
        Number of TRs dropped at both the start and the end of the story.
    skip_start
        Additional number of TRs dropped at the start of the story.
        NOTE(review): the ``skip_start + trim`` / ``trim`` cut presumably mirrors
        the trimming applied to the fMRI responses — confirm against the
        response-loading code.

    Returns
    -------
    List[str]
        One space-joined string per retained TR, in TR order. A TR with no
        word at or before its reference time yields an empty string.

    Raises
    ------
    ValueError
        If ``num_words`` is smaller than 1, or ``trim`` / ``skip_start`` is negative.
    """
    if num_words < 1:
        raise ValueError(f"num_words must be >= 1, got {num_words}")
    if trim < 0 or skip_start < 0:
        raise ValueError(
            f"trim and skip_start must be >= 0, got trim={trim}, skip_start={skip_start}"
        )

    words = np.asarray(wordseq.data)
    word_times = np.asarray(wordseq.data_times)

    all_tr_times = wordseq.tr_times
    # An explicit stop index (instead of ``-trim``) keeps ``trim=0`` meaningful:
    # ``[: -0]`` would silently produce an empty selection.
    stop = max(len(all_tr_times) - trim, 0)
    tr_times = all_tr_times[skip_start + trim: stop]

    running_words = []
    for i in range(len(tr_times)):
        # Reference time is the previous retained TR (clamped at the first one).
        tr_time = tr_times[max(0, i - 1)]
        seen_so_far = word_times <= tr_time
        running_words.append(' '.join(words[seen_so_far][-num_words:]))
    return running_words

# Build the running texts story by story, then flatten them into one list per
# split. The flattening is a single pass: sum(list_of_lists, []) re-copies the
# growing accumulator for every story, which is quadratic in the total length.
# NOTE(review): tqdm is not imported explicitly in the visible cells — presumably
# it is provided by `from feature_spaces import *`; confirm.
texts_list_train = [
    get_words_for_story(wordseqs[story_name]) for story_name in tqdm(train_stories)
]
texts_train = [text for story_texts in texts_list_train for text in story_texts]

texts_list_test = [
    get_words_for_story(wordseqs[story_name]) for story_name in tqdm(test_stories)
]
texts_test = [text for story_texts in texts_list_test for text in story_texts]

In [None]:
subj = 'UTS03'

# select top_idxs
# The results directory is derived from `subj` so the two cannot drift apart
# when the subject is changed.
# NOTE(review): absolute, machine-specific path — consider making the root configurable.
save_dir = join('/home/chansingh/mntv1/deep-fMRI/results/encoding/bert-10__ndel=4', subj)

# np.load on an .npz archive returns a file-backed object; the context manager
# closes the underlying file once the array has been read.
with np.load(join(save_dir, 'corrs.npz')) as corrs_file:
    corrs_val = corrs_file['arr_0']

# argsort places NaN last, so reversing the order would rank NaN correlations
# *first*. Rank NaNs as -inf so they are only picked when nothing else is left.
corrs_for_ranking = np.where(np.isnan(corrs_val), -np.inf, corrs_val)
top_idxs = np.argsort(corrs_for_ranking)[::-1][:NUM_VOXELS]
print(corrs_val[top_idxs][:5])

In [None]:
# load responses (n_time_points x n_voxels)
# One call per split for the selected subject: train first, then test.
resp_train, resp_test = (
    encoding_utils.get_response(stories, subj)
    for stories in (train_stories, test_stories)
)
print(f"{resp_train.shape=}, {resp_test.shape=}")

# select top voxels
# Keep only the columns listed in top_idxs, in that order.
resp_train_voxel, resp_test_voxel = resp_train[:, top_idxs], resp_test[:, top_idxs]
print(f"{resp_train_voxel.shape=}, {resp_test_voxel.shape=}")

In [None]:
# Standardize every voxel column; each split gets its own freshly fitted scaler.
# NOTE(review): the test split is scaled with its own mean/std rather than the
# statistics of the train split — confirm this is intended.
resp_train_voxel = StandardScaler().fit(resp_train_voxel).transform(resp_train_voxel)
resp_test_voxel = StandardScaler().fit(resp_test_voxel).transform(resp_test_voxel)

In [None]:
# test a basic linear model on the top few voxels
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.feature_extraction.text import CountVectorizer

# The bag-of-words features do not depend on the voxel, so they are built once
# instead of being re-fitted on every loop iteration.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

for i in tqdm(range(10)):
    # One ridge model per voxel; the score is the Pearson correlation between
    # predicted and measured test responses.
    m = RidgeCV()
    m.fit(X_train, resp_train_voxel[:, i])
    preds = m.predict(X_test)
    score = np.corrcoef(preds, resp_test_voxel[:, i])[0, 1]
    print(i, f"{score=}")

In [None]:
# Column dictionaries for each split: the input text first, followed by one
# 'voxel_<k>' response column per selected voxel (k = 0 .. NUM_VOXELS - 1).
df_train = {
    'text': texts_train,
    **{f'voxel_{k}': resp_train_voxel[:, k] for k in range(NUM_VOXELS)},
}
df_test = {
    'text': texts_test,
    **{f'voxel_{k}': resp_test_voxel[:, k] for k in range(NUM_VOXELS)},
}

In [None]:
# Turn each split's column dict into a DataFrame, wrap it as a Dataset, and
# group both splits under the 'train' / 'test' keys.
dset_train = Dataset.from_pandas(pd.DataFrame(df_train))
dset_test = Dataset.from_pandas(pd.DataFrame(df_test))
ds = DatasetDict({'train': dset_train, 'test': dset_test})
print(ds)

In [None]:
# Upload both splits to the 'csinva/fmri_language_responses' dataset repository.
# NOTE(review): presumably requires Hugging Face Hub credentials with write
# access to that repository — confirm before re-running.
ds.push_to_hub('csinva/fmri_language_responses')