In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import dvu
import seaborn as sns
import os
import pandas as pd
from copy import deepcopy
from matplotlib import pyplot as plt
from os.path import join
import numpy as np
import neuro.features.feature_utils
import joblib
import neuro.data.story_names
import neuro.data.response_utils
from tqdm import tqdm
import neuro.features.feature_spaces
from himalaya.ridge import RidgeCV
from sklearn.model_selection import check_cv
from himalaya.backend import set_backend
import neuro.config
from tabpfn import TabPFNRegressor
from collections import defaultdict

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Subject identifier; used below to look up story names and to locate this
# subject's folder under the processed-data directory.
subject = 'UTS03'

In [None]:
# Story names for this subject, one lookup per split (train first, then test).
# could set use_huge=True here and below....
story_names_train, story_names_test = (
    neuro.data.story_names.get_story_names(
        subject=subject, train_or_test=split_name)
    for split_name in ('train', 'test')
)

In [None]:
class A:
    """Bare-bones args object handed to ``get_features_full`` as ``args``."""
    # name of the feature space to load
    feature_space = 'eng1000'


# load feats, already trimmed and normalized
args = A()

# The train features must be loaded here: they are printed below and used by
# the subsampling cell, so leaving this call commented out makes the notebook
# fail on a fresh kernel (Restart & Run All).
feats_train = neuro.features.feature_utils.get_features_full(
    args=args,
    feature_space=args.feature_space,
    qa_embedding_model=None,
    story_names=story_names_train,
    use_added_delays=False,
)
feats_test = neuro.features.feature_utils.get_features_full(
    args=args,
    feature_space=args.feature_space,
    qa_embedding_model=None,
    story_names=story_names_test,
    use_added_delays=False,
)

print('feat shapes', feats_train.shape, feats_test.shape)
# NOTE(review): resps_train / resps_test are not assigned in any cell visible
# in this notebook -- they must be loaded before this line (presumably via
# neuro.data.response_utils; confirm the exact loader call).
print('resp shapes', resps_train.shape, resps_test.shape)

In [None]:
# Subsample output voxels (response columns) and input feature columns.
# NOTE(review): n_voxels and n_feats are not assigned in any visible cell.
rng = np.random.default_rng(0)

# Saved correlation values used to rank voxels.
# NOTE(review): joblib.load unpickles the file -- only load trusted files.
corrs_path = join(
    neuro.config.PROCESSED_DIR, subject.replace('UT', ''), 'corrs_test_35.pkl')
corrs_test = joblib.load(corrs_path).values[0]

# Keep the 1000 highest-ranked voxel indices, then draw n_voxels of them at
# random without replacement.
inds_top = np.argsort(corrs_test)[-1000:]
random_voxels = rng.choice(inds_top, size=n_voxels, replace=False)

# Responses: selected voxel columns only, cast to float32.
resps_train_subsampled_voxels = (
    resps_train[:, random_voxels].astype(np.float32))
resps_test_subsampled_voxels = (
    resps_test[:, random_voxels].astype(np.float32))

# Features: first n_feats columns only, cast to float32.
feats_train_subsampled = feats_train[:, :n_feats].astype(np.float32)
feats_test_subsampled = feats_test[:, :n_feats].astype(np.float32)

print('feat shapes', feats_train_subsampled.shape, feats_test_subsampled.shape)
print(
    'resp shapes',
    resps_train_subsampled_voxels.shape,
    resps_test_subsampled_voxels.shape,
)

In [None]:
# Cross-validation setup: the training samples are cut into consecutive
# chunks of chunk_len samples, and the chunk start indices are handed to the
# leave-one-run-out generator as run onsets.
chunk_len = 40
n_samples_train = len(feats_train_subsampled)
chunk_starts = np.arange(0, n_samples_train, chunk_len)

# NOTE(review): generate_leave_one_run_out is neither imported nor defined in
# any visible cell -- confirm where it comes from.
# check_cv copies the cross-validation splitter into a reusable list.
cv = check_cv(generate_leave_one_run_out(n_samples_train, chunk_starts))

In [None]:
# Ridge baseline; the regularization strength is chosen among `alphas` using
# the cross-validation splits in `cv`.
model = RidgeCV(alphas=[1, 10, 100], cv=cv)

# add delays for ridge
# The delayed features were previously built but never used (the model was fit
# on the undelayed matrix). Train and test get the same delays so that the
# fitted model can be applied to both.
# NOTE(review): n_delays_ridge is not assigned in any visible cell.
ridge_delays = range(1, n_delays_ridge + 1)
feats_train_subsampled_delayed = neuro.features.feature_utils.make_delayed(
    feats_train_subsampled, ridge_delays)
feats_test_subsampled_delayed = neuro.features.feature_utils.make_delayed(
    feats_test_subsampled, ridge_delays)

model.fit(feats_train_subsampled_delayed, resps_train_subsampled_voxels)
preds_train = model.predict(feats_train_subsampled_delayed)
preds_test = model.predict(feats_test_subsampled_delayed)


def get_corrs(preds, resps):
    """Column-wise Pearson correlation between predictions and responses.

    Parameters
    ----------
    preds, resps : 2-D arrays indexed as [row, column]
        Column ``i`` of ``preds`` is compared with column ``i`` of ``resps``.

    Returns
    -------
    list
        One correlation value per column of ``preds``.
    """
    corrs = []
    for col in range(preds.shape[1]):
        # np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the
        # off-diagonal entry, i.e. the correlation between the two inputs.
        corr_matrix = np.corrcoef(preds[:, col], resps[:, col])
        corrs.append(corr_matrix[0, 1])
    return corrs


# Results container: metric name -> list of per-voxel values. Missing keys
# start out as empty lists so later cells can append to them directly.
r = defaultdict(list)
r.update(
    corrs_train_ridge=get_corrs(preds_train, resps_train_subsampled_voxels),
    corrs_test_ridge=get_corrs(preds_test, resps_test_subsampled_voxels),
)

# Mean ridge correlation across the selected voxels.
print('train', np.mean(r['corrs_train_ridge']))
print('test', np.mean(r['corrs_test_ridge']))

In [None]:
def get_time_cols(n):
    """Build time-position features for ``n`` consecutive samples.

    Column 0 is the normalized position ``i / n``. It is followed by
    (sin, cos) pairs of that position at 5 log-spaced frequencies from 1 to
    1000 cycles over the whole range.

    Returns
    -------
    np.ndarray of shape (n, 11)
    """
    t = np.arange(n) / n
    columns = [t]
    for n_cycles in np.logspace(0, 3, 5):
        phase = t * n_cycles * 2 * np.pi
        columns.extend((np.sin(phase), np.cos(phase)))
    return np.stack(columns, axis=1)


# Time columns are generated once over train + test laid end to end, so the
# test rows continue the time axis where the train rows stop.
n_train_rows = feats_train_subsampled.shape[0]
n_test_rows = feats_test_subsampled.shape[0]
time_cols = get_time_cols(n_train_rows + n_test_rows)

# Append the matching slice of time columns to each feature matrix.
feats_train_subsampled_with_time = np.hstack(
    [feats_train_subsampled, time_cols[:n_train_rows]])
feats_test_subsampled_with_time = np.hstack(
    [feats_test_subsampled, time_cols[n_train_rows:]])

In [None]:
# Device string passed to every TabPFNRegressor created below.
TABPFN_DEVICE = 'cuda:1'


def _tabpfn_test_corr(feats_tr, target_tr, feats_te, resp_te):
    """Fit a fresh TabPFNRegressor and return its test correlation.

    The model is fit on ``(feats_tr, target_tr)`` and used to predict
    ``feats_te``; the return value is the Pearson correlation between those
    predictions and ``resp_te``. Only local names are used, so the notebook's
    ridge ``model`` / ``preds_test`` globals are left untouched.
    """
    tabpfn_model = TabPFNRegressor(device=TABPFN_DEVICE)
    tabpfn_model.fit(feats_tr, target_tr)
    preds_te = tabpfn_model.predict(feats_te)
    return np.corrcoef(preds_te, resp_te)[0, 1]


# Start from empty lists so that re-running this cell does not append to old
# results (entry k must correspond to voxel k of the ridge results).
r['corrs_test_tabpfn'] = []
r['corrs_test_tabpfn_time'] = []

for i in tqdm(range(preds_train.shape[1])):
    # NOTE(review): the regression target is the ridge *prediction* for voxel
    # i (preds_train), not the measured training response -- confirm that
    # this is intended.
    target_train = preds_train[:, i]
    resp_test = resps_test_subsampled_voxels[:, i]

    r['corrs_test_tabpfn'].append(_tabpfn_test_corr(
        feats_train_subsampled, target_train,
        feats_test_subsampled, resp_test))

    # redo with time feats
    r['corrs_test_tabpfn_time'].append(_tabpfn_test_corr(
        feats_train_subsampled_with_time, target_train,
        feats_test_subsampled_with_time, resp_test))

    print(
        f'voxel {i} test {r["corrs_test_ridge"][i]:.3f} -> {r["corrs_test_tabpfn"][-1]:.3f} -> {r["corrs_test_tabpfn_time"][-1]:.3f}')

    # Running mean gain over ridge, restricted to the voxels processed so far.
    n_done = len(r['corrs_test_tabpfn'])
    ridge_mean_so_far = np.mean(r['corrs_test_ridge'][:n_done])
    print(
        '\tavg cum. improvement',
        np.mean(r['corrs_test_tabpfn']) - ridge_mean_so_far,
        np.mean(r['corrs_test_tabpfn_time']) - ridge_mean_so_far,
    )