In [21]:
%load_ext autoreload
%autoreload 2

import os
import sys
import random

import numpy as np
import torch
import torch.nn.functional as F
import h5py
import h5py
import pandas as pd
import nibabel as nib
from pathlib import Path
from einops import rearrange

# Put the directory three levels above the working directory on sys.path
# (presumably so the `research` package imported below resolves - confirm
# against the repository layout).
dir2 = os.path.abspath('../..')
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)
    
from research.data.natural_scenes import NaturalScenesDataset
from research.metrics.metrics import compute_ncsnr_fast, compute_nc


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Root of the local Natural Scenes Dataset download.
dataset_path = Path('D:\\Datasets\\NSD\\')

# Sub-directories of the dataset root that the following cells read from / write to.
derivatives_path = dataset_path.joinpath('derivatives')
betas_path = dataset_path.joinpath('nsddata_betas', 'ppdata')
ppdata_path = dataset_path.joinpath('nsddata', 'ppdata')

In [4]:
# One (initially empty) record per participant, keyed 'subj01' .. 'subj08'.
subjects = {f'subj0{i}': {} for i in range(1, 9)}

for subject_name, subject_data in subjects.items():
    # Per-trial behavioural responses table for this subject.
    responses = pd.read_csv(
        ppdata_path / subject_name / 'behav' / 'responses.tsv',
        sep='\t',
    )

    # The last 3 sessions are currently held-out for the algonauts challenge,
    # so drop every row that belongs to one of them.
    is_held_out = responses['SESSION'] > (np.max(responses['SESSION']) - 3)
    subject_data['responses'] = responses[~is_held_out]

    # Highest session number that survived the filter above.
    num_sessions = np.max(subject_data['responses']['SESSION'])
    sessions_dir = betas_path / subject_name / 'func1pt8mm' / 'betas_fithrf_GLMdenoise_RR'

    # Open one read-only HDF5 handle per remaining session. The handles are
    # left open on purpose: the concatenation cell reads from them later.
    session_files = []
    for i in range(1, num_sessions + 1):
        session_files.append(h5py.File(sessions_dir / f'betas_session{i:02}.hdf5', 'r'))
    subject_data['sessions'] = session_files




In [5]:
# Merge all per-session betas of a subject into a single HDF5 dataset
# (this will take a while).

for subject_name, subject_data in subjects.items():
    print(subject_name)
    out_dir = derivatives_path / 'betas' / subject_name / 'func1pt8mm' / 'betas_fithrf_GLMdenoise_RR'
    out_dir.mkdir(parents=True, exist_ok=True)
    with h5py.File(out_dir / 'betas_sessions.hdf5', 'a') as f:
        sessions = subject_data['sessions']
        num_sessions = len(sessions)

        # Every session is assumed to have the shape of the first one.
        T, W, H, D = sessions[0]['betas'].shape
        T_full = T * num_sessions
        slice_size = H * D

        # One row per trial, one column per flattened voxel position. Each
        # chunk holds a single column over all trials.
        f.require_dataset('betas', shape=(T_full, W * H * D), dtype=np.int16, chunks=(T_full, 1))

        # Work through the second axis one index at a time so that only one
        # slab per session has to be held in memory.
        for i in range(W):
            slabs = [session['betas'][:, i] for session in sessions]
            Y = np.concatenate(slabs)
            start = slice_size * i
            stop = slice_size * (i + 1)
            f['betas'][:, start:stop] = rearrange(Y, 't ... -> t (...)')

subj01


KeyboardInterrupt: 

In [6]:
# Create the NaturalScenesDataset object from the dataset root configured above.
# Later cells read `nsd.subjects` / `nsd.dataset_path` and call
# `nsd.get_split` and `nsd.load_betas` on this instance.

nsd = NaturalScenesDataset(dataset_path)

In [7]:
# Image ids of the 1000 images shared across all participants.
shared_1000_path = dataset_path.joinpath('nsddata', 'stimuli', 'nsd', 'shared1000.tsv')
shared_1000_table = pd.read_csv(shared_1000_path, sep='\t', header=None)
# The ids live in the first (unnamed) column; keep them as a set so they can
# be intersected with per-subject id sets later.
shared_1000 = set(shared_1000_table[0])

In [15]:
# Generate a train-test-validation split.
# Only the test and validation sets are stored explicitly, both in
# `nsd.subjects[...]` and in `<derivatives>/data_splits/<split_name>.hdf5`.
split_name = 'split-01'
N_test = 1000
N_validation = 1000

seed = 0


def _split_image_ids(three_repetition_ids, shared_ids, n_non_shared, n_validation, seed):
    """Choose the test and validation image ids for a single subject.

    Args:
        three_repetition_ids: set of image ids the subject saw exactly three times.
        shared_ids: set of image ids that go into every subject's test set.
        n_non_shared: number of subject-specific ids used to fill up the test set.
        n_validation: maximum number of subject-specific ids in the validation set.
        seed: seed of the shuffle applied to the subject-specific ids.

    Returns:
        A ``(test_image_ids, validation_image_ids)`` pair of lists. The two
        lists are disjoint as long as ``n_non_shared`` is non-negative.
    """
    non_shared_ids = list(three_repetition_ids - shared_ids)
    # A fresh Random(seed) per call keeps every subject's shuffle independent
    # of the order in which subjects are processed.
    random.Random(seed).shuffle(non_shared_ids)
    test_ids = list(shared_ids) + non_shared_ids[:n_non_shared]
    validation_ids = non_shared_ids[n_non_shared:(n_non_shared + n_validation)]
    return test_ids, validation_ids


# Collect, per subject, the images that were shown exactly three times.
for subject_name, subject_data in nsd.subjects.items():
    responses = subject_data['responses']

    image_ids = responses['73KID'].to_numpy()
    unique_image_ids, unique_counts = np.unique(image_ids, return_counts=True)
    three_repetition_ids = unique_image_ids[unique_counts == 3]
    subject_data['three_repetition_ids'] = set(three_repetition_ids)
    print(f'{subject_name} {image_ids.shape=}, {len(three_repetition_ids)=}')

# Shared images that *every* subject saw three times form the common test core.
shared_1000_three_repetitions = set.intersection(
    shared_1000,
    *[subject_data['three_repetition_ids']
    for subject_data in nsd.subjects.values()]
)
print(f'{len(shared_1000_three_repetitions)=}')
N_non_shared = N_test - len(shared_1000_three_repetitions)
if N_non_shared < 0:
    # A negative count would turn the slices in _split_image_ids into
    # from-the-end slices and silently produce overlapping sets.
    raise ValueError(
        f'N_test={N_test} is smaller than the number of shared three-repetition '
        f'images ({len(shared_1000_three_repetitions)})'
    )


# Compute the split once per subject and keep it on the dataset object.
for subject_name, subject_data in nsd.subjects.items():
    test_image_ids, validation_image_ids = _split_image_ids(
        subject_data['three_repetition_ids'],
        shared_1000_three_repetitions,
        N_non_shared,
        N_validation,
        seed,
    )
    subject_data['test_image_ids'] = np.array(test_image_ids)
    # Fix: this entry used to be filled with the *test* ids.
    subject_data['validation_image_ids'] = np.array(validation_image_ids)

    # Row positions (within the responses table) of the trials in each set.
    image_ids = subject_data['responses']['73KID'].to_numpy()
    test_response_mask = np.isin(image_ids, subject_data['test_image_ids'])
    validation_response_mask = np.isin(image_ids, subject_data['validation_image_ids'])
    subject_data['test_response_ids'] = np.argwhere(test_response_mask)[:, 0]
    subject_data['validation_response_ids'] = np.argwhere(validation_response_mask)[:, 0]

# Persist the same split; mode 'w' replaces any existing file of that name.
(derivatives_path / 'data_splits').mkdir(exist_ok=True, parents=True)
with h5py.File(derivatives_path / 'data_splits' / f'{split_name}.hdf5', 'w') as f:
    for subject_name, subject_data in nsd.subjects.items():
        subject = f.require_group(subject_name)

        subject['test_image_ids'] = subject_data['test_image_ids']
        # Fix: the file used to contain the test ids under this key as well.
        subject['validation_image_ids'] = subject_data['validation_image_ids']

        image_ids = subject_data['responses']['73KID'].to_numpy()
        subject['test_response_mask'] = np.isin(image_ids, subject_data['test_image_ids'])
        subject['validation_response_mask'] = np.isin(image_ids, subject_data['validation_image_ids'])


subj01 image_ids.shape=(27750,), len(three_repetition_ids)=8420
subj02 image_ids.shape=(27750,), len(three_repetition_ids)=8420
subj03 image_ids.shape=(21750,), len(three_repetition_ids)=5081
subj04 image_ids.shape=(20250,), len(three_repetition_ids)=4424
subj05 image_ids.shape=(27750,), len(three_repetition_ids)=8420
subj06 image_ids.shape=(21750,), len(three_repetition_ids)=5081
subj07 image_ids.shape=(27750,), len(three_repetition_ids)=8420
subj08 image_ids.shape=(20250,), len(three_repetition_ids)=4424
len(shared_1000_three_repetitions)=413


In [20]:
def require_dataset(group, name, data):
    """Store ``data`` under ``name`` in ``group``, creating the dataset if needed.

    The dataset is requested with the shape and dtype of ``data`` and then
    overwritten in full, so calling this again with same-shaped data replaces
    the previous contents.

    Args:
        group: object offering ``require_dataset(name, shape=..., dtype=...)``
            (an h5py file or group in this notebook).
        name: dataset name / path inside ``group``.
        data: array providing ``shape`` and ``dtype`` and the values to write.
    """
    target = group.require_dataset(name, shape=data.shape, dtype=data.dtype)
    target[:] = data

# Compute a per-voxel noise ceiling on the training portion of 'split-01' for
# every subject and store it (plus a descending ranking of the voxels) in
# derivatives/noise-ceiling.hdf5.
with h5py.File(nsd.dataset_path / 'derivatives/noise-ceiling.hdf5', 'a') as f:
    for subject_id in range(8):
        subject_name = f'subj0{subject_id + 1}'
        print(subject_name)

        subject = nsd.subjects[subject_name]
        # First element returned by get_split is used as the training mask
        # over responses - TODO confirm against NaturalScenesDataset.get_split.
        train_mask = nsd.get_split(subject_name, 'split-01')[0]

        betas_h5 = subject['betas']
        responses = subject['responses']
        # Shift the 73KID ids down by one (presumably 1-based -> 0-based).
        stimulus_ids = np.array(responses['73KID']) - 1
        stimulus_ids = stimulus_ids[train_mask]

        # For every stimulus shown at least n times, keep the positions of its
        # first n presentations -> array of shape (num_stimuli, n).
        n = 3
        unique_ids, unique_counts = np.unique(stimulus_ids, return_counts=True)
        atleast_n_ids = unique_ids[unique_counts >= n]
        repetition_ids = np.stack([
            np.where(stimulus_ids == i)[0][:n]
            for i in atleast_n_ids
        ])

        # Process the voxels in batches of roughly voxel_batch_size.
        num_betas, num_voxels = betas_h5['betas'].shape
        voxel_batch_size = 10000
        # array_split needs at least one section; without the max() a subject
        # with fewer than voxel_batch_size voxels would raise.
        num_batches = max(1, num_voxels // voxel_batch_size)
        indices_batches = np.array_split(np.arange(num_voxels), num_batches)
        ncsnr = []

        for betas_indices in indices_batches:
            print(f'{betas_indices[-1]}/{num_voxels}, {betas_indices[-1] / num_voxels * 100:.1f}%')
            betas = nsd.load_betas(subject_name, betas_indices=betas_indices, return_tensor_dataset=False)[0]
            betas = betas[train_mask]
            ncsnr.append(compute_ncsnr_fast(betas, repetition_ids))
        ncsnr = np.concatenate(ncsnr)

        nc = compute_nc(ncsnr, num_averages=1)

        voxel_selection_path = 'derivatives/voxel-selection.hdf5'
        voxel_selection_key = 'nc/value'

        # The existing voxel-selection entry is only used for its shape. Open
        # it in a `with` block so the handle is closed again for each subject.
        key = f'{subject_name}/{voxel_selection_key}'
        with h5py.File(nsd.dataset_path / voxel_selection_path, 'r') as voxel_selection_file:
            nc_original = voxel_selection_file[key][:]

        nc = nc.reshape(nc_original.shape)
        nc[np.isnan(nc)] = 0.
        # `grid` lists the index of every position of nc in flat order, so
        # indexing it with the flat ranking yields the ranking as indices.
        grid = np.argwhere(np.ones_like(nc, dtype=bool))
        nc_sorted_indices_flat = nc.argsort(axis=None)[::-1].astype(int)
        nc_sorted_indices = grid[nc_sorted_indices_flat].astype(int)

        require_dataset(f, f'{subject_name}/split-01/value', nc)
        require_dataset(f, f'{subject_name}/split-01/sorted_indices_flat', nc_sorted_indices_flat)
        require_dataset(f, f'{subject_name}/split-01/sorted_indices', nc_sorted_indices)
        
        

subj01
10133/699192, 1.4%


  ncsnr = std_signal / std_noise


20267/699192, 2.9%
30401/699192, 4.3%
40535/699192, 5.8%


KeyboardInterrupt: 

In [23]:
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

nsd_path = Path('D:\\Datasets\\NSD')
stimuli_path = nsd_path.joinpath('nsddata_stimuli', 'stimuli', 'nsd', 'nsd_stimuli.hdf5')
# The file is opened without a `with` block and never closed here:
# `stimulus_images` is indexed image by image in a later cell.
stimulus_images = h5py.File(stimuli_path, 'r')['imgBrick']

In [24]:
# Load a CLIP model; only its visual sub-module is used from here on.
import clip

print(clip.available_models())
model_name = 'ViT-B/32'
full_model, preprocess = clip.load(model_name, device=device)
model = full_model.visual

# Maps a module name (as listed by `model.named_modules()`) to the name under
# which that module's output is saved. The empty string is the name
# `named_modules()` gives the root module, i.e. `model` itself.
save_modules = {'': 'embedding'}

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']


In [27]:
# Run every stimulus image through `model` and store the outputs of the
# modules selected in `save_modules` in one HDF5 file per model.
from functools import partial
from PIL import Image
from typing import Sequence, Dict

out_path = dataset_path / 'derivatives' / 'stimulus_embeddings'
out_path.mkdir(exist_ok=True, parents=True)
modules = dict(model.named_modules())

# Normalise `save_modules` to {module name: feature name}. A sequence saves
# each module under its own name; a dict supplies the names explicitly.
if isinstance(save_modules, Sequence):
    hook_targets = {module_name: module_name for module_name in save_modules}
elif isinstance(save_modules, Dict):
    hook_targets = dict(save_modules)
else:
    hook_targets = {}

# Filled by the hooks during each forward pass and emptied before the next.
features = {}


def forward_hook(feature_name, module, x_in, x_out):
    """Forward hook that records ``x_out`` under ``feature_name`` as a float32 array."""
    # Drop the leading axis when it has length 1 (one image per forward pass).
    if x_out.shape[0] == 1:
        x_out = x_out[0]
    features[feature_name] = x_out.detach().clone().cpu().float().numpy()


# Register the hooks once instead of once per image, and always remove them
# again - also when the loop is interrupted - so no stale hooks stay on the
# model and fire during later forward passes.
hook_handles = [
    modules[module_name].register_forward_hook(partial(forward_hook, feature_name))
    for module_name, feature_name in hook_targets.items()
]
try:
    with h5py.File(out_path / f"{model_name.replace('/', '=').replace('@', '-')}.hdf5", "a") as f:
        N = stimulus_images.shape[0]
        for stimulus_id in range(N):
            features.clear()
            image_data = stimulus_images[stimulus_id]

            image = Image.fromarray(image_data)
            x = preprocess(image).unsqueeze(0).to(device)

            with torch.no_grad():
                model(x)

            for feature_name, feature in features.items():
                # The feature shape is only known after the first forward
                # pass, hence require_dataset here rather than up front.
                dataset = f.require_dataset(feature_name, (N, *feature.shape), feature.dtype)
                dataset[stimulus_id] = feature
finally:
    for hook_handle in hook_handles:
        hook_handle.remove()
            
            
            
            

KeyboardInterrupt: 