In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import numpy as np
import torch
import torchio as tio
import h5py
from ipywidgets import interact
import matplotlib.pyplot as plt

dir2 = os.path.abspath('..')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path: 
    sys.path.append(dir1)

In [2]:
# Load kamitani2019 dataset (author format and preprocessing)

from research.data.kamitani_2019 import Kamitani2019H5Preprocessed
from research.data.kamitani_2017 import Kamitani2017H5Preprocessed

#features_model = 'biggan-128'
#features_model = 'bigbigan-resnet50'
#features_model = 'RN50'
features_model = 'ViT-B=32'
#features_model = 'vqgan'

config = '2019'

if config == '2019':
    root = "X:\\Datasets\\Deep-Image-Reconstruction\\derivatives\\kamitani-preprocessed"
    subject = 'sub-03'
    index_name = 'image_index'
    index_type = int
    training_repetitions = 5
    test_repetitions = 24
    dataset_class = Kamitani2019H5Preprocessed
elif config == '2017':
    root = "X:\\Datasets\\Generic-Object-Decoding\\"
    subject = 'Subject1'
    index_name = 'stimulus_name'
    index_type = float
    training_repetitions = 1
    test_repetitions = 35
    dataset_class = Kamitani2017H5Preprocessed

dataset = dataset_class(
    root, 
    subjects=[subject,], 
    func_sessions=['natural_training', 'natural_test'],
    features_path=f'X:\\Datasets\\Deep-Image-Reconstruction\\derivatives\\{features_model}-features.hdf5'
)

In [3]:
print(dataset.stimulus_info['natural_test'])

{1: '1443537.022563', 2: '1621127.019020', 3: '1677366.018182', 4: '1846331.017038', 5: '1858441.011077', 6: '1943899.024131', 7: '1976957.013223', 8: '2071294.046212', 9: '2128385.020264', 10: '2139199.010398', 11: '2190790.015121', 12: '2274259.024319', 13: '2416519.012793', 14: '2437136.012836', 15: '2437971.005013', 16: '2690373.007713', 17: '2797295.015411', 18: '2824058.018729', 19: '2882301.014188', 20: '2916179.024850', 21: '2950256.022949', 22: '2951358.023759', 23: '3064758.038750', 24: '3122295.031279', 25: '3124170.013920', 26: '3237416.058334', 27: '3272010.011001', 28: '3345837.012501', 29: '3379051.008496', 30: '3452741.024622', 31: '3455488.028622', 32: '3482252.022530', 33: '3495258.009895', 34: '3584254.005040', 35: '3626115.019498', 36: '3710193.022225', 37: '3716966.028524', 38: '3761084.043533', 39: '3767745.000109', 40: '3941684.021672', 41: '3954393.010038', 42: '4210120.009062', 43: '4252077.010859', 44: '4254777.016338', 45: '4297750.025624', 46: '4387400.01669

In [4]:
# Select ROI and feature layers

X_key = 'ROI_VC'

#Y_key = 'visual.layer4.7.bn3'
#Y_key = 'visual'
#Y_key = 'layer4.2.add'
#Y_key = 'z_mean'
#Y_key = 'attnpool.getitem_8'
Y_key = 'embedding'
#Y_key = 'vqgan-f16-1024-latent'
#Y_key = 'z'
#Y_key = 'y_embedding'

classification = False
data = dataset.get_data(brain_keys=[X_key, index_name], feature_keys=[Y_key])
#Y_shape = data[subject]['natural_training'][Y_key].shape[1:]

In [5]:
# Apply preprocessing

from sklearn.model_selection import train_test_split


def preprocess(
    session_data, 
    X_key, 
    Y_key, 
    max_features=None, 
    average_repetitions=False, 
    num_repetitions=None, 
    split=None,
    seed=0
):
    np.random.seed(seed)
    
    if len(session_data[Y_key].shape) > 2:
        session_data[Y_key] = torch.flatten(torch.from_numpy(session_data[Y_key]), start_dim=1).numpy()
    
    if max_features is not None:
        if session_data[Y_key].shape[1] > max_features:
            choice = np.random.choice(max_features, size=max_features)
            session_data[Y_key] = session_data[Y_key][:, choice]
        
    image_index = session_data[index_name].astype(index_type)[:, 0]
    X = session_data[X_key]
    Y = session_data[Y_key]
    
    N = X.shape[0]
    if average_repetitions:
        assert num_repetitions is not None
        
        sorted_indices = np.argsort(image_index)
        X = X[sorted_indices]
        Y = Y[sorted_indices]
        
        N = N // num_repetitions
        X = np.stack([x.mean(axis=0) for x in np.split(X, N)])
        Y = np.stack([y[0] for y in np.split(Y, N)])
        image_index = np.arange(1, N + 1)
        
    if split:
        unique_indices, inverse_indices = np.unique(image_index, return_inverse=True)

        train_indices, _ = train_test_split(unique_indices, train_size=split)

        train_indices = set(train_indices)
        train_mask = np.array([i in train_indices for i in image_index])
        test_mask = ~train_mask

        N_train = train_mask.sum()
        N_test = N - N_train
        
        X_train, Y_train = X[train_mask], Y[train_mask]
        X_test, Y_test = X[test_mask], Y[test_mask]
        
        return X_train, Y_train, X_test, Y_test

    return X, Y

#max_features = None
max_features = 1024
X_train, Y_train, X_val, Y_val = preprocess(data[subject]['natural_training'], 
                                            max_features=max_features, X_key=X_key, Y_key=Y_key, 
                                            average_repetitions=False, num_repetitions=training_repetitions,
                                            split=0.8)

X_test, Y_test = preprocess(data[subject]['natural_test'], 
                            max_features=max_features, X_key=X_key, Y_key=Y_key, 
                            average_repetitions=True, num_repetitions=test_repetitions,)


In [6]:
print([item.shape for item in (X_train, Y_train, X_val, Y_val, X_test, Y_test)])

[(4800, 13099), (4800, 512), (1200, 13099), (1200, 512), (50, 13099), (50, 512)]


In [7]:
# Normalization
from sklearn.preprocessing import StandardScaler


X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_val = X_scaler.transform(X_val)
X_test = X_scaler.transform(X_test)

break
if classification:
    pass
else:
    Y_scaler = StandardScaler()
    Y_train = Y_scaler.fit_transform(Y_train)
    Y_val = Y_scaler.transform(Y_val)
    Y_test = Y_scaler.transform(Y_test)

SyntaxError: 'break' outside loop (Temp/ipykernel_14228/2025871939.py, line 10)

In [8]:
import scipy

def pearsonr(Y, Y_pred, axis=0):
    Y = Y.astype(np.double)
    Y_pred = Y_pred.astype(np.double)
    
    Y = Y - Y.mean(axis=axis, keepdims=True)
    Y_pred = Y_pred - Y_pred.mean(axis=axis, keepdims=True)
    
    Y = Y / scipy.linalg.norm(Y, axis=axis,  keepdims=True)
    Y_pred = Y_pred / scipy.linalg.norm(Y_pred, axis=axis,  keepdims=True)
    
    return (Y * Y_pred).sum(axis=axis).mean()

def cosine_similarity(Y, Y_pred, axis=0):
    Y = Y.astype(np.double)
    Y_pred = Y_pred.astype(np.double)
    
    Y = Y / scipy.linalg.norm(Y, axis=axis, keepdims=True)
    Y_pred = Y_pred / scipy.linalg.norm(Y_pred, axis=axis,  keepdims=True)
    
    return (Y * Y_pred).sum(axis=axis).mean()

def pearsonr_scipy(Y, Y_pred):
    r = []
    for i in range(Y.shape[1]):
        r.append(scipy.stats.pearsonr(Y[:, i], Y_pred[:, i])[0])
    return np.mean(r)
    
def r2_score(Y, Y_pred):
    ssr = ((Y - Y_pred) ** 2).sum(axis=0)
    sst = ((Y - Y.mean(axis=0, keepdims=True)) ** 2).sum(axis=0)
    return (1 - ssr / sst).mean()

In [9]:
# Fit ridge regression with the FastL2LiR from the Kamitani group.
from sklearn.metrics import r2_score
from fastl2lir import FastL2LiR

model = FastL2LiR()
model.fit(X_train, Y_train, alpha=100.0, n_feat=500)

Y_train_pred = model.predict(X_train)
Y_val_pred = model.predict(X_val)
Y_test_pred = model.predict(X_test)

print('r-batch', pearsonr(Y_train, Y_train_pred), pearsonr(Y_val, Y_val_pred), pearsonr(Y_test, Y_test_pred))
print('r-row', pearsonr(Y_train, Y_train_pred, axis=1), pearsonr(Y_val, Y_val_pred, axis=1), pearsonr(Y_test, Y_test_pred, axis=1))
print('cs-batch', cosine_similarity(Y_train, Y_train_pred), cosine_similarity(Y_val, Y_val_pred), cosine_similarity(Y_test, Y_test_pred))
print('cs-row', cosine_similarity(Y_train, Y_train_pred, axis=1), cosine_similarity(Y_val, Y_val_pred, axis=1), cosine_similarity(Y_test, Y_test_pred, axis=1))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 512/512 [00:10<00:00, 47.56it/s]


r-batch 0.44890961979031957 0.1674159272244446 0.2572398542703962
r-row 0.7875615338259986 0.7143328736393799 0.7445208071891645
cs-batch 0.5349494151022156 0.3170199659361004 0.38590073881393633
cs-row 0.7873143325248458 0.7139014140473315 0.7441747255531432


In [None]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from tqdm.notebook import tqdm

pipelines = []
for i in tqdm(range(Y_train.shape[1])):
    pipeline = Pipeline([
        ('feature_selection', SelectKBest(f_regression, k=500)),
        ('model', Ridge(alpha=100.))
        #('model', SVR())
    ])
    pipeline.fit(X_train, Y_train[:, i])
    pipelines.append(pipeline)

Y_train_pred = np.stack([pipeline.predict(X_train) for pipeline in pipelines], axis=1)
Y_val_pred = np.stack([pipeline.predict(X_val) for pipeline in pipelines], axis=1)
Y_test_pred = np.stack([pipeline.predict(X_test) for pipeline in pipelines], axis=1)

print('r', pearsonr(Y_train, Y_train_pred), pearsonr(Y_val, Y_val_pred), pearsonr(Y_test, Y_test_pred))

In [11]:
def get_top_k_correlation_indices(X, Y, k):
    X = X / np.linalg.norm(X, axis=0, keepdims=True)
    Y = Y / np.linalg.norm(Y, axis=0, keepdims=True)
    
    return np.stack([
        np.abs(X.T @ Y[:, i]).argsort(axis=0)[-k:]
        for i in range(Y.shape[1])
    ])

top_k_correlation_indices = get_top_k_correlation_indices(X_train, Y_train, 500)

  return sqrt(add.reduce(s, axis=axis, keepdims=keepdims))


In [None]:
models = []
for i in tqdm(range(Y_train.shape[1])):
    model = Ridge(alpha=100.)
    model.fit(X_train[:, top_k_correlation_indices[i]], Y_train[:, i])
    models.append(model)

Y_train_pred = np.stack([model.predict(X_train[:, top_k_correlation_indices[i]]) for i, model in enumerate(models)], axis=1)
Y_val_pred = np.stack([model.predict(X_val[:, top_k_correlation_indices[i]]) for i, model in enumerate(models)], axis=1)
Y_test_pred = np.stack([model.predict(X_test[:, top_k_correlation_indices[i]]) for i, model in enumerate(models)], axis=1)

print('r', pearsonr(Y_train, Y_train_pred), pearsonr(Y_val, Y_val_pred), pearsonr(Y_test, Y_test_pred))

In [13]:
# Deep learning 
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.optim import Adam
from torch.utils.data import DataLoader


class SparseLayer(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        selection_indices: torch.Tensor,
        **kwargs
    ):
        super().__init__(in_features, out_features, **kwargs)
        assert out_features == selection_indices.shape[0]
        self.register_buffer('selection_indices', selection_indices)

    def forward(self, x: torch.Tensor):
        x = x[:, self.selection_indices]
        x = x * self.weight
        x = x.sum(dim=2)
        return x


class CosineSimilarityLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.cosine_similarity = nn.CosineSimilarity(dim=1)
        
    def forward(self, x, y):
        return 1. - self.cosine_similarity(x, y).mean()


def pearsonr_torch(Y, Y_pred, dim=0):
    Y = Y.to(torch.float64)
    Y_pred = Y_pred.to(torch.float64)
    
    Y = Y - Y.mean(dim=dim, keepdim=True)
    Y_pred = Y_pred - Y_pred.mean(dim=dim, keepdim=True)
    
    Y = Y / torch.norm(Y, dim=dim, keepdim=True)
    Y_pred = Y_pred / torch.norm(Y_pred, dim=dim, keepdim=True)
    
    return (Y * Y_pred).sum(dim=dim).mean().item()


model = nn.Sequential(
    SparseLayer(in_features=500, out_features=512, selection_indices=torch.from_numpy(top_k_correlation_indices)),
    nn.ReLU(),
    torch.nn.Dropout(p=0.5, inplace=False),
    nn.Linear(512, 512),
)

#criterion = CosineSimilarityLoss()
#riterion = nn.MSELoss()
#criterion = nn.L1Loss()

optimizer = Adam(params=model.parameters(), lr=0.1)
dataset_training = TensorDataset(torch.from_numpy(X_train).float(), 
                                 torch.from_numpy(Y_train).float())
dataset_validation = TensorDataset(torch.from_numpy(X_val).float(), 
                                   torch.from_numpy(Y_val).float())
dataset_test = TensorDataset(torch.from_numpy(X_test).float(), 
                             torch.from_numpy(Y_test).float())

device = torch.device('cuda')
model.to(device)

Sequential(
  (0): SparseLayer(in_features=500, out_features=512, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=512, out_features=512, bias=True)
)

In [15]:
training_dataloader = DataLoader(dataset_training, shuffle=True, batch_size=128)

def get_data_iterator(loader):
    while True:
        for batch in loader:
            yield batch
            
def run_all(dataset):
    return torch.cat([model(x.to(device)[None]) for x, _ in dataset]).cpu()

training_data_iterator = get_data_iterator(training_dataloader)

max_iterations = 1500
for i in range(max_iterations):
    x, y = next(training_data_iterator)
    x = x.to(device)
    y = y.to(device)

    model.train()
    y_pred = model(x)
    loss = criterion(y, y_pred)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    model.eval()
    
    if i % 250 == 0:
        model.eval()
        with torch.no_grad():
            Y_train_pred = run_all(dataset_training)
            Y_val_pred = run_all(dataset_validation)
            Y_test_pred = run_all(dataset_test)
        
        print('r-batch', 
              pearsonr_torch(dataset_training.tensors[1], Y_train_pred), 
              pearsonr_torch(dataset_validation.tensors[1], Y_val_pred), 
              pearsonr_torch(dataset_test.tensors[1], Y_test_pred))
        print('r-row', 
              pearsonr_torch(dataset_training.tensors[1], Y_train_pred, dim=1), 
              pearsonr_torch(dataset_validation.tensors[1], Y_val_pred, dim=1), 
              pearsonr_torch(dataset_test.tensors[1], Y_test_pred, dim=1))

r-batch 0.5547963927015711 0.24145324461490764 0.3080806872471269
r-row 0.8348170976119066 0.7404708566743083 0.7520718797145054
r-batch 0.5652137497440708 0.23906500646119147 0.3110986830628377
r-row 0.8402669858749717 0.7400986929005368 0.7513049790806676
r-batch 0.5754995827219875 0.23889647244966924 0.31220508174371564
r-row 0.8439707166373858 0.7392252709719693 0.751670731993461
r-batch 0.5834982405064885 0.23727718407131712 0.3106389147329606
r-row 0.8480051379346375 0.7386184315242165 0.7503096790812159
r-batch 0.5895570707535469 0.23670333718632613 0.31070251080419564
r-row 0.8507802937507634 0.737970206581787 0.7498160205458423
r-batch 0.5936144926383666 0.23524993645301634 0.3073936595096415
r-row 0.8527225712091138 0.7379314531259841 0.7504854012051848


In [None]:
y_pred

In [None]:
from pathlib import Path

out_path = Path('X:\\Datasets\\Deep-Image-Reconstruction\\derivatives\\')

features_model

np.save(out_path / f'{features_model}__{Y_key}__{subject}__test-prediction__v2.npy', Y_scaler.inverse_transform(Y_test_pred).reshape(50, 128))#
np.save(out_path / f'{features_model}__{Y_key}__{subject}__test__v2.npy', Y_scaler.inverse_transform(Y_test).reshape(50, 128))#*Y_shape))

In [10]:
from pathlib import Path

out_path = Path('X:\\Datasets\\Deep-Image-Reconstruction\\derivatives\\')

features_model

np.save(out_path / f'{features_model}__{Y_key}__{subject}__test-prediction__v2.npy', Y_test_pred.reshape(50, 512))#
np.save(out_path / f'{features_model}__{Y_key}__{subject}__test__v2.npy', Y_test.reshape(50, 512))#*Y_shape))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from tqdm.notebook import tqdm

pipelines = []
for i in tqdm(range(Y_train.shape[1])):
    pipeline = Pipeline([
        ('feature_selection', SelectKBest(f_classif, k=500)),
        ('model', LogisticRegression(solver='liblinear'))
    ])
    pipeline.fit(X_train, Y_train[:, i])
    pipelines.append(pipeline)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from tqdm.notebook import tqdm

select_k_best = SelectKBest(f_classif, k=5)
X_train_selection = select_k_best.fit_transform(X_train, Y_train[:, 0])

model = LogisticRegression(solver='saga', C=1000.0)
model.fit(X_train_selection, Y_train[:, 0])

X_test_selection = select_k_best.transform(X_test,)
print(model.score(X_train_selection, Y_train[:, 0]), model.score(X_test_selection, Y_test[:, 0]))

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score

df = pd.read_pickle('G:\\Github Repositories\\GenericObjectDecoding\\code\\python\\results\\GenericObjectDecoding.pkl')
#print(df)
df = df[df['feature'] == 'cnn8']

true_feature_averaged_percept = df['true_feature_averaged_percept'].array[0]
predicted_feature_averaged_percept = df['predicted_feature_averaged_percept'].array[0]
print(predicted_feature_averaged_percept.shape)

print(r2_score(true_feature_averaged_percept, predicted_feature_averaged_percept))
print(pearsonr(true_feature_averaged_percept, predicted_feature_averaged_percept))

In [None]:
Y_stuff = Y_train.reshape(4800, *Y_shape)
Y_stuff.std(axis=(0, 1))