# Loading the package

This notebook will walk you through a simple usecase of Neuroprobe and evaluation of the logistic regression baseline. It can be easily adapted to evaluate any foundation model of neural activity.

In [12]:
import os
# NOTE: Change this to your own path, or define an environment variable elsewhere
os.environ['ROOT_DIR_BRAINTREEBANK'] = r'C:\Users\User\Documents\UNI\BRAIN\braintree' 

import torch
import neuroprobe.config as neuroprobe_config

# Make sure the config ROOT_DIR is set correctly
print("Expected braintreebank data at:", neuroprobe_config.ROOT_DIR)
print("Sampling rate:", neuroprobe_config.SAMPLING_RATE, "Hz")

Expected braintreebank data at: C:\Users\User\Documents\UNI\BRAIN\braintree
Sampling rate: 2048 Hz


## The BrainTreebankSubject Class

In [9]:
from neuroprobe import BrainTreebankSubject

subject_id = 3

coordinates_type = "cortical" # "cortical", "mni", "lpi". NOTE: MNI are not yet available for the Braintreebank dataset.
# cortical = standardized brain atlas cortical projection of the coordinates in Freesurfer space
# mni = MNI coordinates
# lpi = LPI coordinates (left, posterior, inferior) in the subject's coordinate system

# use cache=True to load this trial's neural data into RAM, if you have enough memory!
# It will make the loading process faster.
subject = BrainTreebankSubject(subject_id, allow_corrupted=False, cache=True, dtype=torch.float32, coordinates_type=coordinates_type)
print("Loaded subject", subject_id)
print("Electrode labels (first 10):", subject.electrode_labels[:10]) # list of electrode labels

print("\nElectrode coordinates (MNI space) of the first 10 electrodes:")
print(subject.get_electrode_coordinates()[:10]) # L, P, I coordinates of the electrodes

# Optionally, subset the electrodes to a specific set of electrodes. NOTE: you should not do this if you are using the neuroprobe as a standardized benchmark.
# subject.set_electrode_subset(['F3aOFa2', 'F3aOFa3', 'F3aOFa4', 'F3aOFa7']) # if you change this line when using cache=True, you need to clear the cache after: subject.clear_neural_data_cache()
# print("Electrode labels after subsetting:", subject.electrode_labels)

Loaded subject 3
Electrode labels (first 10): ['F2Ia1', 'F2Ia2', 'F2Ia3', 'F2Ia4', 'F2Ia5', 'F2Ia6', 'F2Ia7', 'F2Ia8', 'F2Ia9', 'F2Ia10']

Electrode coordinates (MNI space) of the first 10 electrodes:
tensor([[ 52.0551, -49.1958,  -0.2968],
        [ 52.4313, -47.6345,  -2.4238],
        [ 54.8303, -43.2729,  -5.6083],
        [ 55.2545, -38.4559,  -7.9876],
        [ 57.9486, -35.4795,  -9.0304],
        [ 59.5240, -33.7188,  -9.7763],
        [ 61.0274, -31.4587, -10.8302],
        [ 62.1906, -28.7069, -12.2043],
        [ 83.5639, -10.3173, -16.6571],
        [ 82.7500,  -7.8472, -16.6011]])


Loading the electrode data from a specific trial:

In [10]:
trial_id = 0

subject.load_neural_data(trial_id)
window_from = None # This is the index into the neural data array from where to start loading the data.
window_to = None # if None, the whole trial will be loaded

all_neural_data = subject.get_all_electrode_data(trial_id, window_from=window_from, window_to=window_to)

print("All neural data shape:")
print(all_neural_data.shape) # (n_electrodes, n_samples). To get the data for a specific electrode, use subject.get_electrode_data(trial_id, electrode_label)

print("\nFirst 50 samples of the first electrode (data is in microvolts):")
print(all_neural_data[0, :50])

All neural data shape:
torch.Size([112, 14017056])

First 50 samples of the first electrode (data is in microvolts):
tensor([60.3463, 61.4097, 56.6245, 57.6879, 59.5488, 59.8146, 58.4854, 55.8270,
        57.1562, 59.2829, 59.5488, 57.6879, 58.2195, 60.6121, 59.2829, 61.4097,
        63.5364, 61.6755, 61.1438, 61.6755, 61.4097, 60.3463, 60.0804, 59.2829,
        61.1438, 60.0804, 61.1438, 63.2706, 63.8022, 61.4097, 60.0804, 63.2706,
        61.9413, 62.2072, 64.0681, 64.8656, 63.5364, 63.2706, 66.1948, 66.1948,
        66.4607, 66.7265, 66.1948, 65.9290, 68.5874, 69.1191, 66.1948, 70.1825,
        69.6508, 68.3216])


## The BrainTreebankSubjectTrialBenchmarkDataset Class

NOTE: In the dataset below, there will be fewer electrodes than in the full subject data. This is because the Neuroprobe benchmark only uses a subset of the electrodes for standardized and quick benchmarking. The electrode labels below are subset to the `neuroprobe_config.NEUROPROBE_LITE_ELECTRODES` list.

Accordingly, when using the `BrainTreebankSubjectTrialBenchmarkDataset` with `lite=True` (which is the default Neuroprobe benchmark option), make sure that you use the `dataset.electrode_labels` and `dataset.electrode_coordinates` properties, which give the electrode labels and the electrode coordinates in MNI space, respectively, in the exact order that the `dataset` will output the data tensors in.

In [14]:
from neuroprobe import BrainTreebankSubjectTrialBenchmarkDataset

# Options for eval_name (from the Neuroprobe paper): neuroprobe_config.EVAL_NAMES
#   frame_brightness, global_flow, local_flow, face_num, volume, pitch, delta_volume, 
#   speech, onset, scene_onset, gpt2_surprisal, word_length, word_gap, word_index, word_head_pos, word_part_speech, speaker
eval_name = "scene_onset"  # Try "scene_onset" to detect scene changes in the movie

# if True, the dataset will output the indices of the samples in the neural data in a tuple: (index_from, index_to); 
# if False, the dataset will output the neural data directly
output_indices = False

start_neural_data_before_word_onset = 0 # the number of samples to start the neural data before each word onset
end_neural_data_after_word_onset = neuroprobe_config.SAMPLING_RATE * 1 # the number of samples to end the neural data after each word onset -- here we use 1 second

dataset = BrainTreebankSubjectTrialBenchmarkDataset(subject, trial_id, dtype=torch.float32, eval_name=eval_name, output_indices=output_indices, 
                                                    start_neural_data_before_word_onset=start_neural_data_before_word_onset, end_neural_data_after_word_onset=end_neural_data_after_word_onset,
                                                    lite=True) # the default is Neuroprobe Lite for standardized and quick benchmarking. Feel free to set lite=false if trying to access the Full dataset.
# P.S. Allow partial cache -- whether to allow partial caching of the neural data, if only part of it is needed for this particular dataset. Better set to False when doing multiple evals back to back, but better set to True when doing a single eval.

data_electrode_labels = dataset.electrode_labels # NOTE: this is different from the subject.electrode_labels! Neuroprobe uses a special subset of electrodes in this exact order.
data_electrode_coordinates = dataset.electrode_coordinates 

print("Items in the dataset:", len(dataset), "\n")
print(f"The first item: (shape = {dataset[0][0].shape})", dataset[0][0], f"label = {dataset[0][1]}", sep="\n")
print("")
print(f"Electrode labels in the data above in the following order ({len(data_electrode_labels)} electrodes):", data_electrode_labels)
print(f"Electrode coordinates in the data above in the following order ({len(data_electrode_coordinates)} electrodes):", data_electrode_coordinates)

AssertionError: eval_name must be one of ['enhanced_pitch', 'rms', 'mean_pixel_brightness', 'max_global_magnitude', 'max_vector_magnitude', 'delta_rms', 'gpt2_surprisal', 'word_length', 'pitch', 'volume', 'frame_brightness', 'global_flow', 'local_flow', 'delta_volume', 'gpt2_surprisal', 'word_length', 'enhanced_pitch', 'enhanced_volume', 'delta_enhanced_pitch', 'delta_enhanced_volume', 'raw_pitch', 'raw_volume', 'delta_raw_pitch', 'delta_raw_volume', 'onset', 'speech', 'face_num', 'word_gap', 'word_index', 'bin_head', 'pos', 'word_head_pos', 'word_part_speech'], not scene_onset

In [5]:
# Optionally, you can request the output_dict=True to get the data as a dictionary with a bunch of metadata.
dataset.output_dict = True
print(dataset[0])

dataset.output_dict = False # set it back

{'data': tensor([[-34.0279, -33.4962, -32.4328,  ...,  21.0016,  22.0649,  22.0649],
        [ 10.6337,   9.8362,   9.3045,  ..., -34.0279, -33.4962, -32.6987],
        [ 59.2829,  59.2829,  63.0047,  ..., -26.0526, -25.2551, -23.6600],
        ...,
        [-42.5348, -42.5348, -43.3324,  ...,  -4.2535,  -5.3169,  -4.5193],
        [-16.2164, -14.8872, -14.8872,  ..., -18.3431, -16.2164, -15.9506],
        [ -0.7975,   0.2658,  -3.1901,  ...,   1.3292,   4.5193,   4.5193]]), 'label': 1, 'electrode_labels': ['T1cIe5', 'T1cIe6', 'T1cIe7', 'T1cIe8', 'T1cIe9', 'T1cIe10', 'T1cIe11', 'T1cIe12', 'T1b1', 'T1b2', 'T1b3', 'T1b4', 'T1b5', 'T1b6', 'T1aIc3', 'T1aIc4', 'T1aIc5', 'T1aIc6', 'O1aIb9', 'O1aIb10', 'O1aIb11', 'O1aIb12', 'O1aIb13', 'O1aIb14', 'O1aIb15', 'O1aIb16', 'F3d1', 'F3d2', 'F3d3', 'F3d4', 'F3d5', 'F3d6', 'F3d7', 'F3d8', 'F3d9', 'F3d10', 'F3aOF1', 'F3aOF2', 'F3aOF3', 'F3aOF4', 'F3aOF5', 'F3aOF6', 'F3aOF7', 'F3aOF8', 'F3c1', 'F3c2', 'F3c3', 'F3c4', 'F3c5', 'F3c6', 'F2Ia1', 'F2Ia2', 'F

In [6]:
# Also, you can request only the indices into the neural data array, instead of the actual data.
# NOTE: These are the indices into the data as in the raw h5 files in the braintreebank dataset.

dataset.output_indices = True
print(dataset[0]) # Data format: (index_from, index_to), label

dataset.output_indices = False # set it back

((848237, 850285), 1)


## Train/Test Splits

In this example, we generate train/test splits for the WithinSession evaluation.

All options: generate_splits_within_session, generate_splits_cross_session, generate_splits_cross_subject

In [9]:
import neuroprobe.train_test_splits as neuroprobe_train_test_splits

folds = neuroprobe_train_test_splits.generate_splits_within_session(subject, trial_id, eval_name, dtype=torch.float32, 
                                                                                # Put the dataset parameters here
                                                                                output_indices=output_indices, start_neural_data_before_word_onset=start_neural_data_before_word_onset, end_neural_data_after_word_onset=end_neural_data_after_word_onset,
                                                                                lite=True)
print("len(folds) = k_folds =", len(folds))
folds

len(folds) = k_folds = 2


[{'train_dataset': <torch.utils.data.dataset.Subset at 0x205dc62fe80>,
  'test_dataset': <torch.utils.data.dataset.Subset at 0x205dc62e9b0>},
 {'train_dataset': <torch.utils.data.dataset.Subset at 0x205dc62eb30>,
  'test_dataset': <torch.utils.data.dataset.Subset at 0x205dc62fc40>}]

## Example Linear Regression on SS_SM

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

for fold_idx, fold in enumerate(folds):
    print(f"Fold {fold_idx+1} of {len(folds)}")
    train_dataset = fold["train_dataset"]
    test_dataset = fold["test_dataset"]

    # Convert PyTorch dataset to numpy arrays for scikit-learn
    X_train = np.array([item[0].flatten() for item in train_dataset])
    y_train = np.array([item[1] for item in train_dataset])
    X_test = np.array([item[0].flatten() for item in test_dataset])
    y_test = np.array([item[1] for item in test_dataset])

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train logistic regression
    clf = LogisticRegression(random_state=42, max_iter=1000, tol=1e-3)
    clf.fit(X_train, y_train)

    # Evaluate model
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(f"\t Train accuracy: {train_score:.3f} | Test accuracy: {test_score:.3f}")

Fold 1 of 2
	 Train accuracy: 1.000 | Test accuracy: 0.618
Fold 2 of 2
	 Train accuracy: 1.000 | Test accuracy: 0.613


In [21]:
y_test

array([1, 0, 1, ..., 0, 1, 0])

In [20]:
np.log10(X_train.size * 8)

9.494884485922654