In [361]:
### Analysis steps

# 1. For each neuron, calculate variance explained by the stimulus across repeats. Plot the SRV distribution for all neurons.

# 2. Shuffle responses (e.g., 100 random trials) to compute a baseline SRV. 

# 3. Keep neurons whose SRV is in the top 90th percentile of the shuffle distribution.

# 4. Apply PCA to SimCLR representations and/or raw image data

# 5. Choose a subset of principal components that explain a large proportion of the variance.

# 6. Ensure the number of features is not vastly greater than the number of neurons to reduce overfitting.

# 7. 80% training, 10% validation, 10% testing

# 8. Training: Learn weights for regression

# 9. Validation: Optimize the regularization parameter

# 10. Testing: Evaluate the final model

# 11. Predict the response of each neuron using SimCLR features as input.

# 12. Train one model per neuron (Input: SimCLR features (e.g., 512 features for final layer; Output: Neural response (scalar value for that neuron))

# 13. Aggregate results to evaluate overall prediction accuracy

In [None]:
### Load data

import numpy as np
from os import path

# imresps.npy is of shape (1573, 2, 15363), where 1573 is number of images, 2 repeats each, and 15363 neurons recorded
# stimids.npy has the image id (matching the image dataset ~selection1866~) for each stimulus number, 
# so of you want to see what image was presented on imresps[502] you would check stim_ids[502]

PATH_TO_DATA = '../../data/neural'

imresps = np.load(path.join(PATH_TO_DATA, 'imresps.npy'))
stimids = np.load(path.join(PATH_TO_DATA, 'stimids.npy'))

print(imresps.shape) # (1573, 2, 15363)
print(stimids.shape) # (1573,)

In [363]:
def compute_signal_related_variance(resp_a, resp_b, mean_center=True):
    """
    compute the fraction of signal-related variance for each neuron,
    as per Stringer et al Nature 2019. Cross-validated by splitting
    responses into two halves. Note, this only is "correct" if resp_a
    and resp_b are *not* averages of many trials.

    Args:
        resp_a (ndarray): n_stimuli, n_cells
        resp_b (ndarray): n_stimuli, n_cells

    Returns:
        fraction_of_stimulus_variance: 0-1, 0 is non-stimulus-caring, 1 is only-stimulus-caring neurons
        stim_to_noise_ratio: ratio of the stim-related variance to all other variance
    """
    if len(resp_a.shape) > 2:
        # if the stimulus is multi-dimensional, flatten across all stimuli
        resp_a = resp_a.reshape(-1, resp_a.shape[-1])
        resp_b = resp_b.reshape(-1, resp_b.shape[-1])
    ns, nc = resp_a.shape
    if mean_center:
        # mean-center the activity of each cell
        resp_a = resp_a - resp_a.mean(axis=0)
        resp_b = resp_b - resp_b.mean(axis=0)
    
    # compute the cross-trial stimulus covariance of each cell
    # dot-product each cell's (n_stim, ) vector from one half
    # with its own (n_stim, ) vector on the other half

    covariance = (resp_a * resp_b).sum(axis=0) / ns

    # compute the variance of each cell across both halves
    resp_a_variance = (resp_a**2).sum(axis=0) / ns
    resp_b_variance = (resp_b**2).sum(axis=0) / ns
    total_variance = (resp_a_variance + resp_b_variance) / 2

    if np.any(total_variance < 1e-12):
        print(f"Warning: Near-zero total variance for neurons: {np.where(total_variance < 1e-12)[0]}")

    # compute the fraction of the total variance that is
    # captured in the covariance
    fraction_of_stimulus_variance = covariance / total_variance

    # if you want, you can compute SNR as well:
    stim_to_noise_ratio = fraction_of_stimulus_variance / (
        1 - fraction_of_stimulus_variance
    )

    return fraction_of_stimulus_variance, stim_to_noise_ratio

In [None]:
### Filter neurons based on SRV

import matplotlib.pyplot as plt

# 1. First compute the real SRV for each neuron

# for each stimulus, randomly assign each repeat to spilt a or split b
split_A, split_B = [], []
for responses in imresps:
    indices = np.random.permutation(2)
    split_A.append(responses[indices[0]])
    split_B.append(responses[indices[1]])

split_A = np.vstack(split_A)
split_B = np.vstack(split_B)

real_srv_all_neurons, stim_to_noise_ratio = compute_signal_related_variance(split_A, split_B)
print(real_srv_all_neurons)

print('Image responses:', imresps.shape) # (1573, 2, 15363)
print('Image responses split A:', split_A.shape) # (1573, 15363)
print('Image responses split B:', split_B.shape) # (1573, 15363)
print('SRV:', real_srv_all_neurons.shape) # (15363,)
print('SNR:', stim_to_noise_ratio.shape) # (15363,)

# Plot SRV distribution
plt.hist(real_srv_all_neurons, bins=50, color='blue', alpha=0.7)
plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
plt.ylabel("Number of Neurons")
plt.title("Real SRV: SRV Across Neurons")
plt.show()

# 2. Compute null distribution of SRV values for all neurons
# Image responses: (1573, 2, 15363)
num_stimuli = imresps.shape[0] # 1573
num_repeats = imresps.shape[1] # 2
num_neurons = imresps.shape[2] # 15363
n_shuffles = 100

# shape (n_shuffles, num_neurons)
null_srv_all_neurons = []

for _ in range(n_shuffles):
    # Shuffle stimulus indices
    shuffled_indices = np.random.permutation(num_stimuli)
    shuffled_resps = imresps[shuffled_indices, :, :]  # Shuffle stimulus order

    # Split into two groups, maintaining random assignments across stimuli
    split_A = shuffled_resps[:, 0, :] # First repeat of shuffled stimuli
    split_B = shuffled_resps[:, 1, :] # Second repeat of shuffled stimuli

    # Compute SRV for the shuffled data - returns SRV for each neuron - shape (15363,)
    fraction_of_stimulus_variance, _ = compute_signal_related_variance(split_A, split_B)
    null_srv_all_neurons.append(fraction_of_stimulus_variance)

# Convert null distribution to numpy array for easier indexing
# shape (n_shuffles, num_neurons) - (100, 15363) - each value is the SRV for a neuron in a shuffle
null_srv_all_neurons = np.array(null_srv_all_neurons)
print(null_srv_all_neurons.shape)

# e.g. if neuron_index = 0, it will plot the SRV value for neuron 0 across all shuffles
neuron_index = 0
plt.hist([srv[neuron_index] for srv in null_srv_all_neurons], bins=100, color='blue', alpha=0.7)
plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
plt.ylabel("Number of Shuffles")
plt.title(f"Null Distribution of SRV for Neuron {neuron_index}")
plt.show()

# 3. Now filter our neurons whose real SRV is in the top 90th percentile of its null distribution

top_90th_percentile_null = np.percentile(null_srv_all_neurons, 90, axis=0)

# reliable_neurons contains the indices of neurons whose real SRV is statistically significant
reliable_neurons = np.where(real_srv_all_neurons >= top_90th_percentile_null)[0]
print('Filtered neurons in top 90th percentile of null distribution:', len(reliable_neurons))
print(reliable_neurons)

# Just check that the real SRV is greater than the null 90th percentile for all reliable neurons
for neuron in reliable_neurons:
    real_srv = real_srv_all_neurons[neuron]
    null_90th = top_90th_percentile_null[neuron]
    print(f"Neuron {neuron}: Real SRV = {real_srv}, Null 90th Percentile = {null_90th}")
    assert real_srv >= null_90th, f"Neuron {neuron} failed the check!"

# Plot for the first 5 reliable neurons
for neuron in reliable_neurons[:5]:
    plt.hist(null_srv_all_neurons[:, neuron], bins=100, color='blue', alpha=0.7, label='Null Distribution')
    plt.axvline(real_srv_all_neurons[neuron], color='red', linestyle='dashed', linewidth=2, label='Real SRV')
    plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
    plt.ylabel("Number of Shuffles")
    plt.title(f"Neuron {neuron}: Real SRV vs Null Distribution")
    plt.legend()
    plt.show()

In [None]:
### The SRV values are looking strange, need to sanity check the logic

# Null SRV shape: (100, 15363) - good
print("Null SRV shape:", null_srv_all_neurons.shape)

# Here the SRV values are all the same across all shuffles for the neuron, which is weird
# Null SRV for neuron 12: [0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232 0.00635232 0.00635232
#  0.00635232 0.00635232 0.00635232 0.00635232]
print("Null SRV for neuron 12:", null_srv_all_neurons[:, 12])

# Is shuffling logic correct? Indices should be different for each shuffle - looks correct
# Shuffle 0 indices (first 10): [ 240   85 1260  177  940  994  718  528  186  194]
# Shuffle 1 indices (first 10): [1038  925  615  186  938  793  558  454 1409  771]
# Shuffle 2 indices (first 10): [ 964  963  198  307   26 1055 1500  151  909 1486]
# Shuffle 3 indices (first 10): [1086 1040  964 1403 1093  175  340  262  773  783]
# Shuffle 4 indices (first 10): [ 637  230  958  787  392 1564  469  156 1407 1479]
for i in range(5):
    shuffled_indices = np.random.permutation(num_stimuli)
    print(f"Shuffle {i} indices (first 10): {shuffled_indices[:10]}")

shuffled_indices = np.random.permutation(num_stimuli)
shuffled_resps = imresps[shuffled_indices, :, :]

split_A = shuffled_resps[:, 0, :]  # First repeat
split_B = shuffled_resps[:, 1, :]  # Second repeat

# The two splits do seem to have unique values - looks correct
# Split B (first 5 neurons, first 5 stimuli): [[0.         0.6505735  0.05245936 0.2740306  0.10461244]
#  [0.01191196 0.         0.         0.         0.69875743]
#  [0.0321094  0.053065   0.63204853 0.09543178 0.81210066]
#  [0.68243115 0.         1.5167197  0.11282894 0.23435783]
#  [0.         0.09517568 0.00289441 0.42277867 0.24733859]]
# Split A (first 5 neurons, first 5 stimuli): [[1.02488546 0.1708228  0.38181075 0.5764056  0.08896205]
#  [1.94878199 1.01675761 1.0019273  0.22494237 0.        ]
#  [0.07141742 0.00980155 0.         0.63551881 0.39534339]
#  [0.77388581 0.31061662 1.48310569 0.19047658 0.        ]
#  [0.         0.68964413 0.         0.36437469 0.30127275]]
print("Split B (first 5 neurons, first 5 stimuli):", split_B[:5, :5])
print("Split A (first 5 neurons, first 5 stimuli):", split_A[:5, :5])

shuffled_indices = np.random.permutation(num_stimuli)
shuffled_resps = imresps[shuffled_indices, :, :]

split_A = shuffled_resps[:, 0, :]
split_B = shuffled_resps[:, 1, :]

# These values look strange - SRV values for shuffled data are negative or very close to zero
# [-0.02816453 -0.00743509 -0.03484814  0.00073648 -0.02779006  0.00868213 -0.01656676 -0.00235512 -0.05268748 -0.03571497]
test_srv, _ = compute_signal_related_variance(split_A, split_B)
print("Test SRV (first 10 neurons):", test_srv[:10])

# Let's test the compute_signal_related_variance function with some dummy data
split_A_test = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])  # Shape (3, 3)
split_B_test = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])  # Identical to split_A

# This does look correct, but there is a warning
# Test SRV (synthetic data): [1. 1. 1.]
# RuntimeWarning: divide by zero encountered in divide
  # stim_to_noise_ratio = fraction_of_stimulus_variance / (
test_srv, _ = compute_signal_related_variance(split_A_test, split_B_test)
print("Test SRV (synthetic data):", test_srv)

# Solution: I think in the shuffling when computing the null distributions (see copied code below),
# it shuffles the stimuli indices, but the 2 repeats for each stimuli always remain with their correct stimulus. 
# Thus e.g. split one contains repeat 1 for stimulus 1, and split two contains repeat 2 for stimulus 1,
# such that for each neuron, the response in split one for stimulus 1 (repeat 1) is still correlated 
# with the response in split two for stimulus 1 (repeat 2)
# "the two repeats for each stimulus remain paired to their original stimulus, even after shuffling the stimulus 
# indices. This means that responses in split_A and split_B for the same stimulus are still correlated, even though 
# the stimulus order has been shuffled globally. This defeats the purpose of the shuffle in creating a proper 
# null distribution because it preserves the correlation structure between the two splits for each stimulus."
shuffled_indices = np.random.permutation(num_stimuli)
shuffled_resps = imresps[shuffled_indices, :, :]  # Shuffle stimulus order

split_A = shuffled_resps[:, 0, :] # First repeat of shuffled stimuli
split_B = shuffled_resps[:, 1, :] # Second repeat of shuffled stimuli


In [None]:
### 1. Compute the null distribution of SRV values for all neurons (let's start from scratch and fix the shuffling logic)

# Am I right in thinking that, based on the compute_signal_related_variance function, the entire point is that 
# it should compare the neural responses to stimuli in split A with neural responses to stimuli in split B? 
# So at index N, the item in each split should relate to the SAME stimulus? So to compute the real SRV, we would do that.
# But to compute the null SRV, we DON'T want this. 
# We want: at index N, the item in each split relate to DISTINCT stimuli?

# Chatty:
# To compute the actual Signal-Related Variance (SRV), you need to compare responses to the same stimulus across repeats.
# This measures how consistent a neuron’s responses are to the same stimulus across two independent presentations (repeats)
# For split_A and split_B, at index 𝑁, the responses in both splits should correspond to repeat 1 and repeat 2 of the same stimulus 𝑁.
# To compute a null distribution of SRV values that reflects the variance when there is no true relationship between the stimulus and the neural responses.
# By randomizing the pairing of responses to stimuli, you break the relationship between the repeats for the same stimulus. 
# This ensures any observed SRV is due to chance, not a systematic stimulus-related response.
# For split_A and split_B, at index 𝑁, the responses in the two splits should correspond to different stimuli. 
# This breaks the natural relationship between repeats for the same stimulus.

# imresps shape = (1573, 2, 15363)
# responses in imresps shape = (2, 15363)

# 1. Compute the null distribution of SRV values for all neurons

null_srv_all_neurons = [] # shape (n_shuffles, num_neurons)

for _ in range(n_shuffles):
    # Shuffle stimulus indices *twice* to create two independent splits!
    shuffled_indices_A = np.random.permutation(num_stimuli)
    shuffled_indices_B = np.random.permutation(num_stimuli)

    # Now for the splits, we can just use fixed repeat indices, 
    # because for each split, at index N the responses correspond to different stimuli
    # e.g. split_A = [ stim_100_repeat_1, stim_2_repeat_1, stim_19_repeat_1, ... ]
    # e.g. split_B = [ stim_543_repeat_2, stim_345_repeat_2, stim_3_repeat_2, ... ]
    split_A = imresps[shuffled_indices_A, 0, :]
    split_B = imresps[shuffled_indices_B, 1, :]

    # Compute SRV for the shuffled data
    fraction_of_stimulus_variance, _ = compute_signal_related_variance(split_A, split_B)
    null_srv_all_neurons.append(fraction_of_stimulus_variance)

null_srv_all_neurons = np.array(null_srv_all_neurons)
null_srv_all_neurons.shape # (100, 15363)

print(null_srv_all_neurons[0])
print(null_srv_all_neurons[33])

# e.g. if neuron_index = 0, it will plot the SRV value for neuron 0 across all shuffles
neuron_index = 0
plt.hist([srv[neuron_index] for srv in null_srv_all_neurons], bins=100, color='blue', alpha=0.7)
plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
plt.ylabel("Number of Shuffles")
plt.title(f"Null Distribution of SRV for Neuron {neuron_index}")
plt.show()

In [None]:
### 2. Compute the real SRV for each neuron

split_A_real = imresps[:, 0, :] # First repeat for each stimulus
split_B_real = imresps[:, 1, :] # Second repeat for each stimulus

print(split_A_real.shape) # (1573, 15363)
print(split_B_real.shape) # (1573, 15363)

# Compute SRV for real data
real_srv_all_neurons, stim_to_noise_ratio = compute_signal_related_variance(split_A_real, split_B_real)

print(real_srv_all_neurons)
print(stim_to_noise_ratio)

print("Real SRV shape:", real_srv_all_neurons.shape) # Should be (15363,)

In [None]:
### 3. Filter neurons whose real SRV is in the top 90th percentile of its null distribution

# e.g. real_srv_all_neurons[12] = 0.25 (the real SRV for neuron 12)
# top_90th_percentile_null[12] = 0.20 (90th percentile of the null SRV for neuron 12)
# so the responses of neuron 12 are considered statistically significant at a 10% significance level (p < 0.1).
# It is considered to have stimulus-related variance that is unlikely to arise by chance
# Which means that the way neuron 12 responds to the stimulus is not random, but likely due to the stimulus

# This gives the 90th percentile of the null distribution for each neuron
# In other words the threshold for each neuron to be considered reliable
# e.g. if neuron 0 has a null distribution of SRVs [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], the threshold would be 0.9
top_90th_percentile_null = np.percentile(null_srv_all_neurons, 90, axis=0)
print(top_90th_percentile_null) # [0.03651716 0.03126347 0.03325775 ... 0.02738261 0.03546677 0.0333109 ]

# Get indices of reliable neurons
reliable_neuron_indices = np.where(real_srv_all_neurons >= top_90th_percentile_null)[0]

# Print results
print(f"Number of reliable neurons: {len(reliable_neuron_indices)}") # 5654
print(f"Indices of reliable neurons: {reliable_neuron_indices}") # [   14    29    48 ... 15357 15358 15360]