In [2]:
import numpy as np
from os import path
import matplotlib.pyplot as plt

In [None]:
### 1.1. Load data

# imresps.npy: neural responses of shape (1573, 2, 15363) = (images, repeats per image, recorded neurons).
# stimids.npy: for each stimulus number, the id of the presented image in the image
# dataset (selection1866). E.g. to see which image produced imresps[502], look up stimids[502].

PATH_TO_DATA = '../../data/neural'

# Load both arrays from the same directory in one pass
imresps, stimids = (
    np.load(path.join(PATH_TO_DATA, array_file))
    for array_file in ('imresps.npy', 'stimids.npy')
)

print(imresps.shape)  # (1573, 2, 15363)
print(stimids.shape)  # (1573,)

In [4]:
def compute_signal_related_variance(resp_a, resp_b, mean_center=True):
    """
    Compute the fraction of signal-related variance (SRV) for each neuron,
    as per Stringer et al Nature 2019. Cross-validated by splitting
    responses into two halves. Note, this only is "correct" if resp_a
    and resp_b are *not* averages of many trials.

    Args:
        resp_a (ndarray): (n_stimuli, n_cells); arrays with more dimensions are
            flattened over every axis except the last (cells).
        resp_b (ndarray): same shape as resp_a, the other half of the data.
        mean_center (bool): subtract each cell's mean (per half) before computing
            the covariance and variances.

    Returns:
        fraction_of_stimulus_variance: per cell, the cross-half covariance divided by
            the mean of the two within-half variances (about 0 for non-stimulus-caring
            cells, about 1 for only-stimulus-caring cells). Cells whose total variance
            is zero yield NaN/inf; they are reported in a printed warning.
        stim_to_noise_ratio: ratio of the stim-related variance to all other variance,
            i.e. fraction / (1 - fraction).

    Raises:
        ValueError: if resp_a and resp_b differ in shape, or have fewer than 2 dimensions.
    """
    resp_a = np.asarray(resp_a)
    resp_b = np.asarray(resp_b)

    # Without this check numpy would silently broadcast e.g. (1, n_cells) against
    # (n_stimuli, n_cells) and return a meaningless result.
    if resp_a.shape != resp_b.shape:
        raise ValueError(
            f"resp_a and resp_b must have the same shape, got {resp_a.shape} and {resp_b.shape}"
        )
    if resp_a.ndim < 2:
        raise ValueError(
            f"expected responses of shape (n_stimuli, n_cells), got {resp_a.shape}"
        )

    if resp_a.ndim > 2:
        # if the stimulus is multi-dimensional, flatten across all stimuli
        n_cells = resp_a.shape[-1]
        resp_a = resp_a.reshape(-1, n_cells)
        resp_b = resp_b.reshape(-1, n_cells)
    ns = resp_a.shape[0]

    if mean_center:
        # mean-center the activity of each cell
        resp_a = resp_a - resp_a.mean(axis=0)
        resp_b = resp_b - resp_b.mean(axis=0)

    # cross-trial stimulus covariance of each cell: dot-product each cell's
    # (n_stim, ) vector from one half with its own (n_stim, ) vector on the other half
    covariance = (resp_a * resp_b).sum(axis=0) / ns

    # variance of each cell, averaged across both halves
    resp_a_variance = (resp_a**2).sum(axis=0) / ns
    resp_b_variance = (resp_b**2).sum(axis=0) / ns
    total_variance = (resp_a_variance + resp_b_variance) / 2

    near_zero_variance = total_variance < 1e-12
    if np.any(near_zero_variance):
        print(f"Warning: Near-zero total variance for neurons: {np.where(near_zero_variance)[0]}")

    # The explicit warning above already reports degenerate cells, so silence numpy's
    # own divide-by-zero / invalid-value warnings for the two divisions below.
    with np.errstate(divide='ignore', invalid='ignore'):
        # fraction of the total variance that is captured in the covariance
        fraction_of_stimulus_variance = covariance / total_variance

        # signal-to-noise ratio derived from the same fraction
        stim_to_noise_ratio = fraction_of_stimulus_variance / (
            1 - fraction_of_stimulus_variance
        )

    return fraction_of_stimulus_variance, stim_to_noise_ratio

In [None]:
### 2.1. Compute the null distribution of SRV values for all neurons

# TODO: double check INDEXING (images, cells)

# imresps has shape (num_stimuli, num_repeats, num_neurons) = (1573, 2, 15363);
# imresps[i] holds both repeats for stimulus i, shape (2, 15363)
num_stimuli, num_repeats, num_neurons = imresps.shape
n_shuffles = 100

# Seeded generator: the null distribution (and the reliability thresholds derived
# from it) is then identical on every Restart & Run All.
null_rng = np.random.default_rng(0)

null_srv_all_neurons = []  # becomes an array of shape (n_shuffles, num_neurons)

for _ in range(n_shuffles):
    # Shuffle stimulus indices *twice* to create two independent splits!
    shuffled_indices_A = null_rng.permutation(num_stimuli)
    shuffled_indices_B = null_rng.permutation(num_stimuli)

    # Fixed repeat indices are fine here: at any row N the two splits hold
    # responses to (almost always) different stimuli.
    # e.g. split_A = [ stim_100_repeat_1, stim_2_repeat_1, stim_19_repeat_1, ... ]
    # e.g. split_B = [ stim_543_repeat_2, stim_345_repeat_2, stim_3_repeat_2, ... ]
    split_A = imresps[shuffled_indices_A, 0, :]
    split_B = imresps[shuffled_indices_B, 1, :]

    # Compute SRV for the shuffled data (the SNR output is not needed for the null)
    fraction_of_stimulus_variance = compute_signal_related_variance(split_A, split_B)[0]
    null_srv_all_neurons.append(fraction_of_stimulus_variance)

null_srv_all_neurons = np.array(null_srv_all_neurons)
assert null_srv_all_neurons.shape == (n_shuffles, num_neurons)  # (100, 15363)

print(null_srv_all_neurons[0])
print(null_srv_all_neurons[33])

# e.g. if neuron_index = 0, this plots the SRV value of neuron 0 across all shuffles
neuron_index = 0
plt.hist(null_srv_all_neurons[:, neuron_index], bins=100, color='blue', alpha=0.7)
plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
plt.ylabel("Number of Shuffles")
plt.title(f"Null Distribution of SRV for Neuron {neuron_index}")
plt.show()

In [None]:
### 2.2. Compute the real SRV for each neuron

# TODO: Question for Ali: why can't we just split like this?
# split_A_real = imresps[:, 0, :] # First repeat for each stimulus
# split_B_real = imresps[:, 1, :] # Second repeat for each stimulus

# For every stimulus, randomly decide which of its two repeats goes to split A;
# the other repeat goes to split B. Seeded so the split is reproducible.
split_rng = np.random.default_rng(1)
stimulus_rows = np.arange(imresps.shape[0])
repeat_for_split_A = split_rng.integers(0, 2, size=imresps.shape[0])  # 0 or 1 per stimulus

split_A = imresps[stimulus_rows, repeat_for_split_A]      # Shape: (n_stimuli, n_neurons)
split_B = imresps[stimulus_rows, 1 - repeat_for_split_A]  # Shape: (n_stimuli, n_neurons)

# Compute SRV for real data
real_srv_all_neurons, stim_to_noise_ratio = compute_signal_related_variance(split_A, split_B)

print(real_srv_all_neurons)
print(stim_to_noise_ratio)

print("Real SRV shape:", real_srv_all_neurons.shape) # Should be (15363,)

# One SRV value per neuron, so the histogram counts neurons (not shuffles)
plt.hist(real_srv_all_neurons, bins=100, color='blue', alpha=0.7)
plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
plt.ylabel("Number of Neurons")
plt.title("Distribution of Real SRV Across All Neurons")
plt.show()

In [None]:
### 2.3. Filter neurons whose real SRV reaches the 99th percentile of their own null distribution

# Per-neuron reliability threshold: the 99th-percentile SRV value of that neuron's null distribution.
# e.g. if neuron 0 has a null distribution of SRVs across 10 shuffles
# [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], its threshold would be ~0.991
# (np.percentile interpolates linearly between the two highest values)
top_99th_percentile_null = np.percentile(null_srv_all_neurons, 99, axis=0)
print(top_99th_percentile_null) # e.g. [0.03651716 0.03126347 0.03325775 ... 0.02738261 0.03546677 0.0333109 ]

# Get indices of reliable neurons
reliable_neuron_indices = np.where(real_srv_all_neurons >= top_99th_percentile_null)[0]

# Print results
print(f"Number of reliable neurons: {len(reliable_neuron_indices)}") # e.g. 5654
print(f"Indices of reliable neurons: {reliable_neuron_indices}") # e.g. [   14    29    48 ... 15357 15358 15360]

# Reuse the bin edges of the all-neuron histogram for the reliable subset, so the
# two overlaid histograms are directly comparable bar by bar.
_, srv_bin_edges, _ = plt.hist(real_srv_all_neurons, bins=100, color='red', alpha=0.7, label="All neurons")
plt.hist(real_srv_all_neurons[reliable_neuron_indices], bins=srv_bin_edges, color='blue', alpha=0.7, label="Reliable neurons")
plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
plt.ylabel("Number of Neurons")
plt.title("All Neurons: SRV all vs. SRV reliable")
plt.legend()
plt.show()

plt.hist(real_srv_all_neurons[reliable_neuron_indices], bins=100, color='blue', alpha=0.7)
plt.xlabel("Fraction of Stimulus-Related Variance (SRV)")
plt.ylabel("Number of Neurons")
plt.title("SRV Distribution for Reliable Neurons")
plt.show()

In [None]:
### 3.1. Load and preprocess images

import os
from scipy.io import loadmat
import matplotlib.pyplot as plt
import numpy as np
from torchvision.transforms import Normalize, Compose, Resize, CenterCrop
import torch
from torch.utils.data import TensorDataset
from torchvision import utils as torch_utils
 
PATH_TO_DATA = '../../data/selection1866'

# Every .mat file in the image directory, in sorted filename order
file_list = sorted([entry for entry in os.listdir(PATH_TO_DATA) if entry.endswith('.mat')])
# Integer image ids, one per stimulus number of the neural data
stim_ids = stimids.astype(int)

print(stim_ids)
print(stimids)

# TODO: run tile 1 and 2 through model separately + concat feature reps (no crop, pad)
preprocessing_steps = [
    Resize(96),  # resize so the shorter edge is 96 pixels
    CenterCrop((96, 96)),  # keep the central (96, 96) window
    # !! Normalize expects input already in [0, 1]; with mean=std=0.5 it maps that range to [-1, 1]
    Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
transform = Compose(preprocessing_steps)

img_tensors, labels = [], []

# Preview only: printing the whole directory listing would flood the cell output
print(f"Found {len(file_list)} .mat files, e.g. {file_list[:5]}")

# we have 1866 images here, but the neural response data only uses 1573 of them
# because some ~300 images didn't have two repeats, so were disposed
# therefore we filter the full set here so that we only use the relevant 1573
for stim_id in stim_ids:
    filename = 'img' + str(stim_id) + '.mat'
    data = loadmat(os.path.join(PATH_TO_DATA, filename))

    img = data['img'][:, :500] # Take leftmost part of the image (first 500 columns)
    rgb_img = np.stack([img] * 3, axis=-1) # Convert grayscale to RGB for SimCLR
    tensor = torch.tensor(rgb_img, dtype=torch.float32).permute(2, 0, 1) # Shape (C, H, W)

    # Min-max scale the tensor to [0, 1]
    tensor_min = tensor.min()
    tensor_max = tensor.max()
    value_range = tensor_max - tensor_min
    if value_range > 0:
        tensor = (tensor - tensor_min) / value_range
    else:
        # Constant image: (x - min) / 0 would fill the tensor with NaN, so map it to all zeros instead
        tensor = torch.zeros_like(tensor)

    # Clamp to [0, 1] to ensure no outliers due to numerical precision
    tensor = torch.clamp(tensor, 0.0, 1.0)

    transformed_tensor = transform(tensor) # Normalize and resize for SimCLR
    img_tensors.append(transformed_tensor)
    labels.append(stim_id)

image_dataset = TensorDataset(torch.stack(img_tensors), torch.tensor(labels))

# Unpack the stacked image tensor and the matching stim-id labels from the dataset
images, labels = image_dataset.tensors
print("Processed image labels (stim id):", labels[:30])
print("Stim IDs from neural data:", stim_ids[:30])
print("Processed dataset shape:", images.shape) # (N, C, 96, 96)
print(f"Min pixel value (processed): {torch.min(images)}")
print(f"Max pixel value (processed): {torch.max(images)}")

# Show a sample of processed images: 12 images tiled 6 per row, rescaled for display
sample_grid = torch_utils.make_grid(images[:12], nrow=6, normalize=True, pad_value=0.9)
img_grid = sample_grid.permute(1, 2, 0).numpy()  # (C, H, W) -> (H, W, C) for imshow

sample_fig, sample_ax = plt.subplots(figsize=(10, 5))
sample_ax.set_title('Processed images: sample')
sample_ax.imshow(img_grid)
sample_ax.axis('off')
plt.show()
plt.close(sample_fig)

In [None]:
import matplotlib.pyplot as plt

# Render one raw stimulus (first 500 columns, before any preprocessing) as a sanity check
filename = 'img20.mat'
data = loadmat(os.path.join(PATH_TO_DATA, filename))
img = data['img'][:, :500]

raw_fig, raw_ax = plt.subplots(figsize=(8, 6))
raw_image_artist = raw_ax.imshow(img, cmap='gray')  # Adjust cmap as needed ('viridis', 'jet', etc.)
raw_fig.colorbar(raw_image_artist, ax=raw_ax, label="Pixel Intensity")
raw_ax.set_title("Rendered Image")
raw_ax.axis("off")  # Hide axis for better visualization
plt.show()

In [None]:
### 3.2. Run images through a pretrained SimCLR model and extract features

import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision
from tqdm.notebook import tqdm
from typing import Dict
from torch.utils.data import Dataset
import urllib.request
from urllib.error import HTTPError
from collections import defaultdict

class SimCLR(nn.Module):
    """
    ResNet-18 encoder with a SimCLR projection head, plus helpers to load the
    pretrained checkpoint and to extract final-layer and intermediate-layer features.
    """

    def __init__(self, hidden_dim=128):
        super().__init__()

        # Randomly initialised ResNet-18 backbone; the real weights are loaded later from the
        # SimCLR checkpoint file. resnet18() defaults to no pretrained weights, which avoids
        # passing the deprecated `pretrained=` keyword.
        self.convnet = torchvision.models.resnet18()

        # This is the projection head, only needed during training. For downstream tasks it is disposed of
        # and the final linear layer output is used (Chen et al., 2020)
        self.convnet.fc = nn.Sequential(
            nn.Linear(self.convnet.fc.in_features, 4 * hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(4 * hidden_dim, hidden_dim)
        )

        self.intermediate_layers_to_capture = []
        self.intermediate_layer_features = {}
        # Handles of the currently registered forward hooks, kept so they can be removed again
        self._hook_handles = []
        # os.cpu_count() may return None; DataLoader needs an int, so fall back to 0 (load in main process)
        self.num_workers = os.cpu_count() or 0
        self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

    def load_pretrained(self):
        """
        Load pretrained SimCLR weights, downloading the checkpoint first if it is
        not already present under ../../models.

        Raises:
            HTTPError: if the checkpoint has to be downloaded and the download fails.
        """
        base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial17/"
        models_dir = "../../models"
        pretrained_simclr_filename = "SimCLR.ckpt"
        pretrained_simclr_path = os.path.join(models_dir, pretrained_simclr_filename)
        os.makedirs(models_dir, exist_ok=True)

        # Check whether the pretrained model file already exists locally. If not, try downloading it
        file_url = base_url + pretrained_simclr_filename
        if os.path.isfile(pretrained_simclr_path):
            print(f"Already downloaded pretrained model: {file_url}")
        else:
            print(f"Downloading pretrained SimCLR model {file_url}...")
            try:
                urllib.request.urlretrieve(file_url, pretrained_simclr_path)
            except HTTPError as e:
                print("Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", e)
                # Re-raise: continuing would only fail later in torch.load with a less helpful error
                raise

        # Load pretrained model
        checkpoint = torch.load(pretrained_simclr_path, map_location=self.device)
        self.load_state_dict(checkpoint['state_dict'])
        self.to(self.device)
        self.eval()

    def set_intermediate_layers_to_capture(self, layers):
        """
        Register forward hooks that capture the output of the given convnet layers.

        Any hooks registered by a previous call are removed first, so calling this
        method repeatedly does not stack duplicate hooks.

        Args:
            layers: names of convnet modules as reported by `self.convnet.named_modules()`
                (e.g. 'layer1'). Names that are not top-level blocks trigger a printed warning.

        Raises:
            KeyError: if a name does not match any convnet module. Nothing is registered in that case.
        """
        layers = list(layers)

        # Just check the layers specified are actually in the convnet
        top_level_block_layers = [name for name, _ in self.convnet.named_children()]
        if not all(layer in top_level_block_layers for layer in layers):
            print('You have specified convnet layers that are not top-level blocks - make sure your layer names are valid')

        # Resolve every name before touching any hook, so a bad name cannot leave the
        # model with only some of the requested hooks installed.
        modules_by_name = dict(self.convnet.named_modules())
        unknown_layers = [layer for layer in layers if layer not in modules_by_name]
        if unknown_layers:
            raise KeyError(f"Unknown convnet layer name(s): {unknown_layers}")

        # Drop hooks left over from an earlier call
        for handle in self._hook_handles:
            handle.remove()
        self._hook_handles = []

        self.intermediate_layers_to_capture = layers
        captured_features = {}

        def get_hook(layer_name):
            def hook(module, input, output):
                # Overwritten on every forward pass: always holds the latest batch's output
                captured_features[layer_name] = output.detach()
            return hook

        for layer_name in layers:
            handle = modules_by_name[layer_name].register_forward_hook(get_hook(layer_name))
            self._hook_handles.append(handle)

        self.intermediate_layer_features = captured_features

    @torch.no_grad()
    def extract_features(self, dataset: Dataset) -> Dict[str, torch.Tensor]:
        """
        Run the pretrained SimCLR model on the image data, and capture features from final layer and intermediate layers.

        Note: this permanently replaces the projection head (`self.convnet.fc`) with
        `nn.Identity()`, so after the first call the convnet outputs encoder features directly.

        Args:
            dataset (Dataset): A PyTorch Dataset containing input images and labels. The image data should have shape (N, C, H, W)

        Returns:
            Dict[str, torch.Tensor]: A dictionary containing:
                - Intermediate layer features as tensors.
                - Final layer features under 'final_layer'.
                - Labels under 'labels'.
            Features from a given layer has shape (N, F) where N is num images, F is number of features - flattened version of (C, H, W).
        """
        self.convnet.fc = nn.Identity()  # Removing projection head g(.)
        self.eval()
        self.to(self.device)

        # Encode all images
        data_loader = DataLoader(dataset, batch_size=64, num_workers=self.num_workers, shuffle=False, drop_last=False)
        feats, labels, intermediate_features = [], [], {layer: [] for layer in self.intermediate_layers_to_capture}

        print("✅ Starting feature extraction...")
        print(f"Total batches: {len(data_loader)}")

        for batch_imgs, batch_labels in tqdm(data_loader):
            batch_imgs = batch_imgs.to(self.device)
            batch_feats = self.convnet(batch_imgs)

            feats.append(batch_feats.detach().cpu())
            labels.append(batch_labels)

            # Collect intermediate layer outputs captured by the hooks during this forward pass
            for layer in self.intermediate_layers_to_capture:
                layer_output = self.intermediate_layer_features[layer]
                # Final linear layer outputs a 2d tensor; but intermediate layers don't, so we flatten them (ready for PCA etc.)
                # reshape (rather than view) also copes with non-contiguous outputs
                layer_output_flattened = layer_output.reshape(layer_output.size(0), -1)
                intermediate_features[layer].append(layer_output_flattened.cpu())

        for layer in intermediate_features:
            print(f"🔎 {layer} | Stored {len(intermediate_features[layer])} batches before concatenation")

        # Concatenate results for each layer
        feats = torch.cat(feats, dim=0)
        labels = torch.cat(labels, dim=0)
        intermediate_features = {layer: torch.cat(intermediate_features[layer], dim=0) for layer in self.intermediate_layers_to_capture}

        # Debugging log after concatenation
        print("✅ Feature extraction complete. Final feature shapes:")
        print(f"Final layer: {feats.shape}")
        for layer, feature in intermediate_features.items():
            print(f"{layer}: {feature.shape}")  # Check final stored shape

        return {**intermediate_features, 'final_layer': feats, 'labels': labels}

intermediate_layers = ['layer1', 'layer2', 'layer3', 'layer4']

# Build the model, load the checkpoint, hook the four residual stages and encode every image
sim_clr = SimCLR()
sim_clr.load_pretrained()
sim_clr.set_intermediate_layers_to_capture(intermediate_layers)
feats = sim_clr.extract_features(image_dataset)

# Overall variance of each captured layer (over all images and all features)
for layer in intermediate_layers:
    if layer not in feats:
        continue
    variance = np.var(feats[layer].numpy())
    print(f"{layer} variance: {variance:.6f}")

# One (n_images, n_features) tensor per layer, e.g. layer1: torch.Size([1573, 200704])
layer1_feats, layer2_feats, layer3_feats, layer4_feats = (
    feats[layer_name] for layer_name in intermediate_layers
)
final_layer_feats = feats['final_layer'] # Shape: torch.Size([1573, 512])

print('layer1 shape', layer1_feats.shape)
print('final layer shape', final_layer_feats.shape)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Feature-wise variance (across images) for each intermediate layer
(
    feature_variances_layer1,
    feature_variances_layer2,
    feature_variances_layer3,
    feature_variances_layer4,
) = (
    np.var(single_layer_feats.numpy(), axis=0)
    for single_layer_feats in (layer1_feats, layer2_feats, layer3_feats, layer4_feats)
)

# Overlay one histogram of feature variances per layer
plt.figure(figsize=(10, 6))
all_layer_variances = (
    feature_variances_layer1,
    feature_variances_layer2,
    feature_variances_layer3,
    feature_variances_layer4,
)
for layer_number, layer_feature_variances in enumerate(all_layer_variances, start=1):
    plt.hist(layer_feature_variances, bins=50, alpha=0.5, label=f"Layer {layer_number}")

# Labels and legend
plt.xlabel("Feature Variance")
plt.ylabel("Count")
plt.title("Feature Variance Distribution Across Layers")
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show plot
plt.show()


In [None]:
### 3.3. Apply PCA to SimCLR representations

from sklearn.decomposition import PCA

num_components = 500

# TODO note: cross-validation, and subsampling of each layer, is done in notebooks/003_experiment_images/experiment_images.ipynb
def run_pca(data, num_components=num_components):
    """
    Fit a PCA with `num_components` components on `data`.

    Returns:
        (cumulative_explained_variance_ratio, explained_variance_ratio): two arrays
        with one entry per principal component; the first is the running sum of the second.
    """
    fitted_pca = PCA(n_components=num_components).fit(data)
    per_component_ratio = fitted_pca.explained_variance_ratio_
    return np.cumsum(per_component_ratio), per_component_ratio

print("First 10 labels in SimCLR features:", labels[:10])

# Our original images are grayscale, but SimCLR expects 3-channel RGB input.
# To meet this requirement, we duplicated the grayscale values across all three RGB channels.
# However, for PCA, we only need a single channel, so we extract just the first channel (Red).
# Images were cropped to 96x96 during preprocessing, so each row has 96 * 96 = 9216 pixels.
flattened_images = images[:, 0, :, :].reshape(images.shape[0], -1) # shape: (n_images, 9216)

cumulative_ev_raw, ev_raw = run_pca(flattened_images)
cumulative_ev_layer1, ev_layer1 = run_pca(layer1_feats)
cumulative_ev_layer2, ev_layer2 = run_pca(layer2_feats)
cumulative_ev_layer3, ev_layer3 = run_pca(layer3_feats)
cumulative_ev_layer4, ev_layer4 = run_pca(layer4_feats)
# Use the shared constant (not a second hard-coded 500) so every curve has num_components points
cumulative_ev_final_layer, ev_final_layer = run_pca(final_layer_feats, num_components)

# Plot cumulative explained var vs. # principal components
plot_components = range(1, num_components + 1)
cumulative_curves = [
    (cumulative_ev_raw, "Raw Images", 'o'),
    (cumulative_ev_layer1, "SimCLR Layer 1", 'x'),
    (cumulative_ev_layer2, "SimCLR Layer 2", 'x'),
    (cumulative_ev_layer3, "SimCLR Layer 3", 'x'),
    (cumulative_ev_layer4, "SimCLR Layer 4", 'x'),
    (cumulative_ev_final_layer, "SimCLR Final Layer", 'x'),
]
plt.figure(figsize=(10, 6))
for curve_values, curve_label, curve_marker in cumulative_curves:
    plt.plot(plot_components, curve_values, label=curve_label, marker=curve_marker)
plt.xlabel("# Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA: Cumulative Explained Variance vs. Number of Components")
plt.legend()
plt.text(
    0.5, -0.15,  # X and Y coordinates (axes fraction, i.e. just below the plot)
    "Note: The output of the base encoder's final linear layer is the recommended representation for downstream tasks (Chen et al., 2020).",
    fontsize=10,
    color="gray",
    ha="center",
    va="top",
    transform=plt.gca().transAxes
)
plt.grid(True)
plt.show()

In [None]:
### Plot explained variance ratio (log) against number of principal components (log)

print(f"Min value: {cumulative_ev_final_layer.min()}, Max value: {cumulative_ev_final_layer.max()}")

# Raw component counts; the log scaling is applied to the axes below, not to the data
plot_components = range(1, num_components + 1)
plt.figure(figsize=(10, 6))

# (explained-variance ratios, legend label, line style, colour), drawn in this order
loglog_series = [
    (ev_raw, "Raw Images", "--", 'blue'),
    (ev_final_layer, "SimCLR Final Layer", "-", 'brown'),
    (ev_layer1, "SimCLR Layer 1", "-", 'orange'),
]
for series_values, series_label, series_linestyle, series_color in loglog_series:
    plt.plot(
        plot_components,
        series_values,
        label=series_label,
        marker="x",
        linestyle=series_linestyle,
        linewidth=2,
        color=series_color,
    )

# Use log-log scaling for the axes
plt.xscale('log')
plt.yscale('log')
plt.xlabel("# Principal Components")
plt.ylabel("Explained Variance Ratio")
plt.title("PCA: Explained Variance Ratio vs. Number of Components")
plt.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.legend(loc="best", fontsize="small", frameon=False)
footnote_style = dict(fontsize=10, color="gray", ha="center", va="top", transform=plt.gca().transAxes)
plt.text(
    0.5, -0.15,  # X and Y coordinates
    "Note: The output of the base encoder's final linear layer is the recommended representation for downstream tasks (Chen et al., 2020).",
    **footnote_style
)
plt.tight_layout()
plt.show()

In [None]:
### 3.4. Choose principal components that explain a % of variance e.g. 90%.

# Ensure the number of features is not vastly greater than the number of neurons, to reduce overfitting.
# We have 5654 reliable neurons
def get_num_components_for_variance(cumulative_variance, target_variance=0.9):
    """
    Return the smallest number of leading principal components whose cumulative
    explained-variance ratio reaches `target_variance`.

    Args:
        cumulative_variance: 1-D array-like of cumulative explained-variance ratios,
            one entry per principal component.
        target_variance: fraction of variance to reach (default 0.9).

    Returns:
        int: number of components needed. If even the last entry stays below the
        target, a warning is printed and the total number of available components
        is returned (the most that can be used).
    """
    reached_target = np.asarray(cumulative_variance) >= target_variance

    # np.argmax alone cannot tell "first PC already reaches the target" apart from
    # "no PC reaches the target" (both give index 0), so check explicitly.
    if not reached_target.any():
        print(f"Warning: The computed PCs do not cumulatively explain {target_variance * 100}% variance")
        return int(reached_target.size)

    # argmax returns the index of the first True; +1 converts index -> component count
    return int(np.argmax(reached_target)) + 1

# Get the number of PCs needed to reach the target variance for each representation.
# The target is defined once so the printed messages always match the value actually used.
pc_target_variance = 0.75

num_pcs_raw = get_num_components_for_variance(cumulative_ev_raw, target_variance=pc_target_variance)
num_pcs_layer1 = get_num_components_for_variance(cumulative_ev_layer1, target_variance=pc_target_variance)
num_pcs_layer2 = get_num_components_for_variance(cumulative_ev_layer2, target_variance=pc_target_variance)
num_pcs_layer3 = get_num_components_for_variance(cumulative_ev_layer3, target_variance=pc_target_variance)
num_pcs_layer4 = get_num_components_for_variance(cumulative_ev_layer4, target_variance=pc_target_variance)
num_pcs_final_layer = get_num_components_for_variance(cumulative_ev_final_layer, target_variance=pc_target_variance)

pcs_per_representation = [
    ("Raw Images", num_pcs_raw),
    ("SimCLR Layer 1", num_pcs_layer1),
    ("SimCLR Layer 2", num_pcs_layer2),
    ("SimCLR Layer 3", num_pcs_layer3),
    ("SimCLR Layer 4", num_pcs_layer4),
    ("SimCLR Final Layer", num_pcs_final_layer),
]
for representation_name, representation_num_pcs in pcs_per_representation:
    print(f"Number of PCs explaining {pc_target_variance:.0%} variance ({representation_name}): {representation_num_pcs}")

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Function to compute number of PCs needed for 90% variance
def compute_pcs_for_variance(features, variance_threshold=0.90):
    """
    Fit a full PCA on `features` and return how many leading principal components
    are needed for the cumulative explained-variance ratio to reach `variance_threshold`.

    Args:
        features: 2-D (n_samples, n_features) data accepted by sklearn's PCA.
        variance_threshold: fraction of variance to reach (default 0.90).

    Returns:
        int: number of components, never more than the number of components PCA produced.
    """
    pca = PCA()
    pca.fit(features)  # Fit PCA to the layer features
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)  # Compute cumulative variance explained

    # searchsorted gives the index of the first entry >= threshold; +1 turns it into a count
    num_pcs = np.searchsorted(cumulative_variance, variance_threshold) + 1

    # Floating-point rounding can leave the final cumulative sum just below the threshold
    # (e.g. 0.9999999 for a threshold of 1.0); searchsorted then points one past the end,
    # so cap the count at the number of components that actually exist.
    return int(min(num_pcs, len(cumulative_variance)))

# Number of PCs needed for 90% variance, per representation (full PCA each time).
# NOTE(review): this reassigns num_pcs_layer1..4 and num_pcs_final_layer, which the previous
# cell computed for a different variance target, so later cells see whichever cell ran
# last. Confirm which values downstream code is meant to use.
num_pcs_raw_images = compute_pcs_for_variance(flattened_images)
num_pcs_layer1 = compute_pcs_for_variance(layer1_feats)
num_pcs_layer2 = compute_pcs_for_variance(layer2_feats)
num_pcs_layer3 = compute_pcs_for_variance(layer3_feats)
num_pcs_layer4 = compute_pcs_for_variance(layer4_feats)
num_pcs_final_layer = compute_pcs_for_variance(final_layer_feats)

# Report the counts
print("Number of PCs needed to explain 90% variance:")
print(f"Raw images: {num_pcs_raw_images}")
print(f"Layer 1: {num_pcs_layer1}")
print(f"Layer 2: {num_pcs_layer2}")
print(f"Layer 3: {num_pcs_layer3}")
print(f"Layer 4: {num_pcs_layer4}")
print(f"Final layer: {num_pcs_final_layer}")

# Same counts keyed by display name, in plotting order
num_pcs_dict = dict(
    [
        ("Raw Images", num_pcs_raw_images),
        ("Layer 1", num_pcs_layer1),
        ("Layer 2", num_pcs_layer2),
        ("Layer 3", num_pcs_layer3),
        ("Layer 4", num_pcs_layer4),
        ("Final Layer", num_pcs_final_layer),
    ]
)

print(num_pcs_dict)

# One bar per representation
bar_colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']
plt.figure(figsize=(8, 5))
plt.bar(list(num_pcs_dict), list(num_pcs_dict.values()), color=bar_colors)

# Labels and title
plt.xlabel("SimCLR Layer (Including Raw Images)")
plt.ylabel("Number of PCs Needed for 90% Variance")
plt.title("Number of Principal Components Needed per Layer")
plt.xticks(rotation=30)  # Rotate x-axis labels for readability
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Show the plot
plt.show()

In [16]:
### 3.4. Project the data into PCA space

def _fit_pca_and_project(features, n_components):
    """Fit a PCA with `n_components` on `features`; return (fitted_pca, features projected onto its PCs)."""
    pca = PCA(n_components=n_components)
    pca.fit(features)
    return pca, pca.transform(features)

# Our original images are grayscale, but SimCLR expects 3-channel RGB input.
# To meet this requirement, we duplicated the grayscale values across all three RGB channels.
# However, for PCA, we only need a single channel, so we extract just the first channel (Red).
# images has shape (n_images, 3, 96, 96), so each flattened row has 96 * 96 = 9216 pixels.
flattened_images = images[:, 0, :, :].reshape(images.shape[0], -1) # shape: (n_images, 9216)

# Use same number of principal components for each input
# NOTE(review): only three of the six representations enter this min(), and the
# num_pcs_* names are assigned by more than one earlier cell; confirm that is intended.
num_pcs = min(num_pcs_raw, num_pcs_layer2, num_pcs_final_layer)

## 3.4.1. For raw images
pca_raw, raw_image_pcs = _fit_pca_and_project(flattened_images, num_pcs) # PCs shape: (n_images, num_pcs)

## 3.4.2. For intermediate-layer activations
pca_layer1, layer1_pcs = _fit_pca_and_project(layer1_feats, num_pcs)
pca_layer2, layer2_pcs = _fit_pca_and_project(layer2_feats, num_pcs)
pca_layer3, layer3_pcs = _fit_pca_and_project(layer3_feats, num_pcs)
pca_layer4, layer4_pcs = _fit_pca_and_project(layer4_feats, num_pcs)

## 3.4.3. For final layer activations
pca_final_layer, final_layer_pcs = _fit_pca_and_project(final_layer_feats, num_pcs)

In [None]:
### 4.1. Regression per neuron

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Gather the neural responses for the reliable neurons
# we take the average across repeats for each neuron
neural_responses = imresps[:, :, reliable_neuron_indices] # Shape: (1573, 2, 5654)
neural_responses_mean = neural_responses.mean(axis=1) # Shape: (1573, 5654) -> 1573 images, 5654 neurons

# Select a specific reliable neuron (because we will have a regression model per neuron).
# neural_responses_mean only contains the reliable neurons, so its columns are addressed by
# *position within* reliable_neuron_indices, not by the neuron's id in the full recording.
reliable_neuron_position = 0
reliable_neuron_index = reliable_neuron_indices[reliable_neuron_position]  # id in the full recording
mean_response_single_neuron = neural_responses_mean[:, reliable_neuron_position]  # Shape: (1573,)

## 4.1.1. For raw images

print('Raw image PCs shape', raw_image_pcs.shape) # (n_images, num_pcs)
print('Mean response single neuron shape', mean_response_single_neuron.shape) # (1573,)

# Fit a single linear regression model (handles all neurons at once)
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_image_pcs, neural_responses_mean, test_size=0.2, random_state=42
)
reg = LinearRegression()
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)
raw_image_r2_scores = r2_score(Y_test, Y_pred, multioutput='raw_values')

# zip stops at the shorter sequence, so only the first 10 reliable neurons are printed
for neuron_index, score in zip(reliable_neuron_indices[:10], raw_image_r2_scores):
    print(f"Raw Images: R^2 Score for Reliable Neuron {neuron_index}: {score}")

## 4.1.2. For SimCLR layer 2 features
X_train, X_test, Y_train, Y_test = train_test_split(
    layer2_pcs, neural_responses_mean, test_size=0.2, random_state=42
)
reg = LinearRegression()
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)
layer_2_r2_scores = r2_score(Y_test, Y_pred, multioutput='raw_values')

for neuron_index, score in zip(reliable_neuron_indices[:10], layer_2_r2_scores):
    print(f"Layer 2: R^2 Score for Reliable Neuron {neuron_index}: {score}")

## 4.1.3. For SimCLR final layer features
X_train, X_test, Y_train, Y_test = train_test_split(
    final_layer_pcs, neural_responses_mean, test_size=0.2, random_state=42
)
reg = LinearRegression()
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)
final_layer_r2_scores = r2_score(Y_test, Y_pred, multioutput='raw_values')

for neuron_index, score in zip(reliable_neuron_indices[:10], final_layer_r2_scores):
    print(f"Final Layer: R^2 Score for Reliable Neuron {neuron_index}: {score}")


In [None]:
### Plot overlaid histogram of distribution of regression scores across reliable neurons, for different layers

# Fit a single linear regression model (handles all neurons at once)
def linear_regression(image_representations, neural_responses, test_size=0.2, random_state=42):
    """Fit a multi-output linear regression and score it on held-out images.

    The rows of both inputs are split into train/test subsets with
    ``train_test_split``; the model is fitted on the training rows and
    evaluated on the test rows.

    Args:
        image_representations: predictors, one row per image.
        neural_responses: targets, one row per image (same row order as
            ``image_representations``), one column per neuron.
        test_size: passed to ``train_test_split`` (default 0.2, as before).
        random_state: seed for the shuffled split (default 42, as before).

    Returns:
        The result of ``r2_score(..., multioutput='raw_values')`` on the
        held-out rows, i.e. one R^2 value per output column.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        image_representations, neural_responses, test_size=test_size, random_state=random_state
    )
    reg = LinearRegression().fit(X_train, Y_train)
    return r2_score(Y_test, reg.predict(X_test), multioutput='raw_values')

# Held-out per-neuron R² for every representation (one multi-output fit each),
# computed in the same order as the names on the left-hand side.
(
    raw_image_pcs_scores,
    layer1_pcs_scores,
    layer2_pcs_scores,
    layer3_pcs_scores,
    layer4_pcs_scores,
    final_layer_pcs_scores,
) = [
    linear_regression(representation_pcs, neural_responses_mean)
    for representation_pcs in (
        raw_image_pcs,
        layer1_pcs,
        layer2_pcs,
        layer3_pcs,
        layer4_pcs,
        final_layer_pcs,
    )
]

# (scores, legend label, colour) for each histogram to overlay.
# Uncomment an entry to add that representation to the figure.
histogram_specs = [
    (raw_image_pcs_scores, 'Raw Images', 'red'),
    # (layer1_pcs_scores, 'SimCLR Layer 1', 'pink'),
    # (layer2_pcs_scores, 'SimCLR Layer 2', 'yellow'),
    # (layer3_pcs_scores, 'SimCLR Layer 3', 'orange'),
    # (layer4_pcs_scores, 'SimCLR Layer 4', 'blue'),
    # (final_layer_pcs_scores, 'SimCLR Final Layer', 'green'),
]
for hist_scores, hist_label, hist_color in histogram_specs:
    plt.hist(hist_scores, bins=50, alpha=0.5, label=hist_label, color=hist_color)

plt.xlabel("R² Score")
plt.ylabel("Number of Neurons")
plt.title("Distribution of Regression Scores Across Reliable Neurons")
plt.legend()
plt.show()

In [None]:
### 5.1. Ridge regression

from sklearn.linear_model import Ridge

# TODO: tune reg param for Ridge
# TODO: max val score by iterating alpha and test on test set

# Fit one multi-output Ridge model (alpha=1.0) per representation and score it
# on the held-out 20% of images. The split variables are named Y_train/Y_test
# so the scores are computed against the targets of *this* split, not against
# a Y_test left over from an earlier cell.
ridge_r2_by_representation = {}
for rep_label, rep_pcs in (
    ("Raw Images", raw_image_pcs),       # 5.1.1. raw images
    ("Layer 2", layer2_pcs),             # 5.1.2. SimCLR layer 2
    ("Final Layer", final_layer_pcs),    # 5.1.3. SimCLR final layer
):
    X_train, X_test, Y_train, Y_test = train_test_split(
        rep_pcs, neural_responses_mean, test_size=0.2, random_state=42
    )
    reg = Ridge(alpha=1.0)
    reg.fit(X_train, Y_train)
    Y_pred = reg.predict(X_test)

    # multioutput='raw_values' -> one R^2 per neuron (per output column)
    rep_r2_scores = r2_score(Y_test, Y_pred, multioutput='raw_values')
    ridge_r2_by_representation[rep_label] = rep_r2_scores

    # Print the scores that belong to this representation
    for neuron_index, score in zip(reliable_neuron_indices[:10], rep_r2_scores):
        print(f"{rep_label} (Ridge): R^2 Score for Reliable Neuron {neuron_index}: {score}")

# Lower-case aliases kept because the original cell defined them.
y_train, y_test = Y_train, Y_test

# Keep the per-representation score arrays under their original names.
raw_images_r2_scores = ridge_r2_by_representation["Raw Images"]
layer_2_r2_scores = ridge_r2_by_representation["Layer 2"]
final_layer_r2_scores = ridge_r2_by_representation["Final Layer"]

In [None]:
import matplotlib.pyplot as plt

# Number of PCs to visualize
num_pcs = 10

# Create a grid for visualization: one panel per principal component.
fig, axes = plt.subplots(1, num_pcs, figsize=(12, 4))
# plt.subplots returns a bare Axes (not an array) for a 1x1 grid;
# normalise so the loop below also works when num_pcs == 1.
axes = np.atleast_1d(axes)

for i, ax in enumerate(axes):
    # Each row of components_ is one PC; reshape it to the image height/width
    # (axes 2 and 3 of `images`).
    pc_image = pca_raw.components_[i].reshape(images.shape[2], images.shape[3])
    ax.imshow(pc_image, cmap="gray")
    ax.set_title(f"PC {i+1}")
    ax.axis("off")  # Hide axis for cleaner visualization

# Adjust layout; the title reports the number of panels actually drawn.
plt.suptitle(f"First {num_pcs} Principal Components as Images", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import scipy.stats as stats

## Visualise the principal components (project the image into PC space)
print('PC shape:', pca_raw.components_.shape) # (32, 50176)
print('images shape:', images.shape) # (1573, 3, 224, 224)

# Visualize the first 5 PCs as images
for i in range(5):
    pc_image = pca_raw.components_[i].reshape(images.shape[2], images.shape[3])  # Reshape to image dimensions
    plt.figure(figsize=(4, 4))
    plt.imshow(pc_image, cmap="gray")
    plt.title(f"Principal Component {i+1}")
    plt.colorbar()
    plt.show()

# Loadings of the first 5 layer-2 PCs as a heatmap (rows = PCs, columns = features)
plt.figure(figsize=(10, 4))
sns.heatmap(pca_layer2.components_[:5], cmap="coolwarm", center=0)
plt.title("First 5 Principal Components - Layer 2 Features")
plt.xlabel("Feature Index")
plt.ylabel("PC Index")
plt.show()

# raw_image_pcs[i, j] → The score of the i-th image for the j-th principal component
# e.g. how much image i is represented by a PC
# e.g. if a PC captures "vertical edges", an image with strong vertical edges will have a high score for that component
print(raw_image_pcs[0, 0])


def _pc_neuron_correlations(pc_scores, responses):
    """Correlate one PC score vector with every column of `responses`.

    np.corrcoef treats `pc_scores` as variable 0 and each column of
    `responses` as a further variable, so row 0 without its first entry holds
    the PC-vs-neuron correlation coefficients.
    """
    return np.corrcoef(pc_scores, responses, rowvar=False)[0, 1:]


def _plot_pc_correlation_histogram(corr_values, source_label, pc_idx):
    """Histogram (with KDE) of the per-neuron correlations for one PC."""
    plt.figure(figsize=(6, 4))
    sns.histplot(corr_values, bins=30, kde=True)
    plt.xlabel(f"{source_label}: Correlation between PC {pc_idx+1} and neural responses")
    plt.ylabel("Number of neurons")
    plt.title(f"Histogram of PC {pc_idx+1} correlation with neurons")
    plt.show()


def _plot_pc_vs_neuron_scatter(pcs, pc_idx, neuron_ids):
    """Scatter of PC score vs. mean response, with a least-squares line, per neuron."""
    pc_scores = pcs[:, pc_idx]
    for neuron_id in neuron_ids:
        responses = neural_responses_mean[:, neuron_id]
        plt.figure(figsize=(5, 4))
        plt.scatter(pc_scores, responses, alpha=0.5)
        slope, intercept, r_value, _, _ = stats.linregress(pc_scores, responses)
        plt.plot(pc_scores, slope * pc_scores + intercept, color="red", label=f"R={r_value:.2f}")
        plt.xlabel(f"PC {pc_idx+1} Score")
        plt.ylabel(f"Neuron {neuron_id} Response")
        plt.legend()
        plt.title(f"Neuron {neuron_id}: PC {pc_idx+1} vs Neural Response")
        plt.show()


# Compute correlation between each PC and neural responses
pc_index = 0  # Change this to test different PCs (used by every plot below)

# --- Raw images ---
correlations = _pc_neuron_correlations(raw_image_pcs[:, pc_index], neural_responses_mean)
_plot_pc_correlation_histogram(correlations, "Raw Images", pc_index)

# Identify the 5 neurons most (absolutely) correlated with the selected raw-image PC
top_neurons = np.argsort(np.abs(correlations))[-5:]
_plot_pc_vs_neuron_scatter(raw_image_pcs, pc_index, top_neurons)

# --- Layer 2 ---
correlations = _pc_neuron_correlations(layer2_pcs[:, pc_index], neural_responses_mean)
_plot_pc_correlation_histogram(correlations, "Layer 2", pc_index)

# NOTE(review): this reuses the neurons selected from the raw-image
# correlations (as the original cell did), presumably so the same neurons can
# be compared across representations — confirm this is intended.
_plot_pc_vs_neuron_scatter(layer2_pcs, pc_index, top_neurons)

# --- Final layer ---
# Use the PCA scores (final_layer_pcs), not the raw features, so the quantity
# matches the "PC" wording of the axis label and title.
correlations = _pc_neuron_correlations(final_layer_pcs[:, pc_index], neural_responses_mean)
_plot_pc_correlation_histogram(correlations, "Final Layer", pc_index)

In [None]:
# Compute correlation between each PC and neural responses
pc_index = 10  # Change this to test different PCs (0-based; plots label it pc_index+1)


def _correlations_with_neurons(pc_scores):
    """Correlate one PC score vector with every column of neural_responses_mean.

    Row 0 of the corrcoef matrix belongs to the PC; dropping its first entry
    (the PC's correlation with itself) leaves one coefficient per neuron.
    """
    return np.corrcoef(pc_scores, neural_responses_mean, rowvar=False)[0, 1:]


# One histogram per representation, all for the same PC index.
pcs_by_layer = {
    "Raw Images": raw_image_pcs,
    "Layer 1": layer1_pcs,
    "Layer 2": layer2_pcs,
    "Layer 3": layer3_pcs,
    "Layer 4": layer4_pcs,
    "Final Layer": final_layer_pcs,
}
correlations_by_layer = {}
for layer_label, layer_pcs in pcs_by_layer.items():
    layer_corrs = _correlations_with_neurons(layer_pcs[:, pc_index])
    correlations_by_layer[layer_label] = layer_corrs

    plt.figure(figsize=(6, 4))
    sns.histplot(layer_corrs, bins=30, kde=True)
    plt.xlabel(f"Correlation between PC {pc_index+1} and neural responses")
    plt.ylabel("Number of neurons")
    plt.title(f"{layer_label}: PC {pc_index+1} correlation with neurons")
    plt.show()

# Keep the per-layer arrays under their original names.
raw_images_correlations = correlations_by_layer["Raw Images"]
layer_1_correlations = correlations_by_layer["Layer 1"]
layer_2_correlations = correlations_by_layer["Layer 2"]
layer_3_correlations = correlations_by_layer["Layer 3"]
layer_4_correlations = correlations_by_layer["Layer 4"]
final_layer_correlations = correlations_by_layer["Final Layer"]

# Overlay three representations on one figure for direct comparison.
plt.figure()
sns.histplot(raw_images_correlations, bins=30, kde=True, label="Raw Images", color="blue", alpha=0.5)
sns.histplot(layer_2_correlations, bins=30, kde=True, label="Layer 2", color="orange", alpha=0.5)
sns.histplot(final_layer_correlations, bins=30, kde=True, label="Final Layer", color="red", alpha=0.5)
plt.legend()
plt.xlabel("Correlation with Neural Responses")
plt.ylabel("Number of Neurons")
# The title reports the PC actually plotted instead of a hard-coded number.
plt.title(f"Comparison of PC {pc_index+1} Correlations Across Representations")
plt.show()

# Mean absolute correlation with the neurons for each of the first 10 raw-image PCs
pc_indices = range(10)  # First 10 PCs
avg_correlations = [
    np.mean(np.abs(_correlations_with_neurons(raw_image_pcs[:, i])))
    for i in pc_indices
]

plt.plot(pc_indices, avg_correlations, marker='o', linestyle='-')
plt.xlabel("PC Index")
plt.ylabel("Average |Correlation| with Neural Responses")
plt.title("How Neural Response Correlation Varies Across PCs")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

# Create random noise features with the same shape as the SimCLR features.
# A seeded generator makes the baseline reproducible across runs.
random_features = np.random.default_rng(42).standard_normal(layer4_feats.shape)

# Split the SimCLR features, the noise features and the targets in ONE call so
# both models are trained and scored on exactly the same images. shuffle=False
# keeps the original image order (first 80% train, last 20% test).
(
    X_simclr_train, X_simclr_test,
    X_random_train, X_random_test,
    Y_train, Y_test,
) = train_test_split(
    layer4_feats, random_features, neural_responses_mean,
    test_size=0.2, shuffle=False,
)

# Train regression on SimCLR features.
# .score() on a multi-output regressor returns a single R² aggregated over outputs.
r2_simclr = LinearRegression().fit(X_simclr_train, Y_train).score(X_simclr_test, Y_test)

# Train a separate regression on random noise features (chance-level baseline)
r2_random = LinearRegression().fit(X_random_train, Y_train).score(X_random_test, Y_test)

print(f"R² using SimCLR: {r2_simclr:.4f}")
print(f"R² using Random Noise: {r2_random:.4f}")

# Compute correlation between the first layer-4 feature column and each neuron.
# (layer4_feats[:, 0] is a raw feature column, not a principal component.)
first_feature = layer4_feats[:, 0]
correlations = [
    pearsonr(first_feature, neural_responses_mean[:, neuron_idx])[0]
    for neuron_idx in range(neural_responses_mean.shape[1])  # Loop over neurons
]

# Plot histogram of correlations
plt.hist(correlations, bins=50, color='blue', alpha=0.7)
plt.xlabel("Pearson Correlation")
plt.ylabel("Number of Neurons")
plt.title("Correlation of SimCLR Features with Neural Responses")
plt.show()

In [None]:
### Regression using 10 PCs of raw images, and 500 most reliable neurons

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt

image_representation = flattened_images # [flattened_images, layer1_feats, layer2_feats, layer3_feats, layer4_feats, final_layer_feats]
num_pcs = 10
num_neurons = 500
reduce_dimensionality = False

# ===================================
# Filter only the top Y neurons (SRV)
# ===================================
# Rank the reliable neurons by their SRV score (descending) and keep the top num_neurons.
reliable_srv_scores = real_srv_all_neurons[reliable_neuron_indices]
sorted_indices = np.argsort(reliable_srv_scores)[::-1]
most_reliable_neurons = reliable_neuron_indices[sorted_indices[:num_neurons]]
highest_srv_scores = real_srv_all_neurons[most_reliable_neurons]
neural_responses = imresps[:, :, most_reliable_neurons]
# Average over axis 1 (the two repeats of each image)
neural_responses_mean = neural_responses.mean(axis=1)

assert most_reliable_neurons.shape[0] == num_neurons, "Mismatch in neuron selection!"
print("Dimensionality of neural responses:", neural_responses_mean.shape)
print(f"Top {num_neurons} reliable neuron indices:", most_reliable_neurons[:10])
print("Corresponding SRV scores:", highest_srv_scores[:10])
print(f"Top {num_neurons} neural responses shape:", neural_responses.shape) # (1573, 2, 500)
print(f"Averaged top {num_neurons} neural responses shape:", neural_responses_mean.shape) # (1573, 500)

# ======================
# Get test-train split
# ======================
# shuffle=False: contiguous blocks of images -> train / val / test (64% / 16% / 20%)
X_train, X_test, Y_train, Y_test = train_test_split(image_representation, neural_responses_mean, test_size=0.2, random_state=42, shuffle=False)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42, shuffle=False)

# ===========================================================
# Apply PCA (only on training set).
# Then transform validation and test sets using trained PCA
# ===========================================================
if reduce_dimensionality:
    print("[INFO] Applying PCA to image representation (only on training set)!")

    # random_state makes the result reproducible when PCA picks a randomized solver
    pca = PCA(n_components=num_pcs, random_state=42)
    X_train = pca.fit_transform(X_train)
    X_val = pca.transform(X_val)
    X_test = pca.transform(X_test)

    print(f"[INFO] PCA reduced image representation to {num_pcs} dimensions!")
else:
    print("[INFO] No PCA applied")

print(f"Raw image representation shape: {image_representation.shape}")
print(f"Image representation shape: Train={X_train.shape}, Val={X_val.shape}, Test={X_test.shape}")

# ===================================================
# Run ridge regressions, tune regularisation param
# ===================================================
alphas = np.logspace(-1, 5, num=7)  # [0.1, 1, 10, 100, 1000, 10000, 100000]
train_scores, test_scores, val_scores = [], [], []
best_alpha = None
best_model = None
best_val_r2 = float('-inf')
best_test_r2 = None

for alpha in alphas:
    reg = Ridge(alpha=alpha)
    reg.fit(X_train, Y_train)

    train_r2 = r2_score(Y_train, reg.predict(X_train))
    test_r2 = r2_score(Y_test, reg.predict(X_test))
    val_r2 = r2_score(Y_val, reg.predict(X_val))

    print(f"Alpha: {alpha}, Train R²: {train_r2:.4f}, Val R²: {val_r2:.4f}, Test R²: {test_r2:.4f}")

    train_scores.append(train_r2)
    test_scores.append(test_r2)
    val_scores.append(val_r2)

    # Model selection uses the validation score only; the test score is just reported.
    if val_r2 > best_val_r2:
        best_val_r2 = val_r2
        best_alpha = alpha
        best_model = reg
        best_test_r2 = test_r2

if best_model is None:
    # Only reachable when no validation score compared greater than -inf
    # (e.g. every score is NaN). There is no alpha to fall back on, so fail
    # with an explicit message instead of fitting Ridge(alpha=None).
    raise RuntimeError(
        "No best model was found: no validation R² was a comparable number. "
        "Check your data or hyperparameters."
    )

# best_val_r2 is the validation score that selected the model; the held-out
# test score of that same model is reported alongside it.
print(f"\nBest α: {best_alpha} with Val R²: {best_val_r2:.4f} (Test R²: {best_test_r2:.4f})")

# ===================================
# Visualisation
# ===================================
plt.plot(alphas, train_scores, marker='o', label='Train R²')
plt.plot(alphas, val_scores, marker='s', label='Validation R²')
plt.xscale('log')
plt.xlabel("Ridge Regularization (α)")
plt.ylabel("R² Score")
plt.title("Effect of Ridge Regularization on Model Performance")
plt.legend()
plt.show()

# Predicted vs. true responses on the test set, pooled over all neurons
Y_pred = best_model.predict(X_test)
plt.figure(figsize=(5, 5))
plt.scatter(Y_test.flatten(), Y_pred.flatten(), alpha=0.5)
plt.plot([-2, 2], [-2, 2], 'r--', label="Ideal Fit")
plt.xlabel("True Neural Response")
plt.ylabel("Predicted Neural Response")
plt.title(f"Predictions vs. Actual Neural Responses (α={best_alpha})")
plt.legend()
plt.show()

In [None]:
### Plot regression scores for a particular layer, for varying number of PCs

import matplotlib.pyplot as plt
import numpy as np

image_representation = flattened_images # [flattened_images, layer1_feats, layer2_feats, layer3_feats, layer4_feats, final_layer_feats]
alpha = 100000
pcs_90pc_var = 372 # {'Raw Images': 372, 'Layer 1': 1016, 'Layer 2': 909, 'Layer 3': 701, 'Layer 4': 519, 'Final Layer': 119}
num_neurons = 500

# ===================================
# Filter only the top Y neurons (SRV)
# ===================================
# Rank the reliable neurons by their SRV score (descending) and keep the top num_neurons.
reliable_srv_scores = real_srv_all_neurons[reliable_neuron_indices]
sorted_indices = np.argsort(reliable_srv_scores)[::-1]
most_reliable_neurons = reliable_neuron_indices[sorted_indices[:num_neurons]]
highest_srv_scores = real_srv_all_neurons[most_reliable_neurons]
neural_responses = imresps[:, :, most_reliable_neurons]
# Average over axis 1 (the two repeats of each image)
neural_responses_mean = neural_responses.mean(axis=1)

assert most_reliable_neurons.shape[0] == num_neurons, "Mismatch in neuron selection!"
print("Dimensionality of neural responses:", neural_responses_mean.shape)
print(f"Top {num_neurons} reliable neuron indices:", most_reliable_neurons[:10])
print("Corresponding SRV scores:", highest_srv_scores[:10])
print(f"Top {num_neurons} neural responses shape:", neural_responses.shape) # (1573, 2, 500)
print(f"Averaged top {num_neurons} neural responses shape:", neural_responses_mean.shape) # (1573, 500)

# ======================
# Get test-train split
# ======================
# shuffle=False: contiguous blocks of images -> train / val / test
X_train, X_test, Y_train, Y_test = train_test_split(image_representation, neural_responses_mean, test_size=0.2, random_state=42, shuffle=False)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42, shuffle=False)

# PCA cannot extract more components than min(n_samples, n_features) of the
# data it is fitted on, so drop candidate sizes this representation cannot support.
candidate_pcs = [10, 50, 100, 200, 500]
max_components = min(X_train.shape)
pcs_values = [n for n in candidate_pcs if n <= max_components]
if len(pcs_values) < len(candidate_pcs):
    print(f"[INFO] Skipping PC counts above {max_components}: {[n for n in candidate_pcs if n > max_components]}")

train_r2_scores = []
val_r2_scores = []
test_r2_scores = []

for num_pcs in pcs_values:
    # Fit PCA on the training images only, then project val/test with it.
    # random_state makes the result reproducible when PCA picks a randomized solver.
    pca = PCA(n_components=num_pcs, random_state=42)

    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)
    X_test_pca = pca.transform(X_test)

    reg = Ridge(alpha=alpha)
    reg.fit(X_train_pca, Y_train)

    train_r2 = r2_score(Y_train, reg.predict(X_train_pca))
    val_r2 = r2_score(Y_val, reg.predict(X_val_pca))
    test_r2 = r2_score(Y_test, reg.predict(X_test_pca))

    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)
    test_r2_scores.append(test_r2)

    print(f"PCs: {num_pcs}, Train R²: {train_r2:.4f}, Val R²: {val_r2:.4f}, Test R²: {test_r2:.4f}")

# Plot the results
plt.figure(figsize=(6, 4))
plt.plot(pcs_values, train_r2_scores, marker='o', linestyle='--', label='Train R²')
plt.plot(pcs_values, val_r2_scores, marker='s', linestyle='-', label='Validation R²')
plt.plot(pcs_values, test_r2_scores, marker='^', linestyle='-', label='Test R²')

plt.xlabel("Number of Principal Components")
plt.ylabel("R² Score")
plt.axvline(x=pcs_90pc_var, color='red', linestyle='--', label=f"90% Variance ({pcs_90pc_var} PCs)")
plt.title("Effect of PCA on Ridge Regression Performance")
plt.legend()
plt.axhline(0, color='black', linewidth=0.8, linestyle='--')  # Reference line at R² = 0
plt.show()

In [None]:
# Compare regression performance across image representations

import matplotlib.pyplot as plt
import numpy as np

# Map each display label to its feature matrix. Dict order is the plotting order.
features_by_type = {
    "Raw Pixels": flattened_images,
    "SimCLR Layer 1": layer1_feats,
    "SimCLR Layer 2": layer2_feats,
    "SimCLR Layer 3": layer3_feats,
    "SimCLR Layer 4": layer4_feats,
    "Final SimCLR Layer": final_layer_feats,
}
feature_types = list(features_by_type)
train_r2_scores, val_r2_scores, test_r2_scores = [], [], []

# Settings shared by every representation
num_pcs = 100  # Adjust based on previous PCA analysis
ridge_alpha = 100000

for feature, X_feature in features_by_type.items():
    # Split into train, val, test (shuffle=False: contiguous blocks of images)
    X_train, X_test, Y_train, Y_test = train_test_split(X_feature, neural_responses_mean, test_size=0.2, random_state=42, shuffle=False)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42, shuffle=False)

    # Fit PCA on the training images only, then project val/test with it.
    # n_components is capped at what the training data supports, and
    # random_state makes the result reproducible when PCA picks a randomized solver.
    pca = PCA(n_components=min(num_pcs, *X_train.shape), random_state=42)
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)
    X_test_pca = pca.transform(X_test)

    # Train Ridge Regression
    reg = Ridge(alpha=ridge_alpha)
    reg.fit(X_train_pca, Y_train)

    # Compute R² scores
    train_r2 = r2_score(Y_train, reg.predict(X_train_pca))
    val_r2 = r2_score(Y_val, reg.predict(X_val_pca))
    test_r2 = r2_score(Y_test, reg.predict(X_test_pca))

    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)
    test_r2_scores.append(test_r2)

    print(f"{feature}: Train R²: {train_r2:.4f}, Val R²: {val_r2:.4f}, Test R²: {test_r2:.4f}")

# Plot grouped bar chart: three bars (train / val / test) per representation
x = np.arange(len(feature_types))
width = 0.3  # Width of bars

plt.figure(figsize=(8, 5))
plt.bar(x - width, train_r2_scores, width=width, label='Train R²', color='blue', alpha=0.7)
plt.bar(x, val_r2_scores, width=width, label='Validation R²', color='orange', alpha=0.7)
plt.bar(x + width, test_r2_scores, width=width, label='Test R²', color='green', alpha=0.7)

plt.xticks(x, feature_types, rotation=45, ha='right')
plt.ylabel("R² Score")
plt.title("Comparison of Ridge Regression Performance Across Feature Types")
plt.legend()
plt.show()

In [None]:
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

X_feature = layer3_feats
Y_target = neural_responses_mean

# shuffle=False: contiguous blocks of images -> train / val / test
X_train, X_test, Y_train, Y_test = train_test_split(X_feature, Y_target, test_size=0.2, random_state=42, shuffle=False)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42, shuffle=False)

# Reduce the predictors with a PCA fitted on the training images only.
# random_state makes the result reproducible when PCA picks a randomized solver.
num_pcs = 10
pca_x = PCA(n_components=num_pcs, random_state=42)
X_train_pca = pca_x.fit_transform(X_train)
X_val_pca = pca_x.transform(X_val)
X_test_pca = pca_x.transform(X_test)

# --- Target 1: a single neuron (column 0 of the response matrix) ---
# SVR fits a single 1-D target, so the outputs are handled one at a time here.
Y_train_single = Y_train[:, 0]
Y_val_single = Y_val[:, 0]
Y_test_single = Y_test[:, 0]

svr = SVR(kernel='rbf', C=10, epsilon=0.1) # C and epsilon can be tuned
svr.fit(X_train_pca, Y_train_single)

train_r2_single = r2_score(Y_train_single, svr.predict(X_train_pca))
val_r2_single = r2_score(Y_val_single, svr.predict(X_val_pca))
test_r2_single = r2_score(Y_test_single, svr.predict(X_test_pca))

print(" SVR (Single Neuron) Results:")
print(f"Train R²: {train_r2_single:.4f}, Val R²: {val_r2_single:.4f}, Test R²: {test_r2_single:.4f}")

# --- Target 2: first principal component of the neural responses ---
# The response PCA is fitted on the training responses only.
pca_y = PCA(n_components=1, random_state=42)
Y_train_pca = pca_y.fit_transform(Y_train)
Y_val_pca = pca_y.transform(Y_val)
Y_test_pca = pca_y.transform(Y_test)
# Refit the same estimator on the new target; ravel() turns the (n, 1) PCA output into 1-D
svr.fit(X_train_pca, Y_train_pca.ravel())
train_r2_pc = r2_score(Y_train_pca, svr.predict(X_train_pca))
val_r2_pc = r2_score(Y_val_pca, svr.predict(X_val_pca))
test_r2_pc = r2_score(Y_test_pca, svr.predict(X_test_pca))

print("\n SVR (First PC of Neural Responses) Results:")
print(f"Train R²: {train_r2_pc:.4f}, Val R²: {val_r2_pc:.4f}, Test R²: {test_r2_pc:.4f}")

# NOTE(review): hard-coded Ridge baseline, presumably copied from an earlier
# Ridge (α=100K) run — confirm it matches the current data and settings.
ridge_test_r2_reference = 0.0240

models = ["Ridge (α=100K)", "SVR (Neuron 1)", "SVR (First PC)"]
test_r2_scores = [ridge_test_r2_reference, test_r2_single, test_r2_pc]

plt.figure(figsize=(6, 4))
plt.bar(models, test_r2_scores, color=['blue', 'green', 'purple'])
plt.ylabel("Test R² Score")
plt.title("Regression Performance: Ridge vs. SVR on SimCLR Layer 3")
# Extend the lower limit when a score is below -0.05 so negative bars are not clipped.
plt.ylim(min(-0.05, min(test_r2_scores) - 0.01), max(test_r2_scores) + 0.01)
plt.show()