In [None]:
# Colab-only: mount Google Drive so that files under /content/drive become reachable.
from google.colab import drive
drive.mount("/content/drive")
# Root folder of the MNIST experiment data on the mounted drive.
dir_path = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"


Mounted at /content/drive


In [None]:
!pip install torchextractor

Collecting torchextractor
  Downloading torchextractor-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Downloading torchextractor-0.3.0-py3-none-any.whl (10 kB)
Installing collected packages: torchextractor
Successfully installed torchextractor-0.3.0


In [None]:
import torch
import sys
import numpy as np
import pickle as pkl
from os.path import join as oj
from datetime import datetime
import torch.optim as optim
import os
from torch.utils.data import TensorDataset, ConcatDataset
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, f1_score
import argparse
from PIL import Image
from tqdm import tqdm
from torch import nn
from numpy.random import randint
import torchvision.models as models
import time
import copy
import gc
import json
import pandas as pd
import torch
import torchvision.models as models
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split
from PIL import Image
import torch.nn as nn
import matplotlib.pyplot as plt
import time
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.optim.lr_scheduler import StepLR
from scipy.stats import ttest_1samp
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import os
import gc
from sklearn.metrics import accuracy_score


### Loading the fine-tuned AlexNet

In [None]:
class MnistDataset(Dataset):
    """Image dataset yielding ``(image, label)`` pairs.

    Images come either from every ``.jpg``/``.png`` file located directly
    inside ``path`` or from an explicit list of file paths (``data_files``).
    When ``is_two`` is given it is used as the label of every sample;
    otherwise the label of sample ``i`` is ``labels[i]``.

    Raises:
        ValueError: if no image files are available, or if the number of
            labels does not match the number of files.
    """

    def __init__(self, path: str = None, is_two: int = None, data_files=None, labels=None, transform=None):
        # Target (height, width) used by the default transform below.
        self.resize_shape = (64, 64)
        if transform:
            self.transform = transform
        else:
            # Default pipeline: resize, convert to tensor, normalise the three RGB channels.
            self.transform = transforms.Compose([
                transforms.Resize(self.resize_shape),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ])

        self.is_two = is_two
        if path:
            # Directory mode: collect the image files found in `path` (non-recursive).
            self.path = path
            self.data_files = [name for name in os.listdir(path) if name.endswith(('.jpg', '.png'))]
            if not self.data_files:
                raise ValueError(f"No valid image files found in the provided path: {self.path}")
            # A fixed label wins over per-sample labels.
            self.labels = labels if is_two is None else [is_two] * len(self.data_files)
        else:
            # Explicit-list mode: entries of `data_files` are used as full paths.
            self.path = ''
            if data_files is None or len(data_files) == 0:
                raise ValueError("data_files must be a non-empty list of file paths.")
            self.data_files = data_files
            self.labels = labels

        labels_given = self.labels is not None
        if labels_given and len(self.labels) != len(self.data_files):
            raise ValueError("Mismatch between the number of labels and data files.")

    def __getitem__(self, i):
        """Return ``(image, label)`` for index ``i``, or ``None`` if anything fails.

        Any exception (bad index, unreadable file, missing label, ...) is
        printed and turned into ``None`` instead of being raised.
        """
        try:
            entry = self.data_files[i]
            img_path = os.path.join(self.path, entry) if self.path else entry
            img = Image.open(img_path).convert("RGB")  # always hand 3-channel images to the transform
            if self.transform:
                img = self.transform(img)

            label = self.labels[i] if self.is_two is None else self.is_two
            return img, label
        except Exception as e:
            print(f"Error processing index {i}: {e}")
            return None

    def __len__(self):
        """Number of image files in the dataset."""
        return len(self.data_files)


In [None]:
# Per-channel (R, G, B) normalisation statistics as NumPy arrays; the same
# values are passed to transforms.Normalize elsewhere in this notebook.
mean = np.array((0.485, 0.456, 0.406))
std = np.array((0.229, 0.224, 0.225))


# Function to load the trained model
def load_model(model_path, device):
    """Load a fine-tuned two-class AlexNet checkpoint in evaluation mode.

    Args:
        model_path: Path to a ``state_dict`` file readable by ``torch.load``.
        device: Device the model is loaded onto. ``None`` selects CUDA when
            available and CPU otherwise.

    Returns:
        The AlexNet model whose last classifier layer is replaced by
        ``nn.Linear(4096, 2)``, with the checkpoint weights loaded, moved to
        ``device`` and switched to ``eval()`` mode.
    """
    print(f"Loading model from {model_path}")
    # Respect the caller's choice of device; it used to be unconditionally
    # overwritten here, so e.g. device="cpu" was ignored on a GPU runtime.
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = models.alexnet(pretrained=False)
    model.classifier[-1] = nn.Linear(4096, 2)  # two-class head, must match the checkpoint
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    print("Model loaded successfully")
    return model





### Extract Alexnet training fc2 activations

In [None]:
class ImageDataset(Dataset):
    """Label-free dataset that loads images from a list of file paths.

    Each item is the image at ``image_paths[idx]`` converted to RGB and passed
    through ``transform``. When no transform is supplied, a default
    resize / to-tensor / normalise pipeline is built from ``resize_shape``.
    """

    def __init__(self, image_paths, transform=None, resize_shape=(64, 64)):
        print(f"Initializing dataset with {len(image_paths)} images")
        self.image_paths = image_paths
        self.resize_shape = resize_shape
        if transform:
            self.transform = transform
        else:
            self.transform = transforms.Compose([
                transforms.Resize(self.resize_shape),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ])

    def __len__(self):
        """Number of image paths held by the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, convert to RGB and transform the image at position ``idx``."""
        image_path = self.image_paths[idx]
        print(f"Loading image: {image_path}")
        rgb_image = Image.open(image_path).convert("RGB")
        return self.transform(rgb_image) if self.transform else rgb_image


# Preprocessing pipeline (a transforms.Compose object, applied to PIL images):
# resize -> tensor -> per-channel normalisation.
preprocess = transforms.Compose([
    # Grayscale expansion is disabled; ImageDataset.__getitem__ already converts images to RGB.
    #transforms.Grayscale(num_output_channels=3),
    transforms.Resize((64, 64)),  # Match the resize shape in MnistDataset
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Match the normalization values in MnistDataset
])



In [None]:
def load_model(model_path):
    """Load a frozen, fine-tuned two-class AlexNet and hook its fc2 output.

    NOTE: this notebook defines ``load_model`` more than once with different
    signatures; whichever cell ran last is the one in effect.

    Args:
        model_path: Path to a ``state_dict`` file readable by ``torch.load``.

    Returns:
        ``(model, activations)`` where ``model`` is in ``eval()`` mode with all
        parameters frozen, placed on CUDA when available (CPU otherwise), and
        ``activations`` is a dict whose ``"fc2"`` entry is overwritten with the
        output of ``model.classifier[4]`` on every forward pass.
    """
    print(f"Loading model from {model_path}")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # pretrained=False: the strict load_state_dict below replaces every weight,
    # so fetching the ImageNet weights first would only cost a download.
    model = models.alexnet(pretrained=False)
    model.classifier[-1] = nn.Linear(4096, 2)  # two-class head, must match the checkpoint
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    print("Model loaded successfully")

    # Freeze all layers
    for param in model.parameters():
        param.requires_grad = False

    print("Model loaded and layers frozen successfully")

    # Forward hook that captures the output of classifier[4] (the layer this
    # notebook calls fc2) each time the model is run.
    activations = {}

    def hook(module, input, output):
        activations["fc2"] = output

    model.classifier[4].register_forward_hook(hook)

    return model, activations





def preprocess_and_extract_activations(model, dataloader, layer):
    """Run batches through the model up to ``layer`` and collect the outputs.

    Args:
        model: Module exposing ``features``, ``avgpool`` and ``classifier``
            sub-modules (the layout of torchvision's AlexNet).
        dataloader: Iterable of batches. A batch is either an image tensor or
            a list/tuple whose first element is the image tensor (any further
            elements, e.g. labels, are ignored).
        layer: Integer layer index. Values below 13 address
            ``model.features[layer]``; larger values address
            ``model.classifier[layer - 13]``.

    Returns:
        A list with one NumPy array per batch holding that layer's output.
    """
    # Follow the model's own device instead of relying on a global `device`
    # that may not be defined when this function is called.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    activations = []
    with torch.no_grad():
        for batch in dataloader:
            # Only unpack (images, labels, ...) style batches; indexing a bare
            # tensor with [0] would drop the batch dimension.
            image_tensor = batch[0] if isinstance(batch, (list, tuple)) else batch
            image_tensor = image_tensor.to(device)

            if layer < 13:  # Layer in model.features
                tensor = image_tensor
                for layer_module in model.features[:layer + 1]:
                    tensor = layer_module(tensor)
            else:  # Layer in model.classifier
                tensor = model.features(image_tensor)
                tensor = model.avgpool(tensor)
                tensor = torch.flatten(tensor, 1)
                for idx, layer_module in enumerate(model.classifier[:layer - 12]):
                    tensor = layer_module(tensor)
                    if idx == 4:
                        print(f"Extracting from FC2 (classifier[4]): {layer_module}")
                        print(f"Activation shape at FC2: {tensor.shape}")

            activations.append(tensor.cpu().numpy())

    # One list entry is stored per batch, so report batches rather than images.
    print(f"Extracted activations for {len(activations)} batches")
    return activations


def process_images_in_folder(model, folder_path, layer, is_two, batch_size=1):
    """Collect the hooked layer's activations for every image under a folder.

    Args:
        model: Network exposing ``classifier[4]``; images are moved to the
            device of its first parameter.
        folder_path: Directory searched recursively for ``.jpg``/``.png`` files.
        layer: Name of the layer to capture. Only ``'fc2'`` (``classifier[4]``)
            is supported.
        is_two: Class flag of the folder; only used in the log message.
        batch_size: Number of images per forward pass.

    Returns:
        NumPy array with the captured activations of all batches concatenated
        along axis 0.

    Raises:
        ValueError: if ``folder_path`` is not a string, contains no images,
            ``layer`` is unsupported, or no activations were collected.
    """
    # Ensure folder_path is a string
    if not isinstance(folder_path, str):
        raise ValueError(f"Expected folder_path to be a string, but got {type(folder_path)}")

    # Get all image file paths
    image_paths = [os.path.join(root, file)
                   for root, dirs, files in os.walk(folder_path)
                   for file in files if file.endswith(('.jpg', '.png'))]

    if len(image_paths) == 0:
        raise ValueError(f"No image files found in folder: {folder_path}")

    # Fail before the (expensive) forward passes: without a hook nothing could
    # ever be collected and the function would end in a ValueError anyway.
    if layer != 'fc2':
        raise ValueError(f"Unsupported layer {layer!r}: only 'fc2' can be captured.")

    print(f"Initializing dataset with {len(image_paths)} images and is_two={is_two}")
    dataset = ImageDataset(image_paths=image_paths, transform=preprocess)
    print(f"Dataset initialized with {len(dataset)} items.")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # The hook drops the latest output here; it is consumed after each forward pass.
    activations_dict = {}

    def hook(module, input, output):
        activations_dict[layer] = output.detach()

    # Keep the handle so the hook can be removed again; otherwise every call
    # would leave one more hook attached to the model.
    handle = model.classifier[4].register_forward_hook(hook)

    all_layer_activations = []
    try:
        device = next(model.parameters()).device
        with torch.no_grad():  # inference only, no autograd bookkeeping needed
            for images in dataloader:  # Only images are returned by the dataset
                if images is None:
                    print("Skipping invalid batch.")
                    continue

                _ = model(images.to(device))  # Forward pass to trigger the hook

                # pop() guarantees an activation is never appended twice.
                if layer in activations_dict:
                    all_layer_activations.append(activations_dict.pop(layer).cpu().numpy())
    finally:
        handle.remove()
        # Free resources even when a batch failed.
        torch.cuda.empty_cache()
        gc.collect()

    if len(all_layer_activations) == 0:
        raise ValueError("No activations were collected. Check dataset or model.")

    # Concatenate the per-batch arrays into one (images, ...) array
    return np.concatenate(all_layer_activations, axis=0)




In [None]:
def flatten_and_align_activations(activations_list):
    """Flatten each activation and stack them into one zero-padded 2-D array.

    Args:
        activations_list: Non-empty sequence of arrays (or a 2-D array, which
            is treated row by row). Each entry is flattened to 1-D.

    Returns:
        Array of shape ``(len(activations_list), max_length)`` where shorter
        entries are right-padded with zeros up to the longest entry.

    Raises:
        ValueError: if ``activations_list`` is empty.
    """
    print("Flattening and aligning activations")
    flat_activations = [np.asarray(act).flatten() for act in activations_list]
    if not flat_activations:
        raise ValueError("activations_list is empty; nothing to align.")

    max_length = max(len(act) for act in flat_activations)

    # Right-pad with zeros; entries already at max_length get a zero-width pad.
    aligned_activations = np.vstack([
        np.pad(act, (0, max_length - len(act)), 'constant')
        for act in flat_activations
    ])
    # Stack once and reuse the result for both the log line and the return value.
    print(f"Aligned activations to shape: {aligned_activations.shape}")
    return aligned_activations


def save_activations(activations, folder_name, filename):
    """Best-effort save of ``activations`` as ``<filename>.npy`` on the mounted Drive.

    The file is written below
    ``.../activations/individual/<folder_name>/alexnet_mnist_finetune_dlp/``;
    missing directories are created. Failures are printed, not raised, so the
    caller is not told when nothing was written.
    """
    try:
        drive_path = f'/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/individual/{folder_name}/alexnet_mnist_finetune_dlp/{filename}.npy'
        target_dir = os.path.dirname(drive_path)
        os.makedirs(target_dir, exist_ok=True)
        print(f"Saving activations to {drive_path}")
        np.save(drive_path, activations)
    except Exception as err:
        print(f"Error saving activations to {drive_path}: {err}")

def load_activations(folder_name, filename):
    """Load ``<filename>.npy`` written by ``save_activations``.

    Returns:
        The loaded array, or ``None`` (after printing a message) when the
        file does not exist.
    """
    try:
        drive_path = f'/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/individual/{folder_name}/alexnet_mnist_finetune_dlp/{filename}.npy'
        print(f"Loading activations from {drive_path}")
        # NOTE(review): allow_pickle=True lets a tampered file execute code on
        # load; only use this on files you wrote yourself.
        loaded = np.load(drive_path, allow_pickle=True)
    except FileNotFoundError:
        print(f"Activations not found at {drive_path}")
        return None
    return loaded

In [None]:
def compute_activations_for_layers(model_paths, folder_paths, layers, activations_file_prefix):
    """Extract, save and return activations for every folder, layer and model.

    For each combination the model is loaded with ``load_model(model_path)``
    (the variant returning ``(model, activations)``), the images of the folder
    are processed with ``process_images_in_folder``, and the result is saved
    via ``save_activations`` as
    ``<activations_file_prefix>_model<k>_<folder_name>`` under ``layer_<layer>``.

    Args:
        model_paths: Checkpoint paths; model numbering in file names starts at 1.
        folder_paths: Mapping of subset name to image folder. Every name must
            be a key of the module-level ``is_two_mapping``.
        layers: Layer names to extract.
        activations_file_prefix: Prefix of the saved file names (e.g. ``'train'``).

    Returns:
        ``{layer: {folder_name: [array_for_model_1, array_for_model_2, ...]}}``.

    Raises:
        ValueError: if a folder name is missing from ``is_two_mapping``.
    """
    # Initialize the dictionary with all keys in folder_paths
    all_layer_activations = {layer: {key: [] for key in folder_paths.keys()} for layer in layers}

    for folder_name, folder_path in folder_paths.items():
        print(f"Processing folder {folder_name}")

        # Get the is_two value from the mapping
        is_two = is_two_mapping.get(folder_name)
        if is_two is None:
            raise ValueError(f"Unknown folder name: {folder_name}. maybe update is_two_mapping.")

        for layer in layers:
            for model_idx, model_path in enumerate(model_paths):
                print(f"Processing model {model_idx + 1}/{len(model_paths)}")
                # No extra hook is registered here: process_images_in_folder
                # attaches and reads its own, so another one would only store
                # output that nothing consumes.
                model, hook_store = load_model(model_path)

                activations = process_images_in_folder(model, folder_path, layer, is_two=is_two, batch_size=1)
                activations = flatten_and_align_activations(activations)

                # Save activations for this model and folder
                model_specific_file_prefix = f'{activations_file_prefix}_model{model_idx + 1}_{folder_name}'
                save_activations(activations, f'layer_{layer}', model_specific_file_prefix)

                all_layer_activations[layer][folder_name].append(activations)

                # Drop the hook dict together with the model; it still holds
                # the last captured tensor, which would survive empty_cache().
                del model, hook_store
                torch.cuda.empty_cache()
                gc.collect()

    return all_layer_activations






# Step 2: Extract Training Activations

# Root of the MNIST experiment data on the mounted Drive.
_MNIST_ROOT = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"

# One fine-tuned AlexNet checkpoint per training seed.
model_paths = [
    f"{_MNIST_ROOT}/models/initial_classifier/alexnet_mnist_dlp_seed_{seed}.pt"
    for seed in (1, 11, 111, 1111, 11111, 111111, 1111111)
]

layers_of_interest = ['fc2']

# Class flag per subset name (1 for the "two" folders, 0 for the "zero" folders).
is_two_mapping = {'two_no_patch': 1, 'zero_no_patch': 0, 'zero_patch': 0}


def _subset_folders(split):
    """Return the subset-name -> image-folder mapping for a split ('train' or 'val')."""
    splits_root = f"{_MNIST_ROOT}/dataset_splits"
    return {
        'two_no_patch': f"{splits_root}/original/{split}/class_2",
        'zero_no_patch': f"{splits_root}/original/{split}/class_0",
        'zero_patch': f"{splits_root}/dynamic_patches_left/{split}/class_0",
    }


# Paths to training data
train_folder_paths = _subset_folders("train")

# Paths to validation data
val_folder_paths = _subset_folders("val")


# Extract and save the activations separately for each model in model_paths (one file per model and subset, no averaging)
#train_activations = compute_activations_for_layers(model_paths, train_folder_paths, layers_of_interest, 'train')
#val_activations = compute_activations_for_layers(model_paths, val_folder_paths, layers_of_interest, 'val')

### Load the saved fc2 activations of the fine-tuned AlexNet, used later to train the SAE

In [None]:
import os
import numpy as np

def load_saved_activations(subset, activations_file_prefix, model_idx=None, dataset_type="train", num_models=7):
    """
    Load saved activations from files with the format: {dataset_type}_model{idx}_{subset}.npy.

    Parameters:
        subset (str): The data subset (e.g., "two_no_patch", "zero_no_patch", "zero_patch").
        activations_file_prefix (str): Currently unused; kept so existing calls keep working.
            The file name is built from dataset_type instead.
        model_idx (int or None): Specific model index to load (e.g., 1, 2, 3). If None, load all models.
        dataset_type (str): The dataset type ("train" or "val").
        num_models (int): Number of models to look for when model_idx is None
            (indices 1..num_models).

    Returns:
        np.ndarray or List[np.ndarray] or None: Loaded activations for a specific model
            (if model_idx is provided) or a list with the activations of every model whose
            file exists. None when the requested file, or every file, is missing.
    """
    def _load_one(idx):
        """Load the file of one model; return None (after logging) if it is missing."""
        drive_path = f'/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/individual/layer_fc2/alexnet_mnist_finetune_dlp/{dataset_type}_model{idx}_{subset}.npy'
        print(f"Loading activations from {drive_path}")

        if not os.path.exists(drive_path):
            print(f"Activations file {drive_path} does not exist.")
            return None

        # NOTE(review): allow_pickle=True lets a tampered file execute code on
        # load; only use this on files you wrote yourself.
        model_activations = np.load(drive_path, allow_pickle=True)
        print(f"Loaded activations for subset {subset}, dataset {dataset_type}, model {idx}. Shape: {model_activations.shape}")
        return model_activations

    if model_idx is not None:
        # Load activations for a specific model
        return _load_one(model_idx)

    # Load activations for all models, silently skipping missing files
    loaded = (_load_one(idx) for idx in range(1, num_models + 1))
    activations = [arr for arr in loaded if arr is not None]

    if len(activations) > 0:
        print(f"Loaded activations for {len(activations)} model(s).")
        return activations  # List of numpy arrays, one for each model found

    print("No activations files found.")
    return None


In [None]:
# Step 2: Define the sparse autoencoder (loss_function applies the L1 sparsity penalty; KL-divergence penalties are also defined)
class SparseAutoencoder(nn.Module):
    """Single-hidden-layer autoencoder with a sparsity penalty on the code.

    The encoder is ``Linear(in_dims, h_dims)`` followed by ReLU, the decoder a
    plain ``Linear(h_dims, in_dims)``. ``loss_function`` combines the MSE
    reconstruction error with the L1 penalty; two KL-divergence based
    penalties are available as alternative methods.

    Args:
        in_dims: Size of the input (and reconstruction) vectors.
        h_dims: Size of the hidden code.
        sparsity_lambda: Weight applied to every sparsity penalty.
        sparsity_target: Target average activation used by the KL penalties.
        xavier_norm_init: If True, both linear weights are initialised with
            ``nn.init.xavier_uniform_`` (biases keep the PyTorch default).
    """

    def __init__(self, in_dims, h_dims, sparsity_lambda=1.5, sparsity_target=0.20, xavier_norm_init=True):
        super(SparseAutoencoder, self).__init__()
        self.in_dims = in_dims  # Input dimension
        self.h_dims = h_dims  # Hidden (code) dimension
        self.sparsity_lambda = sparsity_lambda  # Weight for the sparsity penalty term
        self.sparsity_target = sparsity_target  # Target average activation

        # Encoder: projects the input into the hidden (sparse) space
        self.encoder = nn.Sequential(
            nn.Linear(self.in_dims, self.h_dims),
            nn.ReLU()
        )
        if xavier_norm_init:
            nn.init.xavier_uniform_(self.encoder[0].weight)

        # Decoder: linear reconstruction from the hidden representation
        self.decoder = nn.Sequential(
            nn.Linear(self.h_dims, self.in_dims),
        )
        if xavier_norm_init:
            nn.init.xavier_uniform_(self.decoder[0].weight)

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input batch ``x``."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

    def kl_sparsity_penalty(self, encoded):
        """KL-style penalty between ``sparsity_target`` and the mean absolute activation.

        The mean absolute activation of each hidden unit (over dim 0) is
        compared with the target using the Bernoulli KL formula, summed over
        units and scaled by ``sparsity_lambda``.
        """
        rho_hat = torch.mean(torch.abs(encoded), dim=0)  # Average absolute activation per hidden unit
        # ReLU codes are unbounded, so rho_hat can exceed 1, which made
        # log((1 - rho) / (1 - rho_hat)) NaN. Cap it just below 1 so the
        # penalty stays finite. Units above the cap receive no gradient from
        # this term.
        rho_hat = torch.clamp(rho_hat, max=1.0 - 1e-6)
        rho = torch.ones_like(rho_hat) * self.sparsity_target  # Target sparsity value
        epsilon = 1e-8  # Small value to avoid log(0)

        kl_divergence = rho * torch.log(rho / (rho_hat + epsilon)) + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat + epsilon))
        kl_divergence = torch.sum(kl_divergence)  # Sum over all hidden units

        return self.sparsity_lambda * kl_divergence

    def l1_sparsity_penalty(self, encoded):
        """``sparsity_lambda`` times the mean absolute activation over all entries of ``encoded``."""
        sparsity_loss = torch.mean(torch.abs(encoded))
        return self.sparsity_lambda * sparsity_loss

    def old_kl_sparsity_penalty(self, encoded):
        """Earlier KL penalty built on ``F.kl_div`` (kept for reference).

        NOTE(review): ``F.kl_div`` is given the log of the per-unit mean
        activations and the constant target vector; neither is normalised to
        a probability distribution here - confirm this is intended before use.
        """
        rho_hat = torch.mean(encoded, dim=0)  # Average activation for each hidden unit
        rho = torch.ones_like(rho_hat) * self.sparsity_target  # Target sparsity value
        epsilon = 1e-8  # Small value to avoid log(0)
        kl_divergence = F.kl_div((rho_hat + epsilon).log(), rho + epsilon, reduction='batchmean')
        return self.sparsity_lambda * kl_divergence

    def loss_function(self, decoded, original, encoded):
        """Total loss: MSE between ``decoded`` and ``original`` plus the L1 sparsity penalty."""
        mse_loss = F.mse_loss(decoded, original)
        sparsity_loss = self.l1_sparsity_penalty(encoded)
        return mse_loss + sparsity_loss


# Early stopping mechanism to prevent overfitting
class EarlyStopping:
    """Signal a stop once the loss has not improved for ``patience`` checks in a row.

    A loss counts as an improvement only when it is lower than the best loss
    seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=20, min_delta=0.001):
        self.patience = patience    # checks without improvement tolerated before stopping
        self.min_delta = min_delta  # margin a new loss must beat the best loss by
        self.best_loss = None       # lowest loss accepted as an improvement so far
        self.counter = 0            # consecutive checks without improvement

    def check(self, loss):
        """Record ``loss`` and return True when training should stop."""
        # The very first loss only sets the baseline.
        if self.best_loss is None:
            self.best_loss = loss
            return False

        threshold = self.best_loss - self.min_delta
        if loss < threshold:
            # Real improvement: remember it and start counting afresh.
            self.best_loss = loss
            self.counter = 0
            return False

        self.counter += 1
        if self.counter < self.patience:
            return False

        print("Early stopping triggered.")
        return True


# Training Function with Loss Tracking and Plotting
def train_autoencoder(autoencoder, train_data, val_data, num_epochs=500, batch_size=128, learning_rate=1e-4, validation_split=0.2, clip_gradients=True, max_grad_norm=0.5):
    """Train ``autoencoder`` with Adam and early stopping, then plot the loss curves.

    Args:
        autoencoder: Model whose forward returns ``(encoded, decoded)`` and
            which provides ``loss_function(decoded, original, encoded)``.
        train_data: 2-D NumPy array of training samples.
        val_data: 2-D NumPy array of validation samples.
        num_epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        learning_rate: Adam learning rate (weight decay is fixed at 1e-2).
        validation_split: Unused; kept for backward compatibility because the
            validation data is passed in explicitly.
        clip_gradients: Whether to clip the gradient norm each step.
        max_grad_norm: Maximum gradient norm when clipping is enabled.

    Returns:
        The trained autoencoder (the same object that was passed in).

    Raises:
        ValueError: if ``train_data`` or ``val_data`` contains no samples.
    """
    # Feed batches to whatever device the model lives on, so the two can
    # never disagree (the caller decides where the model is placed).
    device = next(autoencoder.parameters()).device

    # Create PyTorch datasets and dataloaders for training and validation
    train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(train_data).float())
    val_dataset = torch.utils.data.TensorDataset(torch.from_numpy(val_data).float())
    if len(train_dataset) == 0 or len(val_dataset) == 0:
        raise ValueError("train_data and val_data must each contain at least one sample.")
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Adam optimizer with learning rate and weight decay for regularization
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=learning_rate, weight_decay=1e-2)

    # Stop when the validation loss has not improved by 0.001 for 30 epochs
    early_stopping = EarlyStopping(patience=30, min_delta=0.001)

    # Per-epoch average losses, kept for the plot below
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Re-enter training mode every epoch; the validation pass below
        # switches the model to eval mode.
        autoencoder.train()
        total_train_loss = 0

        for (x_batch,) in train_loader:
            x_batch = x_batch.to(device)
            optimizer.zero_grad()
            encoded, decoded = autoencoder(x_batch)
            loss = autoencoder.loss_function(decoded, x_batch, encoded)
            loss.backward()

            # Apply gradient clipping if enabled
            if clip_gradients:
                torch.nn.utils.clip_grad_norm_(autoencoder.parameters(), max_grad_norm)

            optimizer.step()
            total_train_loss += loss.item()

        # Validation step after each epoch
        total_val_loss = 0
        autoencoder.eval()
        with torch.no_grad():
            for (x_batch,) in val_loader:
                x_batch = x_batch.to(device)
                encoded, decoded = autoencoder(x_batch)
                loss = autoencoder.loss_function(decoded, x_batch, encoded)
                total_val_loss += loss.item()

        # Average over the number of batches
        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}')

        # Check early stopping condition based on validation loss
        if early_stopping.check(avg_val_loss):
            break

    print("Autoencoder training completed")

    # Plot the training and validation loss over epochs
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss')
    plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

    return autoencoder


In [None]:
import os
import numpy as np
import torch
import gc
'''
# Define the directory to save autoencoders trained on normalized activations
save_sae_dir = '/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/Autoencoders/dynamic_left_patch'  # Directory to save the trained autoencoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(save_sae_dir, exist_ok=True)
layers_of_interest = ['fc2']

# Iterate through layers of interest
for layer in layers_of_interest:
    print(f'\nTraining autoencoders for layer {layer}')

    # Iterate through seeds (corresponding to models)
    for seed_idx, seed in enumerate([1, 11, 111, 1111, 11111, 111111, 1111111], start=1):
        print(f"\nProcessing activations for layer {layer}, seed {seed} (Model {seed_idx})")

        # Get training activations for the current seed and layer
        train_activations_list = []
        print(f"\nTrain files associated with seed {seed}:")
        for idx in range(1, 8): #!!!!!!!!!!!!!!
            for subset in ['two_no_patch', 'zero_no_patch', 'zero_patch'] :
                #train_file_path = f'/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/individual/layer_fc2/alexnet_mnist_finetune_fg/train_model{idx}_{subset}.npy'
                train_file_path = f'/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/individual/layer_fc2/dynamic_left_patch/train_model{idx}_{subset}.npy'
                if os.path.exists(train_file_path):
                    subset_activations = np.load(train_file_path)  # Load the .npy file
                    train_activations_list.append(subset_activations)
                    print(f"  - Subset: {subset}, File: {train_file_path}")
                else:
                    print(f"Warning: File not found - {train_file_path}")

        if len(train_activations_list) == 0:
            print(f"No train activations found for layer {layer}, seed {seed}. Skipping...")
            continue

        # Combine train activations
        combined_train_activations = np.vstack(train_activations_list)
        print(f"Combined train activations shape for layer {layer}, seed {seed}: {combined_train_activations.shape}")

        # Get validation activations for the current seed and layer
        val_activations_list = []
        print(f"\nValidation files associated with seed {seed}:")
        for idx in range(1, 8): #!!!!!!!!!
            for subset in ['two_no_patch', 'zero_no_patch', 'zero_patch']:
                val_file_path = f'/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/individual/layer_fc2/dynamic_left_patch/val_model{idx}_{subset}.npy'
                if os.path.exists(val_file_path):
                    subset_activations = np.load(val_file_path)  # Load the .npy file
                    val_activations_list.append(subset_activations)
                    print(f"  - Subset: {subset}, File: {val_file_path}")
                else:
                    print(f"Warning: File not found - {val_file_path}")

        if len(val_activations_list) == 0:
            print(f"No val activations found for layer {layer}, seed {seed}. Skipping...")
            continue

        # Combine val activations
        combined_val_activations = np.vstack(val_activations_list)
        print(f"Combined val activations shape for layer {layer}, seed {seed}: {combined_val_activations.shape}")

        # Set the random seed for reproducibility
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Define encoding dimension
        encoding_dim = 16000

        # Train autoencoder on unnormalized activations
        autoencoder = SparseAutoencoder(combined_train_activations.shape[1], encoding_dim).to(device)
        print(f"Training autoencoder for layer {layer} (unnormalized, seed {seed})")
        autoencoder = train_autoencoder(autoencoder, combined_train_activations, combined_val_activations,
                                        num_epochs=400, learning_rate=1e-5)
        # Print losses for inspection
        #print(f"Training Losses for layer {layer}, seed {seed}: {train_losses}")

        #print(f"Validation Losses for layer {layer}, seed {seed}: {val_losses}")
        # Save the trained autoencoder
        save_path_unnormalized = os.path.join(
            save_sae_dir, f'l1_dlp_autoencoder_layer_{layer}_seed_{seed}.pth'
        )
        torch.save(autoencoder.state_dict(), save_path_unnormalized)
        print(f"Saved autoencoder at {save_path_unnormalized}")

        # Clear memory
        del autoencoder
        torch.cuda.empty_cache()
        gc.collect()

print("\nAll autoencoders trained and saved successfully.")
'''



In [None]:
'''
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define the directory to save autoencoders trained on normalized activations
save_sae_dir = '/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/Autoencoders/dynamic_left_patch'  # Directory to save the trained autoencoder
os.makedirs(save_sae_dir, exist_ok=True)

# Iterate through layers of interest
for layer in layers_of_interest:
    print(f'\nTraining autoencoders for layer {layer}')

    # Iterate through seeds (corresponding to models)
    for seed_idx, seed in enumerate([1, 11, 111, 1111, 11111, 111111, 1111111], start=1):
        print(f"\nProcessing activations for layer {layer}, seed {seed} (Model {seed_idx})")

        # Get training activations for the current seed and layer
        train_activations_list = []
        print(f"\nTrain files associated with seed {seed}:")
        for subset in ['two_no_patch', 'zero_no_patch', 'zero_patch']:
            if subset in train_activations[layer]:
                subset_activations = train_activations[layer][subset][seed_idx - 1]  # Seed index starts from 1, Python lists are 0-based
                train_activations_list.append(subset_activations)
                print(f"  - Subset: {subset}, File: train_model{seed_idx}_{subset}.npy")
            else:
                print(f"Warning: No train activations found for subset {subset}, layer {layer}, seed {seed}.")

        if len(train_activations_list) == 0:
            print(f"No train activations found for layer {layer}, seed {seed}. Skipping...")
            continue

        # Combine train activations
        combined_train_activations = np.vstack(train_activations_list)
        print(f"Combined train activations shape for layer {layer}, seed {seed}: {combined_train_activations.shape}")

        # Get validation activations for the current seed and layer
        val_activations_list = []
        print(f"\nValidation files associated with seed {seed}:")
        for subset in ['two_no_patch', 'zero_no_patch', 'zero_patch']:
            if subset in val_activations[layer]:
                subset_activations = val_activations[layer][subset][seed_idx - 1]
                val_activations_list.append(subset_activations)
                print(f"  - Subset: {subset}, File: val_model{seed_idx}_{subset}.npy")
            else:
                print(f"Warning: No val activations found for subset {subset}, layer {layer}, seed {seed}.")

        if len(val_activations_list) == 0:
            print(f"No val activations found for layer {layer}, seed {seed}. Skipping...")
            continue

        # Combine val activations
        combined_val_activations = np.vstack(val_activations_list)
        print(f"Combined val activations shape for layer {layer}, seed {seed}: {combined_val_activations.shape}")

        # Set the random seed for reproducibility
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Define encoding dimension
        encoding_dim = 8000
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Train autoencoder on unnormalized activations
        autoencoder = SparseAutoencoder(combined_train_activations.shape[1], encoding_dim).to(device)
        print(f"Training autoencoder for layer {layer} (unnormalized, seed {seed})")
        autoencoder = train_autoencoder(autoencoder, combined_train_activations, combined_val_activations,
                                        num_epochs=500, learning_rate=1e-4)

        # Save the trained autoencoder
        save_path_unnormalized = os.path.join(
            save_sae_dir, f'dlp_autoencoder_layer_{layer}_seed_{seed}.pth'
        )
        torch.save(autoencoder.state_dict(), save_path_unnormalized)
        print(f"Saved unnormalized autoencoder at {save_path_unnormalized}")

        # Clear memory
        del autoencoder
        torch.cuda.empty_cache()
        gc.collect()

print("\nAll autoencoders trained and saved successfully.")
'''



### Load the saved SAE and project

In [None]:
# Load the pre-trained autoencoder for layer 6 (fc2) (from snippet 4)
def load_autoencoder(device, save_sae_dir, input_dims=4096, encoding_dim=8000):
    """Load a trained SparseAutoencoder checkpoint, freeze it and switch it to eval mode.

    Args:
        device: torch device the autoencoder is loaded onto.
        save_sae_dir: Path of the saved state dict. Despite the name, this is the
            checkpoint file that is handed to ``torch.load``.
        input_dims: Input width used to build the autoencoder (default 4096).
        encoding_dim: Width of the sparse code (default 8000).

    Returns:
        The SparseAutoencoder on ``device`` with every parameter frozen.
    """
    autoencoder = SparseAutoencoder(input_dims, encoding_dim)

    # map_location lets a checkpoint written on a GPU runtime load on a CPU-only one,
    # matching how load_model reads its weights.
    state_dict = torch.load(save_sae_dir, map_location=device)
    autoencoder.load_state_dict(state_dict)
    autoencoder = autoencoder.to(device)

    # The autoencoder is only used for projection/decoding, never trained further here.
    for param in autoencoder.parameters():
        param.requires_grad = False

    autoencoder.eval()
    print(f"Autoencoder loaded from {save_sae_dir} and frozen successfully.")
    return autoencoder




def load_model(saved_weights_path, device):
    """Build a two-class AlexNet, load saved weights and freeze everything before fc3.

    Args:
        saved_weights_path: Path of the saved state dict.
        device: torch device the weights are mapped to and the model is moved to.

    Returns:
        The AlexNet in eval mode; only the parameters of ``classifier[6]`` keep
        ``requires_grad=True``.
    """
    print(f"Loading model from {saved_weights_path}")
    model = models.alexnet(pretrained=False)  # architecture only, no pretrained weights
    model.classifier[-1] = nn.Linear(4096, 2)  # binary classification head

    model.load_state_dict(torch.load(saved_weights_path, map_location=device))
    model.to(device)

    # classifier[5] is a ReLU and owns no parameters, so the only parameters that can
    # stay trainable are those of classifier[6] (fc3); everything else is frozen.
    for name, param in model.named_parameters():
        param.requires_grad = name.startswith("classifier.6")

    model.eval()
    print("Model loaded and all layers up to fc2 are frozen")
    return model






In [None]:
from pathlib import Path

# Root of the MNIST experiment on the mounted drive; the folders below hang off it.
base_dir = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"
activation_dir = os.path.join(base_dir, "activations")
output_base_dir = os.path.join(base_dir, "outputs")
# Create the outputs folder (and any missing parents) when it is not there yet.
os.makedirs(output_base_dir, exist_ok=True)



# Define paths for pre-saved activations
def get_activation_path(folder_name, filename):
    """Return ``<activation_dir>/<folder_name>/<filename>.npy``."""
    npy_name = f"{filename}.npy"
    return os.path.join(activation_dir, folder_name, npy_name)


def extract_fc2_activations(model, dataloader):
    """
    Extract activations from the fc2 layer (classifier[4]) of the PyTorch AlexNet.

    The tensors are moved to the device the model's parameters live on, so the
    function does not depend on a module-level ``device`` variable.

    Args:
        model: Model exposing ``features``, ``avgpool`` and ``classifier``.
        dataloader: Iterable yielding ``(image_tensor, label)`` pairs; the label is ignored.

    Returns:
        numpy.ndarray: Output of classifier[4] for all images, stacked along axis 0.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    print("Extracting AlexNet activations for layer fc2...")
    model.eval()

    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # classifier[0..4]: everything up to and including fc2, before the following ReLU
    head_to_fc2 = model.classifier[:5]

    activations = []
    with torch.no_grad():
        for batch_idx, (image_tensor, _) in enumerate(dataloader):
            image_tensor = image_tensor.to(run_device)

            features = model.features(image_tensor)
            features = model.avgpool(features)
            features = torch.flatten(features, 1)  # flatten for the classifier input

            activations.append(head_to_fc2(features).cpu().numpy())

            if (batch_idx + 1) % 10 == 0:
                print(f"Processed {batch_idx + 1}/{len(dataloader)} batches")

    # Release cached memory once at the end; doing this per batch only slows the loop down.
    torch.cuda.empty_cache()
    gc.collect()

    if not activations:
        raise ValueError("extract_fc2_activations: the dataloader yielded no batches.")

    return np.vstack(activations)




def load_or_extract_fc2_activations(model, dataloader, folder_name, filename):
    """Return fc2 activations, reading them from disk when a saved copy exists.

    The cache location comes from ``get_activation_path(folder_name, filename)``.
    On a miss the activations are extracted from ``dataloader`` with ``model``
    and written to that location before being returned.
    """
    activation_path = get_activation_path(folder_name, filename)

    # Cache hit: nothing to compute.
    if os.path.exists(activation_path):
        print(f"Loading pre-saved AlexNet activations for {filename} from {activation_path}...")
        return np.load(activation_path, allow_pickle=True)

    # Cache miss: compute, persist, return.
    print(f"No pre-saved AlexNet activations found for {filename}. Extracting and saving...")
    extracted = extract_fc2_activations(model, dataloader)
    cache_folder = os.path.dirname(activation_path)
    os.makedirs(cache_folder, exist_ok=True)
    np.save(activation_path, extracted)
    print(f"Activations for layer fc2 saved to {activation_path}")
    return extracted




till here

In [None]:
# Notebook-wide torch device: CUDA when torch reports it available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from sklearn.metrics import accuracy_score

def _list_image_files(folder):
    """Return the sorted .jpg/.png file paths found anywhere under ``folder`` (empty if none)."""
    return sorted(
        os.path.join(root, file_name)
        for root, _dirs, file_names in os.walk(folder)
        for file_name in file_names
        if file_name.endswith(('.jpg', '.png'))
    )


def main():
    """Extract and save fc2 activations of every seed's AlexNet for every dataset split.

    For each model and dataset the activations are obtained through
    ``load_or_extract_fc2_activations`` and then written again to the
    ``dynamic_left_patch`` activation folder.

    Each dataset folder is scanned once; the listing is reused for the report
    and for building the dataset. File lists are sorted so the row order of the
    saved arrays does not depend on the filesystem's listing order.
    """
    data_root = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"

    seeds = [1, 11, 111, 1111, 11111, 111111, 1111111]
    model_paths = [
        f"{data_root}/models/initial_classifier/alexnet_mnist_dlp_seed_{seed}.pt"
        for seed in seeds
    ]

    splits_root = f"{data_root}/dataset_splits"
    dataset_paths = {
        "test_two_dlp_100": f"{splits_root}/dynamic_patches_left/test/class_2_100",
        "test_two_org_100": f"{splits_root}/original/test/class_2_100",
        "test_zero_dlp_100": f"{splits_root}/dynamic_patches_left/test/class_0_100",
        "test_zero_org_100": f"{splits_root}/original/test/class_0_100",
        "val_zero_org_200": f"{splits_root}/original/val/class_0_200",
        "val_zero_org_100": f"{splits_root}/original/val/class_0_100",
        "val_zero_org_50": f"{splits_root}/original/val/class_0_50",
        "val_zero_org_25": f"{splits_root}/original/val/class_0_25",
        "val_zero_dlp_200": f"{splits_root}/dynamic_patches_left/val/class_0_200",
        "val_zero_dlp_100": f"{splits_root}/dynamic_patches_left/val/class_0_100",
        "val_zero_dlp_50": f"{splits_root}/dynamic_patches_left/val/class_0_50",
        "val_zero_dlp_25": f"{splits_root}/dynamic_patches_left/val/class_0_25",
        #"val_zero_fg": '/content/drive/MyDrive/Masterthesis/Datasets/mnist/dataset_splits/foreground/val/class_0'
    }

    # Scan every dataset folder once and report what was found.
    image_files = {}
    for key, folder in dataset_paths.items():
        if not os.path.exists(folder):
            print(f"Path does not exist: {folder}")
            image_files[key] = []
            continue
        image_files[key] = _list_image_files(folder)
        if not image_files[key]:
            print(f"No valid image files found in: {folder}")
        else:
            print(f"Found {len(image_files[key])} files in {folder}")

    # Diagnostic listing; guarded so a missing folder is reported instead of raising.
    base_path = f"{splits_root}/original/val"
    if os.path.isdir(base_path):
        print("Contents of the directory:")
        for item in os.listdir(base_path):
            print(item)
    else:
        print(f"Path does not exist: {base_path}")

    # Prepare dataloaders; the label is 1 when "two" appears in the key, else 0.
    dataloaders = {}
    for key, paths in image_files.items():
        dataset = MnistDataset(data_files=paths, is_two=1 if "two" in key else 0)
        dataloaders[key] = DataLoader(dataset, batch_size=1, shuffle=False)

    # Directory for saving results
    sparse_output_dir = f"{data_root}/activations/dynamic_left_patch"
    os.makedirs(sparse_output_dir, exist_ok=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for model_path in model_paths:
        print(f"Processing model: {model_path}")
        model = load_model(model_path, device)
        model_tag = Path(model_path).stem

        for key, loader in dataloaders.items():
            print(f"Processing dataset: {key}")

            activations = load_or_extract_fc2_activations(
                model, loader, f'{key}_{model_tag}', f'dlp_fc2_activations_{key}_{model_tag}'
            )

            # Second copy in the flat folder next to the other dynamic_left_patch activations.
            activation_path = os.path.join(sparse_output_dir, f"fc2_activations_{key}_{model_tag}.npy")
            np.save(activation_path, activations)
            print(f"Activations for {key} saved to: {activation_path}")

main()


### 200 — neuron-muting experiment using the `*_200` validation activations

In [None]:
from sklearn.metrics import accuracy_score
# Torch device for this cell: CUDA when torch reports it available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function 12 to classify decoded activations
def classify_decoded_activations(model, decoded_activations):
    """Classify decoded fc2 activations with the model's classifier[5] and classifier[6].

    All rows are pushed through the two layers in one batch under ``torch.no_grad``,
    on the device that holds classifier[6].

    Args:
        model: Model whose ``classifier[5]`` (ReLU) and ``classifier[6]`` (final linear
            layer) are applied.
        decoded_activations: Array-like with one activation vector per row.

    Returns:
        list[int]: Predicted class index for every row (empty list for empty input).
    """
    batch = np.asarray(decoded_activations, dtype=np.float32)
    if batch.size == 0:
        return []

    relu_layer, final_layer = model.classifier[5], model.classifier[6]
    target_device = next(final_layer.parameters()).device

    with torch.no_grad():  # classifier[6] may be trainable; no graph is needed here
        logits = final_layer(relu_layer(torch.from_numpy(batch).to(target_device)))

    # softmax is monotonic, so the argmax of the logits is the argmax of the softmax
    return logits.argmax(dim=1).tolist()

def save_neurons_by_correlation(val_patch_activations, val_no_patch_activations, folder_name, filename="neurons_by_correlation_200.csv"):
    """Rank neurons by their Pearson correlation with the patch label and save them as CSV.

    The label is 1 for every row of ``val_patch_activations`` and 0 for every row of
    ``val_no_patch_activations``. Correlations are computed for all neurons at once;
    neurons whose activation is constant (or a constant label) get a correlation of 0.

    Args:
        val_patch_activations: 2-D array, one row per patch image, one column per neuron.
        val_no_patch_activations: 2-D array with the same number of columns for images
            without patch.
        folder_name: Output folder; created when missing.
        filename: Name of the CSV file inside ``folder_name``.

    Returns:
        str: Path of the CSV with columns ``Neuron_Index`` and ``Correlation``,
        sorted by correlation in descending order.
    """
    os.makedirs(folder_name, exist_ok=True)

    num_neurons = val_patch_activations.shape[1]
    print(f"Number of val patch neurons: {num_neurons}")
    print(f"Number of val patch images: {val_patch_activations.shape[0]}")

    # Binary label vector pp (1 for patch, 0 for no patch)
    pp = np.concatenate([np.ones(val_patch_activations.shape[0]), np.zeros(val_no_patch_activations.shape[0])])
    print(f"Number of pp labels: {len(pp)}")

    # Rows follow the same order as pp; float64 matches what np.corrcoef computes in.
    acts = np.concatenate([val_patch_activations, val_no_patch_activations], axis=0).astype(np.float64)

    correlations = np.zeros(num_neurons)
    if acts.shape[0] > 0 and pp.max() > pp.min():
        varying = acts.max(axis=0) > acts.min(axis=0)  # constant neurons keep correlation 0
        acts_centered = acts[:, varying] - acts[:, varying].mean(axis=0)
        pp_centered = pp - pp.mean()
        covariance = pp_centered @ acts_centered
        norms = np.sqrt((acts_centered ** 2).sum(axis=0) * (pp_centered ** 2).sum())
        with np.errstate(divide="ignore", invalid="ignore"):
            correlations[varying] = np.clip(covariance / norms, -1.0, 1.0)

    # Replace any remaining NaN (e.g. from numerical underflow) with 0
    correlations = np.nan_to_num(correlations)
    print('shape of correlations', correlations.shape)

    neuron_data = pd.DataFrame({
        "Neuron_Index": np.arange(num_neurons),
        "Correlation": correlations
    }).sort_values(by="Correlation", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"Neuron correlations saved at: {csv_path}")

    return csv_path

def plot_correlation_histogram(original, projected, title):
    """Plot a histogram of the column-wise correlation between two 2-D arrays.

    Column ``i`` of ``original`` is correlated with column ``i`` of ``projected``;
    columns that are constant in either array count as correlation 0.
    """
    per_column = []
    for col in range(original.shape[1]):
        source_col = original[:, col]
        target_col = projected[:, col]
        if np.std(source_col) > 0 and np.std(target_col) > 0:
            per_column.append(np.corrcoef(source_col, target_col)[0, 1])
        else:
            per_column.append(0)  # constant feature

    correlations = np.nan_to_num(per_column)  # NaN -> 0
    plt.hist(correlations, bins=50, alpha=0.7)
    plt.title(title)
    plt.xlabel("Correlation")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


# Project activations into sparse space
def project_activations(autoencoder, activations, device):
    """Run ``activations`` through ``autoencoder.encoder`` and return the result as numpy."""
    print("Projecting Alexnet activations into SAE sparse space...")
    encoder_input = torch.from_numpy(activations).to(device).float()
    with torch.no_grad():
        sparse_codes = autoencoder.encoder(encoder_input)
    return sparse_codes.cpu().numpy()

# Save all neuron differences and indexes in descending order
def save_all_neurons_to_csv(abs_diff, folder_name, filename="all_neuron_differences_200.csv"):
    """Write every neuron index with its difference value to a CSV, largest difference first.

    Returns the path of the written CSV.
    """
    print(f"Saving all neuron differences to CSV file: {filename}")

    neuron_indices = np.arange(len(abs_diff))
    ranked = pd.DataFrame(
        {"Neuron_Index": neuron_indices, "Activation_Difference": abs_diff}
    ).sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    ranked.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

    return csv_path



def save_top_neurons_to_csv(abs_diff, top_neurons, folder_name, filename="dlp_top_neurons_200.csv"):
    """Write all neurons with their difference value and a muting flag to a CSV.

    ``Selected_for_Muting`` is True for indices contained in ``top_neurons``.
    Rows are ordered by difference, largest first.
    """
    print(f"Saving top neurons to CSV file: {filename}")

    table = (
        pd.DataFrame({
            "Neuron_Index": range(len(abs_diff)),
            "Activation_Difference": abs_diff
        })
        .assign(Selected_for_Muting=lambda frame: frame["Neuron_Index"].isin(top_neurons))
        .sort_values(by="Activation_Difference", ascending=False)
    )

    csv_path = os.path.join(folder_name, filename)
    table.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

# Function to load top neurons from CSV based on a percentage
def load_top_neurons_from_csv(folder_name, filename, percentage):
    """
    Load the first ``percentage`` percent of neuron indices from a ranked CSV file.

    The CSV is expected to be sorted already (best-ranked neuron first) and to have a
    ``Neuron_Index`` column; the row order of the file decides which neurons are "top".

    Args:
        folder_name: Folder containing the CSV.
        filename: CSV file name.
        percentage: Share of rows to return, in percent. Values outside 0-100 are
            clamped to "none" / "all".

    Returns:
        numpy.ndarray: The ``Neuron_Index`` values of the selected rows.
    """
    csv_path = os.path.join(folder_name, filename)
    neuron_data = pd.read_csv(csv_path)
    total = len(neuron_data)

    # Multiply before dividing and round away float noise: int(100 * (29 / 100)) is 28,
    # because 29 / 100 is not exactly representable.
    top_count = int(round(total * percentage / 100, 9))
    top_count = max(0, min(total, top_count))

    top_neurons = neuron_data.iloc[:top_count]["Neuron_Index"].values
    print(f"Loaded top {percentage}% neurons ({top_count} neurons) for muting.")
    return top_neurons


def classify_with_alexnet(model, activations):
    """
    Classify fc2 activations with the model's own classifier[5] (ReLU) and classifier[6] (fc3).

    All rows are processed in one batch under ``torch.no_grad``, on the device that
    holds classifier[6].

    Args:
        model: Model whose last two classifier layers are applied.
        activations: Array-like with one fc2 activation vector per row.

    Returns:
        list[int]: Predicted class index for every row (empty list for empty input).
    """
    batch = np.asarray(activations, dtype=np.float32)
    if batch.size == 0:
        return []

    relu_layer, fc3 = model.classifier[5], model.classifier[6]
    target_device = next(fc3.parameters()).device

    with torch.no_grad():  # fc3 may be trainable; no graph is needed for inference
        logits = fc3(relu_layer(torch.from_numpy(batch).to(target_device)))

    # softmax is monotonic, so the argmax of the logits is the argmax of the softmax
    return logits.argmax(dim=1).tolist()

# Function to calculate accuracy per group
def calculate_group_accuracy(predictions, true_labels):
    """Return the accuracy of ``predictions`` against ``true_labels`` via ``accuracy_score``."""
    group_accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
    return group_accuracy


def _decode_with_muted_neurons(autoencoder, projected, muted_neurons, device):
    """Zero the given sparse neurons in a copy of ``projected`` and decode it with the SAE decoder."""
    muted = projected.copy()
    muted[:, muted_neurons] = 0
    with torch.no_grad():
        decoded = autoencoder.decoder(torch.from_numpy(muted).to(device).float())
    return decoded.cpu().numpy()


def main():
    """Run the correlation-based neuron-muting sweep for every seed and save one results CSV.

    Per seed: load the classifier and its sparse autoencoder, load the saved fc2
    activations of six image groups, record the accuracy before muting, project the
    activations into the sparse space, rank the sparse neurons by correlation with the
    patch label on the two validation groups, and then, for 0% to 50% muted neurons,
    decode and re-classify every group.

    The neuron ranking depends only on the seed, so it is computed and written once per
    seed rather than once per percentage. Result rows are collected in a list and turned
    into a DataFrame at the end.
    """
    seeds = [1, 11, 111, 1111, 11111, 111111, 1111111]

    data_root = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"
    activation_root = f"{data_root}/activations/dynamic_left_patch"

    # Result-column prefix -> (dataset tag inside the activation file name, true class label)
    groups = {
        "Val_Patch": ("val_zero_dlp_200", 0),
        "Val_NoPatch": ("val_zero_org_200", 0),
        "Test_Two_Patch": ("test_two_dlp_100", 1),
        "Test_Two_NoPatch": ("test_two_org_100", 1),
        "Test_Zero_Patch": ("test_zero_dlp_100", 0),
        "Test_Zero_NoPatch": ("test_zero_org_100", 0),
    }
    test_groups = ["Test_Two_Patch", "Test_Two_NoPatch", "Test_Zero_Patch", "Test_Zero_NoPatch"]

    # Output folder for results
    folder_name = f"{data_root}/activations/difference_analysis/dynamic_left_patch"
    os.makedirs(folder_name, exist_ok=True)

    result_columns = [
        "Seed", "Percentage",
        "Val_Patch_Before", "Val_NoPatch_Before",
        "Val_Patch_After", "Val_NoPatch_After",
        "Test_Two_Patch_Before", "Test_Two_NoPatch_Before",
        "Test_Two_Patch_After", "Test_Two_NoPatch_After",
        "Test_Zero_Patch_Before", "Test_Zero_NoPatch_Before",
        "Test_Zero_Patch_After", "Test_Zero_NoPatch_After",
        "Worst_Acc_Before", "Worst_Acc_After",
        "Avg_Acc_Before", "Avg_Acc_After"
    ]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    records = []

    for seed in seeds:
        print(f"\nProcessing Seed {seed}")

        model_path = f"{data_root}/models/initial_classifier/alexnet_mnist_dlp_seed_{seed}.pt"
        sae_path = f"{data_root}/activations/Autoencoders/dynamic_left_patch/dlp_autoencoder_layer_fc2_seed_{seed}.pth"
        model = load_model(model_path, device)
        autoencoder = load_autoencoder(device, sae_path)

        # Load pre-saved activations (allow_pickle kept as in the files' other readers)
        activations = {
            prefix: np.load(
                f"{activation_root}/fc2_activations_{tag}_alexnet_mnist_dlp_seed_{seed}.npy",
                allow_pickle=True,
            )
            for prefix, (tag, _label) in groups.items()
        }

        # Accuracy "before muting": the untouched fc2 activations through the classifier head
        acc_before = {}
        for prefix, (_tag, label) in groups.items():
            predictions = classify_with_alexnet(model, activations[prefix])
            acc_before[prefix] = calculate_group_accuracy(predictions, [label] * len(predictions))

        # Project activations into the sparse space
        projected = {
            prefix: project_activations(autoencoder, group_activations, device)
            for prefix, group_activations in activations.items()
        }

        # Rank neurons once per seed; the ranking does not depend on the percentage.
        # (A mean-difference ranking via save_all_neurons_to_csv is the alternative to this.)
        correlation_filename = f"neurons_by_correlation_seed_{seed}_200.csv"
        save_neurons_by_correlation(
            projected["Val_Patch"], projected["Val_NoPatch"], folder_name, filename=correlation_filename
        )

        # NOTE(review): the worst-group accuracy only considers the two class-2 test groups,
        # while the average uses all four test groups -- confirm this is intended.
        worst_acc_before = min(acc_before["Test_Two_Patch"], acc_before["Test_Two_NoPatch"])
        avg_acc_before = sum(acc_before[prefix] for prefix in test_groups) / 4

        for percentage in range(0, 51):  # 0% to 50% inclusive
            print(f"Testing with {percentage}% muting...")

            top_neurons = load_top_neurons_from_csv(folder_name, correlation_filename, percentage)

            # Mute, decode and classify every group
            acc_after = {}
            for prefix, (_tag, label) in groups.items():
                decoded = _decode_with_muted_neurons(autoencoder, projected[prefix], top_neurons, device)
                predictions = classify_decoded_activations(model, decoded)
                acc_after[prefix] = calculate_group_accuracy(predictions, [label] * len(predictions))

            record = {"Seed": seed, "Percentage": percentage}
            for prefix in groups:
                record[f"{prefix}_Before"] = acc_before[prefix]
                record[f"{prefix}_After"] = acc_after[prefix]
            record["Worst_Acc_Before"] = worst_acc_before
            record["Worst_Acc_After"] = min(acc_after["Test_Two_Patch"], acc_after["Test_Two_NoPatch"])
            record["Avg_Acc_Before"] = avg_acc_before
            record["Avg_Acc_After"] = sum(acc_after[prefix] for prefix in test_groups) / 4
            records.append(record)

    # Build the frame once; result_columns fixes the column order of the CSV.
    results_df = pd.DataFrame(records, columns=result_columns)
    results_csv_path = os.path.join(folder_name, "dynamic_left_patch_200.csv")
    results_df.to_csv(results_csv_path, index=False)
    print(f"All results saved to {results_csv_path}")

if __name__ == "__main__":
    main()

### 100 — the same helpers with `*_100` default file names

In [None]:
from sklearn.metrics import accuracy_score
# Torch device for this cell: CUDA when torch reports it available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function 12 to classify decoded activations
def classify_decoded_activations(model, decoded_activations):
    """Classify decoded fc2 activations with the model's classifier[5] and classifier[6].

    All rows are pushed through the two layers in one batch under ``torch.no_grad``,
    on the device that holds classifier[6].

    Args:
        model: Model whose ``classifier[5]`` (ReLU) and ``classifier[6]`` (final linear
            layer) are applied.
        decoded_activations: Array-like with one activation vector per row.

    Returns:
        list[int]: Predicted class index for every row (empty list for empty input).
    """
    batch = np.asarray(decoded_activations, dtype=np.float32)
    if batch.size == 0:
        return []

    relu_layer, final_layer = model.classifier[5], model.classifier[6]
    target_device = next(final_layer.parameters()).device

    with torch.no_grad():  # classifier[6] may be trainable; no graph is needed here
        logits = final_layer(relu_layer(torch.from_numpy(batch).to(target_device)))

    # softmax is monotonic, so the argmax of the logits is the argmax of the softmax
    return logits.argmax(dim=1).tolist()

def save_neurons_by_correlation(val_patch_activations, val_no_patch_activations, folder_name, filename="neurons_by_correlation_100.csv"):
    """Rank neurons by their Pearson correlation with the patch label and save them as CSV.

    The label is 1 for every row of ``val_patch_activations`` and 0 for every row of
    ``val_no_patch_activations``. Correlations are computed for all neurons at once;
    neurons whose activation is constant (or a constant label) get a correlation of 0.

    Args:
        val_patch_activations: 2-D array, one row per patch image, one column per neuron.
        val_no_patch_activations: 2-D array with the same number of columns for images
            without patch.
        folder_name: Output folder; created when missing.
        filename: Name of the CSV file inside ``folder_name``.

    Returns:
        str: Path of the CSV with columns ``Neuron_Index`` and ``Correlation``,
        sorted by correlation in descending order.
    """
    os.makedirs(folder_name, exist_ok=True)

    num_neurons = val_patch_activations.shape[1]
    print(f"Number of val patch neurons: {num_neurons}")
    print(f"Number of val patch images: {val_patch_activations.shape[0]}")

    # Binary label vector pp (1 for patch, 0 for no patch)
    pp = np.concatenate([np.ones(val_patch_activations.shape[0]), np.zeros(val_no_patch_activations.shape[0])])
    print(f"Number of pp labels: {len(pp)}")

    # Rows follow the same order as pp; float64 matches what np.corrcoef computes in.
    acts = np.concatenate([val_patch_activations, val_no_patch_activations], axis=0).astype(np.float64)

    correlations = np.zeros(num_neurons)
    if acts.shape[0] > 0 and pp.max() > pp.min():
        varying = acts.max(axis=0) > acts.min(axis=0)  # constant neurons keep correlation 0
        acts_centered = acts[:, varying] - acts[:, varying].mean(axis=0)
        pp_centered = pp - pp.mean()
        covariance = pp_centered @ acts_centered
        norms = np.sqrt((acts_centered ** 2).sum(axis=0) * (pp_centered ** 2).sum())
        with np.errstate(divide="ignore", invalid="ignore"):
            correlations[varying] = np.clip(covariance / norms, -1.0, 1.0)

    # Replace any remaining NaN (e.g. from numerical underflow) with 0
    correlations = np.nan_to_num(correlations)
    print('shape of correlations', correlations.shape)

    neuron_data = pd.DataFrame({
        "Neuron_Index": np.arange(num_neurons),
        "Correlation": correlations
    }).sort_values(by="Correlation", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"Neuron correlations saved at: {csv_path}")

    return csv_path

def plot_correlation_histogram(original, projected, title):
    """
    Show a histogram of per-column correlations between two 2-D arrays.

    Column ``i`` of ``original`` is correlated with column ``i`` of ``projected``;
    columns that are constant in either array, and NaN results, count as 0.

    Args:
        original: 2-D array; its column count decides how many correlations are computed.
        projected: 2-D array indexed by the same column positions.
        title: Title drawn above the histogram.
    """
    per_column = []
    for col in range(original.shape[1]):
        source_col = original[:, col]
        target_col = projected[:, col]
        if np.std(source_col) > 0 and np.std(target_col) > 0:
            per_column.append(np.corrcoef(source_col, target_col)[0, 1])
        else:
            per_column.append(0)  # constant feature: correlation undefined

    per_column = np.nan_to_num(per_column)  # map any NaN to 0

    plt.hist(per_column, bins=50, alpha=0.7)
    plt.title(title)
    plt.xlabel("Correlation")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


# Project activations into sparse space
def project_activations(autoencoder, activations, device):
    """
    Encode a NumPy activation array with ``autoencoder.encoder``.

    Args:
        autoencoder: Object exposing a callable ``encoder`` attribute.
        activations: NumPy array of activations to encode.
        device: Torch device the input is moved to before encoding.

    Returns:
        The encoder output as a NumPy array (moved back to the CPU).
    """
    print("Projecting Alexnet activations into SAE sparse space...")
    encoder_input = torch.from_numpy(activations)
    encoder_input = encoder_input.to(device)
    encoder_input = encoder_input.float()
    # Inference only: no gradients are needed for the encoding.
    with torch.no_grad():
        sparse_codes = autoencoder.encoder(encoder_input)
    sparse_codes = sparse_codes.cpu()
    return sparse_codes.numpy()

# Save all neuron differences and indexes in descending order
def save_all_neurons_to_csv(abs_diff, folder_name, filename="all_neuron_differences_100.csv"):
    """
    Save all neuron indexes sorted by their differences (descending order) to a CSV file.

    Args:
        abs_diff: 1-D sequence with one difference value per neuron; position ``i``
            is written as ``Neuron_Index`` ``i``.
        folder_name: Output directory; created if it does not exist.
        filename: Name of the CSV file written inside ``folder_name``.

    Returns:
        Path of the written CSV (columns ``Neuron_Index`` and ``Activation_Difference``).
    """
    print(f"Saving all neuron differences to CSV file: {filename}")

    # Create the output folder so the first run on a fresh directory does not fail.
    # An empty folder name means "current directory" and needs no creation.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    # One row per neuron, largest difference first.
    neuron_data = pd.DataFrame({
        "Neuron_Index": np.arange(len(abs_diff)),
        "Activation_Difference": abs_diff
    }).sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

    return csv_path



def save_top_neurons_to_csv(abs_diff, top_neurons, folder_name, filename="dlp_top_neurons_100.csv"):
    """
    Save every neuron's difference value to a CSV file and flag the selected ones.

    Args:
        abs_diff: 1-D sequence with one difference value per neuron; position ``i``
            is written as ``Neuron_Index`` ``i``.
        top_neurons: Collection of neuron indices to flag in ``Selected_for_Muting``.
        folder_name: Output directory; created if it does not exist.
        filename: Name of the CSV file written inside ``folder_name``.

    Returns:
        None. The CSV has columns ``Neuron_Index``, ``Activation_Difference`` and
        ``Selected_for_Muting``, sorted by ``Activation_Difference`` descending.
    """
    print(f"Saving top neurons to CSV file: {filename}")

    # Create the output folder so the first run on a fresh directory does not fail.
    # An empty folder name means "current directory" and needs no creation.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    neuron_data = pd.DataFrame({
        "Neuron_Index": np.arange(len(abs_diff)),
        "Activation_Difference": abs_diff
    })

    # Flag the neurons listed in `top_neurons`; the share selected is decided by the caller.
    neuron_data["Selected_for_Muting"] = neuron_data["Neuron_Index"].isin(top_neurons)

    # Largest difference first.
    neuron_data = neuron_data.sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

# Function to load top neurons from CSV based on a percentage
def load_top_neurons_from_csv(folder_name, filename, percentage):
    """
    Load top neurons based on the specified percentage from the saved CSV file.

    The CSV is read as-is and its first rows are taken, so it must already be
    sorted with the most relevant neurons first.

    Args:
        folder_name: Directory containing the CSV.
        filename: CSV file name; must contain a ``Neuron_Index`` column.
        percentage: Share of rows to select, expressed in percent (e.g. ``29`` for 29%).

    Returns:
        NumPy array with the ``Neuron_Index`` values of the selected leading rows.
    """
    csv_path = os.path.join(folder_name, filename)
    neuron_data = pd.read_csv(csv_path)

    # Multiply before dividing: `n * (percentage / 100)` loses a neuron to float
    # rounding for some inputs (100 * (29 / 100) == 28.999999999999996 -> 28).
    top_count = int(len(neuron_data) * percentage / 100)

    # The leading rows of the pre-sorted file are the top neurons.
    top_neurons = neuron_data.iloc[:top_count]["Neuron_Index"].values
    print(f"Loaded top {percentage}% neurons ({top_count} neurons) for muting.")
    return top_neurons


def classify_with_alexnet(model, activations):
    """
    Classify images using the original AlexNet classifier on the fc2 activations.

    Each activation vector is passed through ``model.classifier[5]`` and
    ``model.classifier[6]``; the predicted class is the argmax of the softmax
    over the resulting outputs.

    Args:
        model: Model exposing an indexable ``classifier`` whose entries 5 and 6 are callable.
        activations: Iterable of 1-D NumPy activation vectors (e.g. a 2-D array, iterated row by row).

    Returns:
        List of predicted class indices (Python ints), one per activation vector.
    """
    head = model.classifier[6]
    # Run on whatever device the classifier head lives on instead of relying on a
    # module-level `device` global that may not exist or may not match the model.
    first_param = next(head.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    # Inference only: skip building autograd graphs for every sample.
    with torch.no_grad():
        for activation in activations:
            activation_tensor = torch.from_numpy(activation).float().to(target_device)
            relu_output = model.classifier[5](activation_tensor)
            output = head(relu_output)
            # Softmax is kept so predictions match earlier runs exactly.
            prediction = torch.argmax(torch.nn.functional.softmax(output, dim=0)).item()
            predictions.append(prediction)
    return predictions

# Function to calculate accuracy per group
def calculate_group_accuracy(predictions, true_labels):
    """Return the accuracy of ``predictions`` against ``true_labels`` as computed by ``accuracy_score``."""
    group_accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
    return group_accuracy


def main():
    """
    Sweep SAE neuron muting from 0% to 50% for every seed and record group accuracies.

    For each seed this loads the classifier, the autoencoder and six pre-computed
    fc2 activation sets (validation, test "two" and test "zero", each with and
    without the patch). It measures accuracy before any intervention, ranks the
    autoencoder's latent neurons by correlation with patch presence on the
    validation activations, and then, for each muting percentage, zeroes the
    top-ranked neurons, decodes back and re-classifies.

    All rows are written once at the end to ``dynamic_left_patch_100.csv`` in the
    analysis folder. Nothing is returned.
    """
    seeds = [1, 11, 111, 1111, 11111, 111111, 1111111]

    base_dir = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"
    activation_dir = f"{base_dir}/activations/dynamic_left_patch"

    # (key, results-column prefix, activation file tag, class index used as ground truth)
    groups = [
        ("val_patch", "Val_Patch", "val_zero_dlp", 0),
        ("val_no_patch", "Val_NoPatch", "val_zero_org", 0),
        ("test_two_patch", "Test_Two_Patch", "test_two_dlp", 1),
        ("test_two_no_patch", "Test_Two_NoPatch", "test_two_org", 1),
        ("test_zero_patch", "Test_Zero_Patch", "test_zero_dlp", 0),
        ("test_zero_no_patch", "Test_Zero_NoPatch", "test_zero_org", 0),
    ]

    # Output folder for results
    folder_name = f"{base_dir}/activations/difference_analysis/dynamic_left_patch"
    os.makedirs(folder_name, exist_ok=True)

    result_columns = [
        "Seed", "Percentage",
        "Val_Patch_Before", "Val_NoPatch_Before",
        "Val_Patch_After", "Val_NoPatch_After",
        "Test_Two_Patch_Before", "Test_Two_NoPatch_Before",
        "Test_Two_Patch_After", "Test_Two_NoPatch_After",
        "Test_Zero_Patch_Before", "Test_Zero_NoPatch_Before",
        "Test_Zero_Patch_After", "Test_Zero_NoPatch_After",
        "Worst_Acc_Before", "Worst_Acc_After",
        "Avg_Acc_Before", "Avg_Acc_After"
    ]
    # Rows are collected in a list and turned into a DataFrame once at the end;
    # growing a DataFrame with pd.concat inside the loop is quadratic.
    records = []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for seed in seeds:
        print(f"\nProcessing Seed {seed}")

        model_path = f"{base_dir}/models/initial_classifier/alexnet_mnist_dlp_seed_{seed}.pt"
        sae_path = f"{base_dir}/activations/Autoencoders/dynamic_left_patch/dlp_autoencoder_layer_fc2_seed_{seed}.pth"

        model = load_model(model_path, device)
        autoencoder = load_autoencoder(device, sae_path)

        # NOTE(review): allow_pickle=True executes pickled objects on load; it is kept
        # because the stored dtype is not visible here. Only load trusted files.
        activations = {
            key: np.load(
                f"{activation_dir}/fc2_activations_{tag}_100_alexnet_mnist_dlp_seed_{seed}.npy",
                allow_pickle=True,
            )
            for key, _, tag, _ in groups
        }

        # Accuracy of the untouched classifier on the raw activations.
        acc_before = {}
        for key, _, _, label in groups:
            preds = classify_with_alexnet(model, activations[key])
            acc_before[key] = calculate_group_accuracy(preds, [label] * len(preds))

        projected = {
            key: project_activations(autoencoder, activations[key], device)
            for key, _, _, _ in groups
        }

        # The ranking depends only on the validation activations, not on the muting
        # percentage, so it is computed and written once per seed.
        # NOTE(review): the file name says "_200" while the inputs and the results
        # file in this cell say "100"; kept unchanged so existing files still match.
        correlation_filename = f"neurons_by_correlation_seed_{seed}_200.csv"
        save_neurons_by_correlation(
            projected["val_patch"], projected["val_no_patch"], folder_name, filename=correlation_filename
        )

        for percentage in range(0, 51):  # 0% to 50% inclusive
            print(f"Testing with {percentage}% muting...")

            top_neurons = load_top_neurons_from_csv(folder_name, correlation_filename, percentage)

            acc_after = {}
            for key, _, _, label in groups:
                # Zero the selected latent neurons on a copy so the projection is reusable.
                muted = projected[key].copy()
                muted[:, top_neurons] = 0

                # no_grad keeps the decoder output free of autograd history, which
                # `.numpy()` requires.
                with torch.no_grad():
                    decoded = autoencoder.decoder(torch.from_numpy(muted).to(device).float()).cpu().numpy()

                preds = classify_decoded_activations(model, decoded)
                acc_after[key] = calculate_group_accuracy(preds, [label] * len(preds))

            row = {"Seed": seed, "Percentage": percentage}
            for key, prefix, _, _ in groups:
                row[f"{prefix}_Before"] = acc_before[key]
                row[f"{prefix}_After"] = acc_after[key]

            for suffix, acc in (("Before", acc_before), ("After", acc_after)):
                # NOTE(review): "worst" only looks at the two test "two" groups, while
                # the average covers all four test groups - confirm this is intended.
                row[f"Worst_Acc_{suffix}"] = min(acc["test_two_patch"], acc["test_two_no_patch"])
                row[f"Avg_Acc_{suffix}"] = (acc["test_two_patch"] + acc["test_two_no_patch"] +
                                            acc["test_zero_patch"] + acc["test_zero_no_patch"]) / 4

            records.append(row)

    # Save results to CSV
    results_df = pd.DataFrame(records, columns=result_columns)
    results_csv_path = os.path.join(folder_name, "dynamic_left_patch_100.csv")
    results_df.to_csv(results_csv_path, index=False)
    print(f"All results saved to {results_csv_path}")

# Run the sweep when this cell executes as the top-level module.
if __name__ == "__main__":
    main()

### Correlation-based muting sweep using the `_50` validation activations

In [None]:
# accuracy_score is what calculate_group_accuracy in this cell calls.
from sklearn.metrics import accuracy_score
# Compute device for this cell: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function 12 to classify decoded activations
def classify_decoded_activations(model, decoded_activations):
    """
    Classify decoded activations using the final layers of the model.

    Each activation vector is passed through ``model.classifier[5]`` and
    ``model.classifier[6]``; the predicted class is the argmax of the softmax
    over the resulting outputs.

    Args:
        model: Model exposing an indexable ``classifier`` whose entries 5 and 6 are callable.
        decoded_activations: Iterable of 1-D NumPy activation vectors (e.g. a 2-D array, iterated row by row).

    Returns:
        List of predicted class indices (Python ints), one per activation vector.
    """
    head = model.classifier[6]
    # Run on whatever device the classifier head lives on instead of relying on a
    # module-level `device` global that may not exist or may not match the model.
    first_param = next(head.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    # Inference only: skip building autograd graphs for every sample.
    with torch.no_grad():
        for activation in decoded_activations:
            activation_tensor = torch.from_numpy(activation).float().to(target_device)

            # classifier[5] (ReLU per the original comments), then classifier[6] (final linear layer)
            relu_output = model.classifier[5](activation_tensor)
            output = head(relu_output)

            # Softmax is kept so predictions match earlier runs exactly.
            prediction = torch.argmax(torch.nn.functional.softmax(output, dim=0)).item()
            predictions.append(prediction)

    return predictions

def save_neurons_by_correlation(val_patch_activations, val_no_patch_activations, folder_name, filename="neurons_by_correlation_50.csv"):
    """
    Rank neurons by the correlation of their activation with patch presence and save the ranking.

    A binary label vector (1 for every patch sample, 0 for every no-patch sample)
    is correlated with each neuron's activations over both sample sets using
    ``np.corrcoef``. Neurons with constant activations, and NaN correlations,
    are recorded as 0.

    Args:
        val_patch_activations: 2-D array, one row per patch sample and one column per neuron.
        val_no_patch_activations: 2-D array for the no-patch samples, indexed by the same neuron columns.
        folder_name: Output directory; created if it does not exist.
        filename: Name of the CSV file written inside ``folder_name``.

    Returns:
        Path of the written CSV. Its columns are ``Neuron_Index`` and ``Correlation``,
        sorted by signed correlation in descending order (negative correlations last).
    """
    os.makedirs(folder_name, exist_ok=True)

    # The original author expected roughly 16k neurons here.
    n_patch, num_neurons = val_patch_activations.shape[0], val_patch_activations.shape[1]
    print(f"Number of val patch neurons: {num_neurons}")
    print(f"Number of val patch images: {n_patch}")

    # 1 marks a patch sample, 0 a no-patch sample.
    patch_labels = np.concatenate([np.ones(n_patch), np.zeros(val_no_patch_activations.shape[0])])
    print(f"Number of pp labels: {len(patch_labels)}")

    # The label vector is the same for every neuron, so test its variance once.
    labels_vary = np.std(patch_labels) > 0

    def label_correlation(neuron):
        # Activations of one neuron over patch samples followed by no-patch samples.
        column = np.concatenate([val_patch_activations[:, neuron], val_no_patch_activations[:, neuron]])
        if labels_vary and np.std(column) > 0:
            return np.corrcoef(patch_labels, column)[0, 1]
        return 0  # constant vector: correlation is undefined, record 0

    correlations = np.zeros(num_neurons)
    for neuron in range(num_neurons):
        correlations[neuron] = label_correlation(neuron)

    # Any remaining NaN is mapped to 0.
    correlations = np.nan_to_num(correlations)
    print('shape of correlations', correlations.shape)

    ranking = pd.DataFrame({
        "Neuron_Index": np.arange(num_neurons),
        "Correlation": correlations
    })
    ranking = ranking.sort_values(by="Correlation", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    ranking.to_csv(csv_path, index=False)
    print(f"Neuron correlations saved at: {csv_path}")

    return csv_path

def plot_correlation_histogram(original, projected, title):
    """
    Show a histogram of per-column correlations between two 2-D arrays.

    Column ``i`` of ``original`` is correlated with column ``i`` of ``projected``;
    columns that are constant in either array, and NaN results, count as 0.

    Args:
        original: 2-D array; its column count decides how many correlations are computed.
        projected: 2-D array indexed by the same column positions.
        title: Title drawn above the histogram.
    """
    per_column = []
    for col in range(original.shape[1]):
        source_col = original[:, col]
        target_col = projected[:, col]
        if np.std(source_col) > 0 and np.std(target_col) > 0:
            per_column.append(np.corrcoef(source_col, target_col)[0, 1])
        else:
            per_column.append(0)  # constant feature: correlation undefined

    per_column = np.nan_to_num(per_column)  # map any NaN to 0

    plt.hist(per_column, bins=50, alpha=0.7)
    plt.title(title)
    plt.xlabel("Correlation")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


# Project activations into sparse space
def project_activations(autoencoder, activations, device):
    """
    Encode a NumPy activation array with ``autoencoder.encoder``.

    Args:
        autoencoder: Object exposing a callable ``encoder`` attribute.
        activations: NumPy array of activations to encode.
        device: Torch device the input is moved to before encoding.

    Returns:
        The encoder output as a NumPy array (moved back to the CPU).
    """
    print("Projecting Alexnet activations into SAE sparse space...")
    encoder_input = torch.from_numpy(activations)
    encoder_input = encoder_input.to(device)
    encoder_input = encoder_input.float()
    # Inference only: no gradients are needed for the encoding.
    with torch.no_grad():
        sparse_codes = autoencoder.encoder(encoder_input)
    sparse_codes = sparse_codes.cpu()
    return sparse_codes.numpy()

# Save all neuron differences and indexes in descending order
def save_all_neurons_to_csv(abs_diff, folder_name, filename="all_neuron_differences_50.csv"):
    """
    Save all neuron indexes sorted by their differences (descending order) to a CSV file.

    Args:
        abs_diff: 1-D sequence with one difference value per neuron; position ``i``
            is written as ``Neuron_Index`` ``i``.
        folder_name: Output directory; created if it does not exist.
        filename: Name of the CSV file written inside ``folder_name``.

    Returns:
        Path of the written CSV (columns ``Neuron_Index`` and ``Activation_Difference``).
    """
    print(f"Saving all neuron differences to CSV file: {filename}")

    # Create the output folder so the first run on a fresh directory does not fail.
    # An empty folder name means "current directory" and needs no creation.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    # One row per neuron, largest difference first.
    neuron_data = pd.DataFrame({
        "Neuron_Index": np.arange(len(abs_diff)),
        "Activation_Difference": abs_diff
    }).sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

    return csv_path



def save_top_neurons_to_csv(abs_diff, top_neurons, folder_name, filename="dlp_top_neurons_50.csv"):
    """
    Save every neuron's difference value to a CSV file and flag the selected ones.

    Args:
        abs_diff: 1-D sequence with one difference value per neuron; position ``i``
            is written as ``Neuron_Index`` ``i``.
        top_neurons: Collection of neuron indices to flag in ``Selected_for_Muting``.
        folder_name: Output directory; created if it does not exist.
        filename: Name of the CSV file written inside ``folder_name``.

    Returns:
        None. The CSV has columns ``Neuron_Index``, ``Activation_Difference`` and
        ``Selected_for_Muting``, sorted by ``Activation_Difference`` descending.
    """
    print(f"Saving top neurons to CSV file: {filename}")

    # Create the output folder so the first run on a fresh directory does not fail.
    # An empty folder name means "current directory" and needs no creation.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    neuron_data = pd.DataFrame({
        "Neuron_Index": np.arange(len(abs_diff)),
        "Activation_Difference": abs_diff
    })

    # Flag the neurons listed in `top_neurons`; the share selected is decided by the caller.
    neuron_data["Selected_for_Muting"] = neuron_data["Neuron_Index"].isin(top_neurons)

    # Largest difference first.
    neuron_data = neuron_data.sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

# Function to load top neurons from CSV based on a percentage
def load_top_neurons_from_csv(folder_name, filename, percentage):
    """
    Load top neurons based on the specified percentage from the saved CSV file.

    The CSV is read as-is and its first rows are taken, so it must already be
    sorted with the most relevant neurons first.

    Args:
        folder_name: Directory containing the CSV.
        filename: CSV file name; must contain a ``Neuron_Index`` column.
        percentage: Share of rows to select, expressed in percent (e.g. ``29`` for 29%).

    Returns:
        NumPy array with the ``Neuron_Index`` values of the selected leading rows.
    """
    csv_path = os.path.join(folder_name, filename)
    neuron_data = pd.read_csv(csv_path)

    # Multiply before dividing: `n * (percentage / 100)` loses a neuron to float
    # rounding for some inputs (100 * (29 / 100) == 28.999999999999996 -> 28).
    top_count = int(len(neuron_data) * percentage / 100)

    # The leading rows of the pre-sorted file are the top neurons.
    top_neurons = neuron_data.iloc[:top_count]["Neuron_Index"].values
    print(f"Loaded top {percentage}% neurons ({top_count} neurons) for muting.")
    return top_neurons


def classify_with_alexnet(model, activations):
    """
    Classify images using the original AlexNet classifier on the fc2 activations.

    Each activation vector is passed through ``model.classifier[5]`` and
    ``model.classifier[6]``; the predicted class is the argmax of the softmax
    over the resulting outputs.

    Args:
        model: Model exposing an indexable ``classifier`` whose entries 5 and 6 are callable.
        activations: Iterable of 1-D NumPy activation vectors (e.g. a 2-D array, iterated row by row).

    Returns:
        List of predicted class indices (Python ints), one per activation vector.
    """
    head = model.classifier[6]
    # Run on whatever device the classifier head lives on instead of relying on a
    # module-level `device` global that may not exist or may not match the model.
    first_param = next(head.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    # Inference only: skip building autograd graphs for every sample.
    with torch.no_grad():
        for activation in activations:
            activation_tensor = torch.from_numpy(activation).float().to(target_device)
            relu_output = model.classifier[5](activation_tensor)
            output = head(relu_output)
            # Softmax is kept so predictions match earlier runs exactly.
            prediction = torch.argmax(torch.nn.functional.softmax(output, dim=0)).item()
            predictions.append(prediction)
    return predictions

# Function to calculate accuracy per group
def calculate_group_accuracy(predictions, true_labels):
    """Return the accuracy of ``predictions`` against ``true_labels`` as computed by ``accuracy_score``."""
    group_accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
    return group_accuracy


def main():

    seeds = [1, 11, 111, 1111, 11111, 111111, 1111111]

    # Paths to models, activations, and autoencoders for each seed
    model_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/models/initial_classifier/alexnet_mnist_dlp_seed_{seed}.pt"
        for seed in seeds
    ]
    autoencoder_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/Autoencoders/dynamic_left_patch/dlp_autoencoder_layer_fc2_seed_{seed}.pth"
        for seed in seeds
    ]
    test_activation_two_patch_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/dynamic_left_patch/fc2_activations_test_two_dlp_100_alexnet_mnist_dlp_seed_{seed}.npy"
        for seed in seeds
    ]
    test_activation_two_no_patch_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/dynamic_left_patch/fc2_activations_test_two_org_100_alexnet_mnist_dlp_seed_{seed}.npy"
        for seed in seeds
    ]
    test_activation_zero_patch_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/dynamic_left_patch/fc2_activations_test_zero_dlp_100_alexnet_mnist_dlp_seed_{seed}.npy"
        for seed in seeds
    ]
    test_activation_zero_no_patch_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/dynamic_left_patch/fc2_activations_test_zero_org_100_alexnet_mnist_dlp_seed_{seed}.npy"
        for seed in seeds
    ]
    val_activation_patch_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/dynamic_left_patch/fc2_activations_val_zero_dlp_50_alexnet_mnist_dlp_seed_{seed}.npy"
        for seed in seeds
    ]
    val_activation_no_patch_paths = [
        f"/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/dynamic_left_patch/fc2_activations_val_zero_org_50_alexnet_mnist_dlp_seed_{seed}.npy"
        for seed in seeds
    ]


    # Output folder for results
    folder_name = "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/difference_analysis/dynamic_left_patch"
    os.makedirs(folder_name, exist_ok=True)

    # Prepare a DataFrame to store results
    results_df = pd.DataFrame(columns=[
        "Seed", "Percentage",
        "Val_Patch_Before", "Val_NoPatch_Before",
        "Val_Patch_After", "Val_NoPatch_After",
        "Test_Two_Patch_Before", "Test_Two_NoPatch_Before",
        "Test_Two_Patch_After", "Test_Two_NoPatch_After",
        "Test_Zero_Patch_Before", "Test_Zero_NoPatch_Before",
        "Test_Zero_Patch_After", "Test_Zero_NoPatch_After",
        "Worst_Acc_Before", "Worst_Acc_After",
        "Avg_Acc_Before", "Avg_Acc_After"
    ])

    # Loop through seeds/models/autoencoders
    for seed_idx, seed in enumerate(seeds):
        print(f"\nProcessing Seed {seed}")

        # Load paths
        model_path = model_paths[seed_idx]
        sae_path = autoencoder_paths[seed_idx]
        val_patch_path = val_activation_patch_paths[seed_idx]
        val_no_patch_path = val_activation_no_patch_paths[seed_idx]
        test_two_patch_path = test_activation_two_patch_paths[seed_idx]
        test_two_no_patch_path = test_activation_two_no_patch_paths[seed_idx]
        test_zero_patch_path = test_activation_zero_patch_paths[seed_idx]
        test_zero_no_patch_path = test_activation_zero_no_patch_paths[seed_idx]

        # Load the model and autoencoder
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = load_model(model_path, device)
        autoencoder = load_autoencoder(device, sae_path)

        # Load pre-saved activations
        val_activations_patch = np.load(val_patch_path, allow_pickle=True)
        val_activations_no_patch = np.load(val_no_patch_path, allow_pickle=True)

        test_two_patch = np.load(test_two_patch_path, allow_pickle=True)
        test_two_no_patch = np.load(test_two_no_patch_path, allow_pickle=True)
        test_zero_patch = np.load(test_zero_patch_path, allow_pickle=True)
        test_zero_no_patch = np.load(test_zero_no_patch_path, allow_pickle=True)

        # Classify "before muting"
        predictions_val_patch_before = classify_with_alexnet(model, val_activations_patch)
        predictions_val_no_patch_before = classify_with_alexnet(model, val_activations_no_patch)
        accuracy_val_patch_before = calculate_group_accuracy(predictions_val_patch_before, [0] * len(predictions_val_patch_before))
        accuracy_val_no_patch_before = calculate_group_accuracy(predictions_val_no_patch_before, [0] * len(predictions_val_no_patch_before))

        predictions_test_two_patch_before = classify_with_alexnet(model, test_two_patch)
        predictions_test_two_no_patch_before = classify_with_alexnet(model, test_two_no_patch)
        predictions_test_zero_patch_before = classify_with_alexnet(model, test_zero_patch)
        predictions_test_zero_no_patch_before = classify_with_alexnet(model, test_zero_no_patch)

        accuracy_test_two_patch_before = calculate_group_accuracy(predictions_test_two_patch_before, [1] * len(predictions_test_two_patch_before))
        accuracy_test_two_no_patch_before = calculate_group_accuracy(predictions_test_two_no_patch_before, [1] * len(predictions_test_two_no_patch_before))
        accuracy_test_zero_patch_before = calculate_group_accuracy(predictions_test_zero_patch_before, [0] * len(predictions_test_zero_patch_before))
        accuracy_test_zero_no_patch_before = calculate_group_accuracy(predictions_test_zero_no_patch_before, [0] * len(predictions_test_zero_no_patch_before))

        # Project activations
        projected_val_patch = project_activations(autoencoder, val_activations_patch, device)
        projected_val_no_patch = project_activations(autoencoder, val_activations_no_patch, device)
        projected_two_test_patch = project_activations(autoencoder, test_two_patch, device)
        projected_two_test_no_patch = project_activations(autoencoder, test_two_no_patch, device)
        projected_zero_test_patch = project_activations(autoencoder, test_zero_patch, device)
        projected_zero_test_no_patch = project_activations(autoencoder, test_zero_no_patch, device)

        # Loop through percentages
        for percentage in range(0, 51):  # 1% to 15%
            print(f"Testing with {percentage}% muting...")

            # Calculate differences and load top neurons
            #abs_diff = np.abs(np.mean(projected_val_patch, axis=0) - np.mean(projected_val_no_patch, axis=0))
            #csv_path = save_all_neurons_to_csv(abs_diff, folder_name, filename=f"mnist_dlp_val_neuron_differences_seed_{seed}.csv")
            #top_neurons = load_top_neurons_from_csv(folder_name, f"mnist_dlp_val_neuron_differences_seed_{seed}.csv", percentage)

            # **Correlation-Based Neurons**
            correlation_csv_path = save_neurons_by_correlation(
                projected_val_patch, projected_val_no_patch, folder_name, filename=f"neurons_by_correlation_seed_{seed}_50.csv"
            )
            top_neurons_corr = load_top_neurons_from_csv(folder_name, f"neurons_by_correlation_seed_{seed}_50.csv", percentage)

            top_neurons = top_neurons_corr
            # Muting neurons
            projected_val_patch_muted = projected_val_patch.copy()
            projected_val_no_patch_muted = projected_val_no_patch.copy()
            projected_two_test_patch_muted = projected_two_test_patch.copy()
            projected_two_test_no_patch_muted = projected_two_test_no_patch.copy()
            projected_zero_test_patch_muted = projected_zero_test_patch.copy()
            projected_zero_test_no_patch_muted = projected_zero_test_no_patch.copy()

            projected_val_patch_muted[:, top_neurons] = 0
            projected_val_no_patch_muted[:, top_neurons] = 0
            projected_two_test_patch_muted[:, top_neurons] = 0
            projected_two_test_no_patch_muted[:, top_neurons] = 0
            projected_zero_test_patch_muted[:, top_neurons] = 0
            projected_zero_test_no_patch_muted[:, top_neurons] = 0

            # Decode and classify
            decoded_val_patch = autoencoder.decoder(torch.from_numpy(projected_val_patch_muted).to(device).float()).cpu().numpy()
            decoded_val_no_patch = autoencoder.decoder(torch.from_numpy(projected_val_no_patch_muted).to(device).float()).cpu().numpy()
            decoded_two_test_patch = autoencoder.decoder(torch.from_numpy(projected_two_test_patch_muted).to(device).float()).cpu().numpy()
            decoded_two_test_no_patch = autoencoder.decoder(torch.from_numpy(projected_two_test_no_patch_muted).to(device).float()).cpu().numpy()
            decoded_zero_test_patch = autoencoder.decoder(torch.from_numpy(projected_zero_test_patch_muted).to(device).float()).cpu().numpy()
            decoded_zero_test_no_patch = autoencoder.decoder(torch.from_numpy(projected_zero_test_no_patch_muted).to(device).float()).cpu().numpy()

            predictions_val_patch_after = classify_decoded_activations(model, decoded_val_patch)
            predictions_val_no_patch_after = classify_decoded_activations(model, decoded_val_no_patch)
            predictions_test_two_patch_after = classify_decoded_activations(model, decoded_two_test_patch)
            predictions_test_two_no_patch_after = classify_decoded_activations(model, decoded_two_test_no_patch)
            predictions_test_zero_patch_after = classify_decoded_activations(model, decoded_zero_test_patch)
            predictions_test_zero_no_patch_after = classify_decoded_activations(model, decoded_zero_test_no_patch)

            accuracy_val_patch_after = calculate_group_accuracy(predictions_val_patch_after, [0] * len(predictions_val_patch_after))
            accuracy_val_no_patch_after = calculate_group_accuracy(predictions_val_no_patch_after, [0] * len(predictions_val_no_patch_after))
            accuracy_test_two_patch_after = calculate_group_accuracy(predictions_test_two_patch_after, [1] * len(predictions_test_two_patch_after))
            accuracy_test_two_no_patch_after = calculate_group_accuracy(predictions_test_two_no_patch_after, [1] * len(predictions_test_two_no_patch_after))
            accuracy_test_zero_patch_after = calculate_group_accuracy(predictions_test_zero_patch_after, [0] * len(predictions_test_zero_patch_after))
            accuracy_test_zero_no_patch_after = calculate_group_accuracy(predictions_test_zero_no_patch_after, [0] * len(predictions_test_zero_no_patch_after))

            # Calculate worst and average group accuracies
            worst_acc_before = min(accuracy_test_two_patch_before, accuracy_test_two_no_patch_before)
            worst_acc_after = min(accuracy_test_two_patch_after, accuracy_test_two_no_patch_after)
            avg_acc_before = (accuracy_test_two_patch_before + accuracy_test_two_no_patch_before +
                              accuracy_test_zero_patch_before + accuracy_test_zero_no_patch_before) / 4
            avg_acc_after = (accuracy_test_two_patch_after + accuracy_test_two_no_patch_after +
                             accuracy_test_zero_patch_after + accuracy_test_zero_no_patch_after) / 4

            # Append results to DataFrame
            new_row = pd.DataFrame([{
                "Seed": seed,
                "Percentage": percentage,
                "Val_Patch_Before": accuracy_val_patch_before,
                "Val_NoPatch_Before": accuracy_val_no_patch_before,
                "Val_Patch_After": accuracy_val_patch_after,
                "Val_NoPatch_After": accuracy_val_no_patch_after,
                "Test_Two_Patch_Before": accuracy_test_two_patch_before,
                "Test_Two_NoPatch_Before": accuracy_test_two_no_patch_before,
                "Test_Two_Patch_After": accuracy_test_two_patch_after,
                "Test_Two_NoPatch_After": accuracy_test_two_no_patch_after,
                "Test_Zero_Patch_Before": accuracy_test_zero_patch_before,
                "Test_Zero_NoPatch_Before": accuracy_test_zero_no_patch_before,
                "Test_Zero_Patch_After": accuracy_test_zero_patch_after,
                "Test_Zero_NoPatch_After": accuracy_test_zero_no_patch_after,
                "Worst_Acc_Before": worst_acc_before,
                "Worst_Acc_After": worst_acc_after,
                "Avg_Acc_Before": avg_acc_before,
                "Avg_Acc_After": avg_acc_after
            }])

            # Save results to CSV
            results_df = pd.concat([results_df, new_row], ignore_index=True)
    results_csv_path = os.path.join(folder_name, "dynamic_left_patch_50.csv")
    results_df.to_csv(results_csv_path, index=False)
    print(f"All results saved to {results_csv_path}")

# Entry-point guard: run the muting sweep defined by the main() above
# whenever this cell is executed as the top-level script.
if __name__ == "__main__":
    main()

### 25

In [None]:
from sklearn.metrics import accuracy_score
# Module-level compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function 12 to classify decoded activations
def classify_decoded_activations(model, decoded_activations):
    """Predict a class index for every decoded fc2 activation vector.

    Each vector is passed through the tail of the model's classifier
    (``model.classifier[5]`` followed by ``model.classifier[6]``); the
    prediction is the arg-max of the soft-maxed output.

    The whole set is processed as one batch, on a private copy of the input
    and without autograd bookkeeping.

    Args:
        model: Network exposing an indexable ``classifier`` whose entries 5
            and 6 are callable layers.
        decoded_activations: Sequence / 2-D array with one activation vector
            per row.

    Returns:
        list[int]: Predicted class index for every row, in input order.
        An empty input yields an empty list.
    """
    if len(decoded_activations) == 0:
        return []

    # np.stack always allocates fresh memory. This matters because
    # torch.from_numpy shares storage with its source array: if classifier[5]
    # is an in-place ReLU (as in torchvision's AlexNet) the caller's array
    # would otherwise be overwritten when everything lives on the CPU.
    batch_np = np.stack([np.asarray(row, dtype=np.float32) for row in decoded_activations])

    # Run on whatever device the model lives on instead of relying on a
    # module-level global that may be out of sync with the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    batch = torch.from_numpy(batch_np).to(target_device)

    with torch.no_grad():  # inference only: do not build an autograd graph
        hidden = model.classifier[5](batch)
        logits = model.classifier[6](hidden)
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        return torch.argmax(probabilities, dim=1).tolist()

def save_neurons_by_correlation(val_patch_activations, val_no_patch_activations, folder_name, filename="neurons_by_correlation_25.csv"):
    """Rank neurons by their correlation with patch presence and save the ranking.

    For every neuron (column) the activations of the patch and no-patch sets
    are joined and correlated with a binary indicator (1 = patch,
    0 = no patch). Neurons whose correlation cannot be computed (constant
    activations or constant labels, NaN results) get a correlation of 0.

    Args:
        val_patch_activations: 2-D array, rows = patch samples, columns = neurons.
        val_no_patch_activations: 2-D array, rows = no-patch samples, columns = neurons.
        folder_name: Output directory; created when missing.
        filename: Name of the CSV file written inside ``folder_name``.

    Returns:
        str: Path of the written CSV. Its columns are ``Neuron_Index`` and
        ``Correlation``, sorted by correlation from highest to lowest.
    """
    os.makedirs(folder_name, exist_ok=True)

    patch_rows, num_neurons = val_patch_activations.shape[0], val_patch_activations.shape[1]
    print(f"Number of val patch neurons: {num_neurons}")
    print(f"Number of val patch images: {patch_rows}")

    # Binary patch indicator: ones for the patch rows, zeros for the rest.
    pp = np.concatenate([np.ones(patch_rows), np.zeros(val_no_patch_activations.shape[0])])
    print(f"Number of pp labels: {len(pp)}")

    # Entries stay 0 for neurons where the correlation is undefined.
    correlations = np.zeros(num_neurons)
    for neuron_idx in range(num_neurons):
        neuron_acts = np.concatenate([
            val_patch_activations[:, neuron_idx],
            val_no_patch_activations[:, neuron_idx],
        ])

        # Constant vectors have no defined correlation; leave the 0 in place.
        if not (np.std(pp) > 0 and np.std(neuron_acts) > 0):
            continue

        correlations[neuron_idx] = np.corrcoef(pp, neuron_acts)[0, 1]

    # Any NaN that slipped through is treated as "no correlation".
    correlations = np.nan_to_num(correlations)
    print('shape of correlations', correlations.shape)

    ranking = pd.DataFrame({
        "Neuron_Index": np.arange(num_neurons),
        "Correlation": correlations,
    }).sort_values(by="Correlation", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    ranking.to_csv(csv_path, index=False)
    print(f"Neuron correlations saved at: {csv_path}")

    return csv_path

def plot_correlation_histogram(original, projected, title):
    """Show a histogram of column-wise correlations between two matrices.

    Column ``i`` of ``original`` is correlated with column ``i`` of
    ``projected``. Columns that are constant in either matrix, and NaN
    results, count as a correlation of 0.

    Args:
        original: 2-D array; its column count decides how many pairs are compared.
        projected: 2-D array compared column by column against ``original``.
        title: Title placed above the histogram.
    """
    column_correlations = []
    for col in range(original.shape[1]):
        # Constant features have no defined correlation.
        if np.std(original[:, col]) > 0 and np.std(projected[:, col]) > 0:
            value = np.corrcoef(original[:, col], projected[:, col])[0, 1]
        else:
            value = 0
        column_correlations.append(value)

    # Replace any remaining NaNs with 0 before plotting.
    column_correlations = np.nan_to_num(column_correlations)

    plt.hist(column_correlations, bins=50, alpha=0.7)
    plt.title(title)
    plt.xlabel("Correlation")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


# Project activations into sparse space
def project_activations(autoencoder, activations, device):
    """Encode activations with the autoencoder's encoder.

    Args:
        autoencoder: Object exposing a callable ``encoder``.
        activations: NumPy array of activations to encode.
        device: Torch device the input tensor is moved to before encoding.

    Returns:
        numpy.ndarray: Encoder output, moved back to the CPU.
    """
    print("Projecting Alexnet activations into SAE sparse space...")
    encoder_input = torch.from_numpy(activations).to(device).float()
    with torch.no_grad():  # pure inference, no gradients needed
        sparse_codes = autoencoder.encoder(encoder_input)
    return sparse_codes.cpu().numpy()

# Save all neuron differences and indexes in descending order
def save_all_neurons_to_csv(abs_diff, folder_name, filename="all_neuron_differences_25.csv"):
    """Write every neuron index with its difference value, largest first.

    Args:
        abs_diff: 1-D sequence of per-neuron difference values; the position
            in the sequence is used as the neuron index.
        folder_name: Directory the CSV is written to.
        filename: Name of the CSV file inside ``folder_name``.

    Returns:
        str: Path of the written CSV (columns ``Neuron_Index`` and
        ``Activation_Difference``).
    """
    print(f"Saving all neuron differences to CSV file: {filename}")

    # Pair each neuron index with its value, then order by value (descending).
    ranked = pd.DataFrame({
        "Neuron_Index": np.arange(len(abs_diff)),
        "Activation_Difference": abs_diff,
    }).sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    ranked.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

    return csv_path



def save_top_neurons_to_csv(abs_diff, top_neurons, folder_name, filename="dlp_top_neurons_25.csv"):
    """Write all neurons with their difference value and a muting flag.

    Args:
        abs_diff: 1-D sequence of per-neuron difference values; the position
            in the sequence is used as the neuron index.
        top_neurons: Collection of neuron indices chosen for muting.
        folder_name: Directory the CSV is written to.
        filename: Name of the CSV file inside ``folder_name``.

    The CSV has the columns ``Neuron_Index``, ``Activation_Difference`` and
    ``Selected_for_Muting`` and is sorted by difference, largest first.
    """
    print(f"Saving top neurons to CSV file: {filename}")

    table = pd.DataFrame({
        "Neuron_Index": range(len(abs_diff)),
        "Activation_Difference": abs_diff,
    })

    # Flag the neurons that are part of the selection passed in by the caller.
    flagged = table.assign(Selected_for_Muting=table["Neuron_Index"].isin(top_neurons))
    ranked = flagged.sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    ranked.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

# Function to load top neurons from CSV based on a percentage
def load_top_neurons_from_csv(folder_name, filename, percentage):
    """Return the neuron indices in the leading ``percentage`` % of a ranking CSV.

    The CSV is expected to be sorted already (most relevant neuron first):
    this function simply takes the first rows, it does not sort.

    Args:
        folder_name: Directory containing the CSV.
        filename: CSV file name; must contain a ``Neuron_Index`` column.
        percentage: Share of rows to select, in percent (e.g. ``10`` for 10 %).

    Returns:
        numpy.ndarray: ``Neuron_Index`` values of the selected rows, in file order.
    """
    csv_path = os.path.join(folder_name, filename)
    neuron_data = pd.read_csv(csv_path)

    # Multiply before dividing: `n * (percentage / 100)` suffers from float
    # rounding (100 * (29 / 100) == 28.999999999999996), which made int()
    # drop one neuron for some percentages.
    top_count = int(len(neuron_data) * percentage / 100)

    # The first rows of the file are the top-ranked neurons.
    top_neurons = neuron_data.iloc[:top_count]["Neuron_Index"].values
    print(f"Loaded top {percentage}% neurons ({top_count} neurons) for muting.")
    return top_neurons


def classify_with_alexnet(model, activations):
    """Classify pre-computed fc2 activations with the tail of the model's classifier.

    Each activation vector is passed through ``model.classifier[5]`` and
    ``model.classifier[6]``; the prediction is the arg-max of the soft-maxed
    output.

    The whole set is processed as one batch, on a private copy of the input
    and without autograd bookkeeping, so ``activations`` is never modified.

    Args:
        model: Network exposing an indexable ``classifier`` whose entries 5
            and 6 are callable layers.
        activations: Sequence / 2-D array with one fc2 activation vector per row.

    Returns:
        list[int]: Predicted class index for every row, in input order.
        An empty input yields an empty list.
    """
    if len(activations) == 0:
        return []

    # np.stack always allocates fresh memory. torch.from_numpy shares storage
    # with its source array, so if classifier[5] is an in-place ReLU (as in
    # torchvision's AlexNet) the caller's activations would otherwise be
    # rectified in place on the CPU and every later use of them corrupted.
    batch_np = np.stack([np.asarray(row, dtype=np.float32) for row in activations])

    # Follow the model's own device rather than a module-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    batch = torch.from_numpy(batch_np).to(target_device)

    with torch.no_grad():  # inference only: do not build an autograd graph
        hidden = model.classifier[5](batch)       # activation layer after fc2
        logits = model.classifier[6](hidden)      # final layer
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        return torch.argmax(probabilities, dim=1).tolist()

# Function to calculate accuracy per group
def calculate_group_accuracy(predictions, true_labels):
    """Return the accuracy of ``predictions`` measured against ``true_labels``.

    Thin wrapper around ``accuracy_score`` that takes the predictions first.
    """
    group_accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
    return group_accuracy


def main():
    """Sweep the share of muted SAE neurons and record group accuracies per seed.

    For every seed this
      1. loads the classifier, its sparse autoencoder and the pre-saved fc2
         activations of six groups (validation / test, patch / no patch),
      2. measures the accuracy of each group on the raw activations,
      3. encodes the activations and ranks the encoded neurons by their
         correlation with patch presence on the two validation groups,
      4. for each muting percentage zeroes the top-ranked neurons, decodes,
         re-classifies and stores one result row.

    All rows are finally written to ``dynamic_left_patch_25.csv`` in the
    output folder.
    """
    seeds = [1, 11, 111, 1111, 11111, 111111, 1111111]

    mnist_root = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"
    activations_dir = f"{mnist_root}/activations/dynamic_left_patch"

    # Output folder for results
    folder_name = f"{mnist_root}/activations/difference_analysis/dynamic_left_patch"
    os.makedirs(folder_name, exist_ok=True)

    # One entry per evaluated group:
    # (result-column prefix, tag inside the activation file name, true class label)
    groups = [
        ("Val_Patch", "val_zero_dlp_25", 0),
        ("Val_NoPatch", "val_zero_org_25", 0),
        ("Test_Two_Patch", "test_two_dlp_100", 1),
        ("Test_Two_NoPatch", "test_two_org_100", 1),
        ("Test_Zero_Patch", "test_zero_dlp_100", 0),
        ("Test_Zero_NoPatch", "test_zero_org_100", 0),
    ]
    # NOTE(review): the "worst" accuracy only looks at the two "two" groups —
    # confirm that leaving out the "zero" groups is intended.
    worst_groups = ("Test_Two_Patch", "Test_Two_NoPatch")
    average_groups = ("Test_Two_Patch", "Test_Two_NoPatch", "Test_Zero_Patch", "Test_Zero_NoPatch")

    result_columns = [
        "Seed", "Percentage",
        "Val_Patch_Before", "Val_NoPatch_Before",
        "Val_Patch_After", "Val_NoPatch_After",
        "Test_Two_Patch_Before", "Test_Two_NoPatch_Before",
        "Test_Two_Patch_After", "Test_Two_NoPatch_After",
        "Test_Zero_Patch_Before", "Test_Zero_NoPatch_Before",
        "Test_Zero_Patch_After", "Test_Zero_NoPatch_After",
        "Worst_Acc_Before", "Worst_Acc_After",
        "Avg_Acc_Before", "Avg_Acc_After"
    ]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; growing a DataFrame with pd.concat inside the loop is quadratic.
    result_rows = []

    for seed in seeds:
        print(f"\nProcessing Seed {seed}")

        # Load the model and autoencoder
        model = load_model(
            f"{mnist_root}/models/initial_classifier/alexnet_mnist_dlp_seed_{seed}.pt", device
        )
        autoencoder = load_autoencoder(
            device,
            f"{mnist_root}/activations/Autoencoders/dynamic_left_patch/dlp_autoencoder_layer_fc2_seed_{seed}.pth",
        )

        # Load pre-saved activations
        # NOTE(review): allow_pickle=True can execute code from the file —
        # fine for self-generated .npy files, unsafe for untrusted ones.
        activations = {
            name: np.load(
                f"{activations_dir}/fc2_activations_{tag}_alexnet_mnist_dlp_seed_{seed}.npy",
                allow_pickle=True,
            )
            for name, tag, _ in groups
        }

        # Classify "before muting"
        acc_before = {}
        for name, _, label in groups:
            # A copy is handed over so that a classifier applying an in-place
            # activation cannot alter the arrays that are encoded below.
            predictions = classify_with_alexnet(model, activations[name].copy())
            acc_before[name] = calculate_group_accuracy(predictions, [label] * len(predictions))

        # Project activations
        projected = {
            name: project_activations(autoencoder, activations[name], device)
            for name, _, _ in groups
        }

        # **Correlation-Based Neurons**
        # The ranking only depends on the validation projections, so it is
        # computed and written once per seed instead of once per percentage.
        correlation_filename = f"neurons_by_correlation_seed_{seed}_25.csv"
        save_neurons_by_correlation(
            projected["Val_Patch"], projected["Val_NoPatch"], folder_name, filename=correlation_filename
        )

        # Loop through percentages
        for percentage in range(0, 51):  # 0% to 50% inclusive
            print(f"Testing with {percentage}% muting...")

            top_neurons = load_top_neurons_from_csv(folder_name, correlation_filename, percentage)

            acc_after = {}
            for name, _, label in groups:
                # Muting neurons
                muted = projected[name].copy()
                muted[:, top_neurons] = 0

                # Decode and classify. no_grad keeps the decoder output free
                # of autograd history, which .numpy() does not accept.
                with torch.no_grad():
                    decoded = autoencoder.decoder(
                        torch.from_numpy(muted).to(device).float()
                    ).cpu().numpy()

                predictions = classify_decoded_activations(model, decoded)
                acc_after[name] = calculate_group_accuracy(predictions, [label] * len(predictions))

            # Collect the result row
            row = {"Seed": seed, "Percentage": percentage}
            for name, _, _ in groups:
                row[f"{name}_Before"] = acc_before[name]
                row[f"{name}_After"] = acc_after[name]

            # Calculate worst and average group accuracies
            row["Worst_Acc_Before"] = min(acc_before[name] for name in worst_groups)
            row["Worst_Acc_After"] = min(acc_after[name] for name in worst_groups)
            row["Avg_Acc_Before"] = sum(acc_before[name] for name in average_groups) / 4
            row["Avg_Acc_After"] = sum(acc_after[name] for name in average_groups) / 4

            result_rows.append(row)

    # Save results to CSV
    results_df = pd.DataFrame(result_rows, columns=result_columns)
    results_csv_path = os.path.join(folder_name, "dynamic_left_patch_25.csv")
    results_df.to_csv(results_csv_path, index=False)
    print(f"All results saved to {results_csv_path}")

# Entry-point guard: run the muting sweep defined by the main() above
# whenever this cell is executed as the top-level script.
if __name__ == "__main__":
    main()

In [None]:
# NOTE(review): `stophere` is not defined in the visible part of the notebook,
# so this cell presumably raises NameError on purpose to halt "Run All" before
# the archived cells below — confirm, and consider removing it before sharing.
stophere

## Archive

In [None]:
'''
from sklearn.metrics import accuracy_score
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function 12 to classify decoded activations
def classify_decoded_activations(model, decoded_activations):
    """Classify decoded activations using the softmax layer of the model."""
    predictions = []
    for activation in decoded_activations:
        # Convert numpy activation to a tensor
        activation_tensor = torch.from_numpy(activation).float().to(device)

        # Pass through classifier[5] (ReLU)
        relu_output = model.classifier[5](activation_tensor)

        # Pass through classifier[6] (final linear layer)
        output = model.classifier[6](relu_output)

        # Apply softmax and get the predicted class
        prediction = torch.argmax(torch.nn.functional.softmax(output, dim=0)).item()
        predictions.append(prediction)

    return predictions


# Project activations into sparse space
def project_activations(autoencoder, activations, device):
    print("Projecting Alexnet activations into SAE sparse space...")
    with torch.no_grad():
        projected = autoencoder.encoder(torch.from_numpy(activations).to(device).float())
    return projected.cpu().numpy()

# Save all neuron differences and indexes in descending order
def save_all_neurons_to_csv(abs_diff, folder_name, filename="all_neuron_differences.csv"):
    """
    Save all neuron indexes sorted by their differences (descending order) to a CSV file.
    """
    print(f"Saving all neuron differences to CSV file: {filename}")

    # Create a DataFrame with neuron indices and their absolute differences
    neuron_data = pd.DataFrame({
        "Neuron_Index": np.arange(len(abs_diff)),
        "Activation_Difference": abs_diff
    })

    # Sort by absolute difference in descending order
    neuron_data.sort_values(by="Activation_Difference", ascending=False, inplace=True)

    # Save the DataFrame to a CSV file
    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")


def save_top_neurons_to_csv(abs_diff, top_neurons, folder_name, filename="dlp_top_neurons.csv"):
    """
    Save the top neurons with their difference values to a CSV file.
    """
    print(f"Saving top neurons to CSV file: {filename}")

    # Create a DataFrame with neuron indices and their absolute differences
    neuron_data = pd.DataFrame({
        "Neuron_Index": range(len(abs_diff)),
        "Activation_Difference": abs_diff
    })

    # Mark whether each neuron is in the top 10%
    neuron_data["Selected_for_Muting"] = neuron_data["Neuron_Index"].isin(top_neurons)

    # Sort by absolute difference in descending order
    neuron_data.sort_values(by="Activation_Difference", ascending=False, inplace=True)

    # Save the DataFrame to a CSV file
    csv_path = os.path.join(folder_name, filename)
    neuron_data.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

# Function to load top neurons from CSV
def load_top_neurons_from_csv(folder_name, filename):
    """
    Load top neurons from the saved CSV file.
    """
    csv_path = os.path.join(folder_name, filename)
    neuron_data = pd.read_csv(csv_path)
    top_neurons = neuron_data["Neuron_Index"].values
    return top_neurons


def classify_with_alexnet(model, activations):
    """
    Classify images using the original AlexNet classifier on the fc2 activations.
    """
    predictions = []
    for activation in activations:
        # Convert numpy activation to tensor
        activation_tensor = torch.from_numpy(activation).float().to(device)
        relu_output = model.classifier[5](activation_tensor)  # Apply ReLU
        output = model.classifier[6](relu_output)  # Apply fc3
        prediction = torch.argmax(torch.nn.functional.softmax(output, dim=0)).item()
        predictions.append(prediction)
    return predictions

# Function to calculate accuracy per group
def calculate_group_accuracy(predictions, true_labels):
    return accuracy_score(true_labels, predictions)



def main():
    """
    Run the SAE neuron-muting experiment for every seed of the MNIST models.

    For each seed the function
      1. loads the fine-tuned AlexNet and the matching sparse autoencoder (SAE),
      2. ranks the SAE units by the absolute difference between the mean sparse
         code of the patched and the un-patched validation activations and
         stores the ranking as ``val_neuron_differences_seed_<seed>.csv``,
      3. zeroes the top 10% of that ranking, decodes back to fc2 space and
         re-classifies the validation set and the four test groups,
      4. prints per-group, worst-group and average-group accuracies.

    Validation and "two" test activations are scored against label 1, "zero"
    test activations against label 0.

    Relies on helpers defined elsewhere in the notebook: ``load_model``,
    ``load_autoencoder``, ``project_activations``, ``classify_with_alexnet``,
    ``classify_decoded_activations``, ``save_all_neurons_to_csv``,
    ``load_top_neurons_from_csv`` and ``calculate_group_accuracy``.
    """
    seeds = [1, 11, 111]
    mute_fraction = 0.1  # share of the ranked SAE units that is zeroed

    base_dir = "/content/drive/MyDrive/Masterthesis/Datasets/mnist"
    activation_dir = f"{base_dir}/activations"

    # Output folder for results
    folder_name = f"{activation_dir}/difference_analysis"
    os.makedirs(folder_name, exist_ok=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def activation_path(split, group, seed):
        """Build the path of the saved fc2 activations for one split/group/seed."""
        return f"{activation_dir}/fc2_activations_{split}_{group}_alexnet_mnist_finetune_dlp_seed_{seed}.npy"

    def accuracy_before_muting(model, activations, label):
        """Accuracy of the unmodified fc2 activations against a constant label."""
        predictions = classify_with_alexnet(model, activations)
        return calculate_group_accuracy(predictions, [label] * len(predictions))

    def accuracy_after_muting(model, autoencoder, sparse_codes, neurons_to_mute, label):
        """Zero the selected SAE units (in place), decode, classify and score."""
        sparse_codes[:, neurons_to_mute] = 0
        # no_grad keeps the decoder output free of autograd state so that
        # .numpy() is always legal (the validation branch used to omit detach).
        with torch.no_grad():
            decoded = autoencoder.decoder(torch.from_numpy(sparse_codes).to(device).float()).cpu().numpy()
        predictions = classify_decoded_activations(model, decoded)
        return calculate_group_accuracy(predictions, [label] * len(predictions))

    # Iterate over the real seed values so that logs and CSV names carry 1/11/111
    # (enumerate(..., start=1) used to label them 1/2/3).
    for seed in seeds:
        model_path = f"{base_dir}/models/initial_classifier/alexnet_mnist_finetune_dlp_seed_{seed}.pt"
        sae_path = f"{activation_dir}/Autoencoders/wb_autoencoder_layer_17_seed_{seed}_unnormalized.pth"
        val_patch_path = activation_path("val", "two_dlp", seed)
        val_no_patch_path = activation_path("val", "two_org", seed)
        # group name -> (activation file of THIS seed, true label of the group)
        test_groups = {
            "two_patch": (activation_path("test", "two_dlp", seed), 1),
            "two_no_patch": (activation_path("test", "two_org", seed), 1),
            "zero_patch": (activation_path("test", "zero_dlp", seed), 0),
            "zero_no_patch": (activation_path("test", "zero_org", seed), 0),
        }

        print(f"\nProcessing Seed {seed}")
        print(f"Model Path: {model_path}")
        print(f"Autoencoder Path: {sae_path}")
        print(f"Activation Patch Path: {val_patch_path}")
        print(f"Activation No Patch Path: {val_no_patch_path}")
        for group, (path, _) in test_groups.items():
            print(f"Test Activation Path ({group}): {path}")

        # Load the model and autoencoder
        model = load_model(model_path, device)
        # NOTE(review): argument order kept as in the original call; confirm it
        # matches the signature of load_autoencoder.
        autoencoder = load_autoencoder(device, sae_path)

        # ------------------------------ Validation ------------------------------
        val_activations_patch = np.load(val_patch_path, allow_pickle=True)
        val_activations_no_patch = np.load(val_no_patch_path, allow_pickle=True)

        accuracy_val_patch_before = accuracy_before_muting(model, val_activations_patch, 1)
        accuracy_val_no_patch_before = accuracy_before_muting(model, val_activations_no_patch, 1)

        # Project validation activations into sparse space
        projected_val_patch = project_activations(autoencoder, val_activations_patch, device)
        projected_val_no_patch = project_activations(autoencoder, val_activations_no_patch, device)

        # Rank SAE units by |mean(patch) - mean(no patch)| and persist the ranking
        abs_diff = np.abs(np.mean(projected_val_patch, axis=0) - np.mean(projected_val_no_patch, axis=0))
        csv_name = f"val_neuron_differences_seed_{seed}.csv"
        save_all_neurons_to_csv(abs_diff, folder_name, filename=csv_name)

        # The CSV is stored in descending order, so the head is the top share
        top_neurons = load_top_neurons_from_csv(folder_name, csv_name)
        neurons_to_mute = top_neurons[:int(len(top_neurons) * mute_fraction)]

        accuracy_val_patch_after = accuracy_after_muting(model, autoencoder, projected_val_patch, neurons_to_mute, 1)
        accuracy_val_no_patch_after = accuracy_after_muting(model, autoencoder, projected_val_no_patch, neurons_to_mute, 1)

        # Print Validation Results
        print(f"Validation Accuracy (Patch, Before Muting): {accuracy_val_patch_before:.4f}")
        print(f"Validation Accuracy (No Patch, Before Muting): {accuracy_val_no_patch_before:.4f}")
        print(f"Validation Accuracy (Patch, After Muting): {accuracy_val_patch_after:.4f}")
        print(f"Validation Accuracy (No Patch, After Muting): {accuracy_val_no_patch_after:.4f}")

        # --------------------------------- Test ---------------------------------
        # NOTE(review): the original author marked the muting of test codes with
        # "??" - confirm that re-using the validation-selected units is intended.
        before, after = {}, {}
        for group, (path, label) in test_groups.items():
            # Load the single file of this seed (np.load was given whole lists before)
            activations = np.load(path, allow_pickle=True)
            before[group] = accuracy_before_muting(model, activations, label)
            sparse_codes = project_activations(autoencoder, activations, device)
            after[group] = accuracy_after_muting(model, autoencoder, sparse_codes, neurons_to_mute, label)

        # NOTE(review): as in the original, the worst group is taken over the two
        # "two" groups only; confirm whether the "zero" groups should be included.
        worst_group_accuracy_before = min(before["two_patch"], before["two_no_patch"])
        worst_group_accuracy_after = min(after["two_patch"], after["two_no_patch"])

        # Mean over all four groups (the tuple-by-int division raised TypeError)
        avg_group_accuracy_before = sum(before.values()) / len(before)
        avg_group_accuracy_after = sum(after.values()) / len(after)

        # Print Test Results
        print(f"Test Accuracy (Patch, Before Muting): {before['two_patch']:.4f}")
        print(f"Test Accuracy (No Patch, Before Muting): {before['two_no_patch']:.4f}")
        print(f"Test Accuracy (Patch, After Muting): {after['two_patch']:.4f}")
        print(f"Test Accuracy (No Patch, After Muting): {after['two_no_patch']:.4f}")
        print(f"Test Accuracy (Zero, Patch, Before Muting): {before['zero_patch']:.4f}")
        print(f"Test Accuracy (Zero, No Patch, Before Muting): {before['zero_no_patch']:.4f}")
        print(f"Test Accuracy (Zero, Patch, After Muting): {after['zero_patch']:.4f}")
        print(f"Test Accuracy (Zero, No Patch, After Muting): {after['zero_no_patch']:.4f}")
        print(f"Worst Group Accuracy (Before Muting): {worst_group_accuracy_before:.4f}")
        print(f"Worst Group Accuracy (After Muting): {worst_group_accuracy_after:.4f}")
        print(f"Average Group Accuracy (Before Muting): {avg_group_accuracy_before:.4f}")
        print(f"Average Group Accuracy (After Muting): {avg_group_accuracy_after:.4f}")


# Script-style entry point: run the experiment when this code is executed as the main module.
if __name__ == "__main__":
    main()
'''

In [None]:

'''
def identify_patch_specific_neurons(avg_activations_patch, avg_activations_no_patch, patch_threshold=0.2, no_patch_threshold=0.05):
    """
    Return indices of units that are active on patched inputs but quiet otherwise.

    A unit is selected when its value in ``avg_activations_patch`` is strictly
    above ``patch_threshold`` and its value in ``avg_activations_no_patch`` is
    strictly below ``no_patch_threshold``. Raw values are compared; nothing is
    normalised. Summary statistics are printed for debugging.

    Returns:
        First-axis indices of the units that satisfy both conditions.
    """
    print("Identifying neurons selectively activated by patches...")

    # Boolean masks for the two criteria
    fires_on_patch = avg_activations_patch > patch_threshold
    quiet_without_patch = avg_activations_no_patch < no_patch_threshold

    # Units that satisfy both criteria at once
    selected = np.nonzero(np.logical_and(fires_on_patch, quiet_without_patch))[0]

    # Debugging statistics
    patch_mean = avg_activations_patch.mean()
    no_patch_mean = avg_activations_no_patch.mean()
    print(f"Average activation for patch: {patch_mean:.4f}, No patch: {no_patch_mean:.4f}")
    print(f"Number of neurons with high activation for patch: {fires_on_patch.sum()}")
    print(f"Number of neurons with low activation for no patch: {quiet_without_patch.sum()}")
    print(f"Found {len(selected)} patch-specific neurons.")

    return selected

# Compute device for this cell: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function 12 to classify decoded activations
def classify_decoded_activations(model, decoded_activations):
    """
    Classify decoded activations with the final linear layer of the model.

    Every row of ``decoded_activations`` is fed to ``model.classifier[6]``; the
    index of the largest logit is the predicted class. Softmax is omitted
    because it is monotonic and does not change the arg-max.

    NOTE(review): unlike ``classify_with_alexnet`` no ReLU (``classifier[5]``)
    is applied before fc3 here - confirm that this difference is intended.

    Args:
        model: Network exposing an indexable ``classifier`` whose entry 6 owns
            parameters.
        decoded_activations: Sequence of 1-D activation vectors of equal
            length, such as an ``(N, D)`` NumPy array.

    Returns:
        list[int]: One predicted class index per row; ``[]`` for empty input.
    """
    if len(decoded_activations) == 0:
        return []

    fc3 = model.classifier[6]
    # Run on the device that holds the fc3 weights instead of a module-level global.
    target_device = next(fc3.parameters()).device

    batch = np.stack([np.asarray(row, dtype=np.float32) for row in decoded_activations])

    # Inference only: a single batched forward pass without autograd bookkeeping.
    with torch.no_grad():
        logits = fc3(torch.from_numpy(batch).to(target_device))
    return logits.argmax(dim=1).tolist()


# Project activations into sparse space
def project_activations(autoencoder, activations, device):
    """
    Encode activations with ``autoencoder.encoder`` and return the result.

    Args:
        autoencoder: Object exposing a callable ``encoder``.
        activations: NumPy array of activations; moved to ``device`` and cast
            to float32 before encoding.
        device: Torch device the encoder is evaluated on.

    Returns:
        NumPy array with the encoder output, computed without gradients.
    """
    print("Projecting Alexnet activations into SAE sparse space...")
    encoder_input = torch.from_numpy(activations)
    encoder_input = encoder_input.to(device).float()
    with torch.no_grad():
        sparse_codes = autoencoder.encoder(encoder_input)
    return sparse_codes.cpu().numpy()



def save_top_neurons_to_csv(abs_diff, top_neurons, folder_name, filename="wb_top_neurons.csv"):
    """
    Write a per-neuron difference table to ``folder_name/filename`` as CSV.

    Args:
        abs_diff: Sequence of per-neuron difference values; position ``i`` is
            reported as ``Neuron_Index`` ``i``.
        top_neurons: Collection of neuron indices; membership is recorded in
            the boolean ``Selected_for_Muting`` column.
        folder_name: Directory the CSV is written into (it is not created here).
        filename: Name of the CSV file inside ``folder_name``.

    Returns:
        None. Rows are written in descending ``Activation_Difference`` order.
    """
    print(f"Saving top neurons to CSV file: {filename}")

    # One row per neuron: its position in abs_diff and the difference value
    table = pd.DataFrame({"Neuron_Index": range(len(abs_diff)), "Activation_Difference": abs_diff})

    # Flag the neurons that were chosen for muting
    table = table.assign(Selected_for_Muting=table["Neuron_Index"].isin(top_neurons))

    # Largest differences first
    ranked = table.sort_values(by="Activation_Difference", ascending=False)

    destination = os.path.join(folder_name, filename)
    ranked.to_csv(destination, index=False)
    print(f"CSV saved at: {destination}")

def classify_with_alexnet(model, activations):
    """
    Classify fc2 activations with the tail of the AlexNet classifier.

    Every row of ``activations`` is passed through ``model.classifier[5]``
    (ReLU) and ``model.classifier[6]`` (fc3); the index of the largest logit is
    the predicted class. Softmax is omitted because it is monotonic and does
    not change the arg-max.

    Args:
        model: Network exposing an indexable ``classifier`` whose entry 6 owns
            parameters (e.g. a torchvision AlexNet).
        activations: Sequence of 1-D activation vectors of equal length, such
            as an ``(N, D)`` NumPy array.

    Returns:
        list[int]: One predicted class index per row; ``[]`` for empty input.
    """
    if len(activations) == 0:
        return []

    relu, fc3 = model.classifier[5], model.classifier[6]
    # Run on the device that holds the fc3 weights instead of a module-level global.
    target_device = next(fc3.parameters()).device

    # np.stack always allocates a fresh array, so an in-place ReLU (as used by
    # torchvision's AlexNet) cannot write through to the caller's data on CPU.
    batch = np.stack([np.asarray(row, dtype=np.float32) for row in activations])

    # Inference only: a single batched forward pass without autograd bookkeeping.
    with torch.no_grad():
        logits = fc3(relu(torch.from_numpy(batch).to(target_device)))
    return logits.argmax(dim=1).tolist()



def main():
    """
    Run the SAE neuron-muting experiment on the Waterbird activations.

    One AlexNet and one pair of pre-saved fc2 activation files (patch / no
    patch) are loaded once. For each of the three sparse autoencoders (SAE) the
    function projects both activation sets into sparse space, selects the 10%
    of SAE units with the largest absolute mean difference between the two
    sets, and reports the accuracy (against label 1) of the decoded activations
    with and without zeroing those units. The ranking is written to a per-seed
    CSV file.

    Relies on helpers defined elsewhere in the notebook: ``load_model``,
    ``load_autoencoder``, ``project_activations``,
    ``classify_decoded_activations`` and ``save_top_neurons_to_csv``.
    """
    # Paths and initialization
    model_path = "/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/models/initial_classifier/alexnet_wbw_lbl_lbw_11train.pt"
    autoencoder_paths = {
        "1": '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/activations/Autoencoders/wb_autoencoder_layer_17_seed_1_unnormalized.pth',
        "11": '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/activations/Autoencoders/wb_autoencoder_layer_17_seed_11_unnormalized.pth',
        "111": '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/activations/Autoencoders/wb_autoencoder_layer_17_seed_111_unnormalized.pth'
    }

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the AlexNet model
    model = load_model(model_path, device)

    # Define paths to pre-saved activations
    activation_patch_path = "/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/activations/test_patch/wb_fc2_activations_patch.npy"
    activation_no_patch_path = "/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/activations/test_no_patch/wb_fc2_activations_no_patch.npy"

    # Load pre-saved activations
    print("Loading pre-saved AlexNet activations for fc2_activations_patch...")
    activations_patch = np.load(activation_patch_path, allow_pickle=True)
    print("Loading pre-saved AlexNet activations for fc2_activations_no_patch...")
    activations_no_patch = np.load(activation_no_patch_path, allow_pickle=True)

    def decode(autoencoder, sparse_codes):
        """Map sparse codes back to fc2 space; returns a NumPy array."""
        with torch.no_grad():
            return autoencoder.decoder(torch.from_numpy(sparse_codes).to(device).float()).cpu().numpy()

    def accuracy_of(decoded):
        """Accuracy of the decoded activations against the constant label 1."""
        predictions = classify_decoded_activations(model, decoded)
        return accuracy_score([1] * len(predictions), predictions)

    # Loop through each autoencoder
    for seed, autoencoder_path in autoencoder_paths.items():
        print(f"\nProcessing with Sparse Autoencoder (Seed {seed})...")

        # Load the autoencoder
        # NOTE(review): argument order kept as in the original call; confirm it
        # matches the signature of load_autoencoder.
        autoencoder = load_autoencoder(autoencoder_path, device)

        # Project activations into sparse space
        projected_patch = project_activations(autoencoder, activations_patch, device)
        projected_no_patch = project_activations(autoencoder, activations_no_patch, device)

        # Calculate the absolute differences between patch and no patch
        avg_activations_patch = np.mean(projected_patch, axis=0)
        avg_activations_no_patch = np.mean(projected_no_patch, axis=0)
        abs_diff = np.abs(avg_activations_patch - avg_activations_no_patch)

        # Identify the top 10% neurons with the highest differences.
        # Slice from an explicit start index: ``[-0:]`` would select every
        # unit whenever the 10% count rounds down to zero.
        top_neuron_count = int(len(abs_diff) * 0.1)
        top_neurons = np.argsort(abs_diff)[len(abs_diff) - top_neuron_count:]

        # Classify 'wb_with_patch' without muting
        print("Classifying 'wb_with_patch' without muting neurons...")
        accuracy_patch_without_muting = accuracy_of(decode(autoencoder, projected_patch))

        # Mute the top neurons for 'wb_with_patch' and classify
        projected_patch[:, top_neurons] = 0
        accuracy_patch_with_muting = accuracy_of(decode(autoencoder, projected_patch))

        # Classify 'wb_no_patch' without muting
        print("Classifying 'wb_no_patch' without muting neurons...")
        accuracy_no_patch_without_muting = accuracy_of(decode(autoencoder, projected_no_patch))

        # Mute the top neurons for 'wb_no_patch' and classify
        projected_no_patch[:, top_neurons] = 0
        accuracy_no_patch_with_muting = accuracy_of(decode(autoencoder, projected_no_patch))

        # Save top neurons to CSV (os.makedirs avoids depending on pathlib.Path,
        # which the notebook's import cell does not bring into scope)
        csv_folder = f"/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/activations/difference_analysis_seed_{seed}"
        os.makedirs(csv_folder, exist_ok=True)
        save_top_neurons_to_csv(abs_diff, top_neurons, csv_folder, filename=f"wb_top_neurons_seed_{seed}.csv")

        # Print the results for this autoencoder
        print(f"\nClassification Accuracy Results (Seed {seed}):")
        print(f"1. Accuracy (waterbird land without muting): {accuracy_patch_without_muting:.4f}")
        print(f"2. Accuracy (waterbird land with muting): {accuracy_patch_with_muting:.4f}")
        print(f"3. Accuracy (waterbird water without muting): {accuracy_no_patch_without_muting:.4f}")
        print(f"4. Accuracy (waterbird water with muting): {accuracy_no_patch_with_muting:.4f}")


# Script-style entry point: run the experiment when this code is executed as the main module.
if __name__ == "__main__":
    main()
'''

In [None]:
from sklearn.metrics import accuracy_score
# Compute device for this cell: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function 12 to classify decoded activations
def classify_decoded_activations(model, decoded_activations):
    """
    Classify decoded fc2 activations with the tail of the AlexNet classifier.

    Every row of ``decoded_activations`` is passed through
    ``model.classifier[5]`` (ReLU) and ``model.classifier[6]`` (final linear
    layer); the index of the largest logit is the predicted class. Softmax is
    omitted because it is monotonic and does not change the arg-max.

    Args:
        model: Network exposing an indexable ``classifier`` whose entry 6 owns
            parameters (e.g. a torchvision AlexNet).
        decoded_activations: Sequence of 1-D activation vectors of equal
            length, such as an ``(N, D)`` NumPy array.

    Returns:
        list[int]: One predicted class index per row; ``[]`` for empty input.
    """
    if len(decoded_activations) == 0:
        return []

    relu, fc3 = model.classifier[5], model.classifier[6]
    # Run on the device that holds the fc3 weights instead of a module-level global.
    target_device = next(fc3.parameters()).device

    # np.stack always allocates a fresh array, so an in-place ReLU (as used by
    # torchvision's AlexNet) cannot write through to the caller's data on CPU.
    batch = np.stack([np.asarray(row, dtype=np.float32) for row in decoded_activations])

    # Inference only: a single batched forward pass without autograd bookkeeping.
    with torch.no_grad():
        logits = fc3(relu(torch.from_numpy(batch).to(target_device)))
    return logits.argmax(dim=1).tolist()


# Project activations into sparse space
def project_activations(autoencoder, activations, device):
    """
    Encode activations with ``autoencoder.encoder`` and return the result.

    Args:
        autoencoder: Object exposing a callable ``encoder``.
        activations: NumPy array of activations; moved to ``device`` and cast
            to float32 before encoding.
        device: Torch device the encoder is evaluated on.

    Returns:
        NumPy array with the encoder output, computed without gradients.
    """
    print("Projecting Alexnet activations into SAE sparse space...")
    encoder_input = torch.from_numpy(activations)
    encoder_input = encoder_input.to(device).float()
    with torch.no_grad():
        sparse_codes = autoencoder.encoder(encoder_input)
    return sparse_codes.cpu().numpy()

# Save all neuron differences and indexes in descending order
def save_all_neurons_to_csv(abs_diff, folder_name, filename="all_neuron_differences.csv"):
    """
    Write every neuron index with its difference value to a CSV file.

    Args:
        abs_diff: Sequence of per-neuron difference values; position ``i`` is
            reported as ``Neuron_Index`` ``i``.
        folder_name: Directory the CSV is written into (it is not created here).
        filename: Name of the CSV file inside ``folder_name``.

    Returns:
        str: Path of the written CSV. Rows are stored in descending
        ``Activation_Difference`` order.
    """
    print(f"Saving all neuron differences to CSV file: {filename}")

    # Build the table and rank it, largest difference first
    ranked = pd.DataFrame(
        {
            "Neuron_Index": np.arange(len(abs_diff)),
            "Activation_Difference": abs_diff,
        }
    ).sort_values(by="Activation_Difference", ascending=False)

    csv_path = os.path.join(folder_name, filename)
    ranked.to_csv(csv_path, index=False)
    print(f"CSV saved at: {csv_path}")

    return csv_path



def save_top_neurons_to_csv(abs_diff, top_neurons, folder_name, filename="dlp_top_neurons.csv"):
    """
    Write a per-neuron difference table to ``folder_name/filename`` as CSV.

    Args:
        abs_diff: Sequence of per-neuron difference values; position ``i`` is
            reported as ``Neuron_Index`` ``i``.
        top_neurons: Collection of neuron indices; membership is recorded in
            the boolean ``Selected_for_Muting`` column.
        folder_name: Directory the CSV is written into (it is not created here).
        filename: Name of the CSV file inside ``folder_name``.

    Returns:
        None. Rows are written in descending ``Activation_Difference`` order.
    """
    print(f"Saving top neurons to CSV file: {filename}")

    # One row per neuron: its position in abs_diff and the difference value
    table = pd.DataFrame({"Neuron_Index": range(len(abs_diff)), "Activation_Difference": abs_diff})

    # Flag the neurons that were chosen for muting
    table = table.assign(Selected_for_Muting=table["Neuron_Index"].isin(top_neurons))

    # Largest differences first
    ranked = table.sort_values(by="Activation_Difference", ascending=False)

    destination = os.path.join(folder_name, filename)
    ranked.to_csv(destination, index=False)
    print(f"CSV saved at: {destination}")

# Function to load top neurons from CSV based on a percentage
def load_top_neurons_from_csv(folder_name, filename, percentage):
    """
    Load the leading ``percentage`` percent of neuron indices from a CSV file.

    The file is read from ``folder_name/filename`` and the first
    ``floor(rows * percentage / 100)`` values of its ``Neuron_Index`` column
    are returned in file order. No sorting happens here, so the CSV must
    already be ranked (the save helpers in this notebook write it in
    descending ``Activation_Difference`` order).

    Args:
        folder_name: Directory containing the CSV.
        filename: Name of the CSV file.
        percentage: Share of rows to return, expressed in percent (0-100).

    Returns:
        NumPy array with the selected neuron indices.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 100]``; a negative value
            would otherwise silently slice from the end of the table.
    """
    if not 0 <= percentage <= 100:
        raise ValueError(f"percentage must be between 0 and 100, got {percentage}")

    csv_path = os.path.join(folder_name, filename)
    neuron_data = pd.read_csv(csv_path)

    # Multiply before dividing: ``percentage / 100`` is usually not exactly
    # representable, so e.g. 29% of 100 rows used to truncate to 28.
    top_count = int(len(neuron_data) * percentage / 100)

    # Take the leading rows of the (pre-ranked) table
    top_neurons = neuron_data.iloc[:top_count]["Neuron_Index"].values
    print(f"Loaded top {percentage}% neurons ({top_count} neurons) for muting.")
    return top_neurons


def classify_with_alexnet(model, activations):
    """
    Classify fc2 activations with the tail of the AlexNet classifier.

    Every row of ``activations`` is passed through ``model.classifier[5]``
    (ReLU) and ``model.classifier[6]`` (fc3); the index of the largest logit is
    the predicted class. Softmax is omitted because it is monotonic and does
    not change the arg-max.

    Args:
        model: Network exposing an indexable ``classifier`` whose entry 6 owns
            parameters (e.g. a torchvision AlexNet).
        activations: Sequence of 1-D activation vectors of equal length, such
            as an ``(N, D)`` NumPy array.

    Returns:
        list[int]: One predicted class index per row; ``[]`` for empty input.
    """
    if len(activations) == 0:
        return []

    relu, fc3 = model.classifier[5], model.classifier[6]
    # Run on the device that holds the fc3 weights instead of a module-level global.
    target_device = next(fc3.parameters()).device

    # np.stack always allocates a fresh array, so an in-place ReLU (as used by
    # torchvision's AlexNet) cannot write through to the caller's data on CPU.
    batch = np.stack([np.asarray(row, dtype=np.float32) for row in activations])

    # Inference only: a single batched forward pass without autograd bookkeeping.
    with torch.no_grad():
        logits = fc3(relu(torch.from_numpy(batch).to(target_device)))
    return logits.argmax(dim=1).tolist()

# Function to calculate accuracy per group
def calculate_group_accuracy(predictions, true_labels):
    """Return ``accuracy_score`` of ``predictions`` measured against ``true_labels``."""
    group_accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
    return group_accuracy



def _constant_label_accuracy(predictions, label):
    """Accuracy of `predictions` for a group whose samples all carry `label`."""
    return calculate_group_accuracy(predictions, [label] * len(predictions))


def _classify_after_muting(model, autoencoder, projected, neurons_to_mute, device):
    """Zero the selected sparse units, decode the result and classify it."""
    # Work on a copy so the caller's projected activations stay intact.
    muted = projected.copy()
    muted[:, neurons_to_mute] = 0
    # no_grad keeps the decoder output free of autograd history; without it
    # (or an explicit detach) .numpy() raises for tensors that require grad.
    with torch.no_grad():
        decoded = autoencoder.decoder(torch.from_numpy(muted).to(device).float()).cpu().numpy()
    return classify_decoded_activations(model, decoded)


def main():
    """
    Run the neuron-muting experiment for every seed.

    Per seed:
      1. Load the classifier, the autoencoder and the saved fc2 activations.
      2. Project the validation activations with the autoencoder, save the
         absolute difference of the patch / no-patch means to a CSV and pick
         the top ``mute_percentage`` percent of units from that CSV.
      3. Report validation and test accuracy before and after zeroing those
         units, plus worst-group and average-group test accuracy.

    Returns:
        List with one dict of metrics per seed.
    """
    seeds = [1, 11, 111]
    # Share of sparse units (in percent) that is muted in both phases.
    mute_percentage = 3

    # Path templates for models, autoencoders and activations; `{seed}` is
    # filled in per seed.
    model_path_template = "/content/drive/MyDrive/Masterthesis/Datasets/mnist/models/initial_classifier/alexnet_mnist_dlp_seed_{seed}.pt"
    autoencoder_path_template = "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/Autoencoders/mnist_dlp/dlp_autoencoder_layer_fc2_seed_{seed}_unnormalized.pth"
    val_patch_template = "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/alexnet_mnist_finetune_dlp/fc2_activations_val_zero_dlp_alexnet_mnist_finetune_dlp_seed_{seed}.npy"
    val_no_patch_template = "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/alexnet_mnist_finetune_dlp/fc2_activations_val_zero_org_alexnet_mnist_finetune_dlp_seed_{seed}.npy"

    # Test groups: display name -> (print label of the path, path template,
    # class label shared by every sample of the group).
    test_groups = {
        "Two, Patch": (
            "Test Two Patch Path",
            "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/alexnet_mnist_finetune_dlp/fc2_activations_test_two_dlp_alexnet_mnist_finetune_dlp_seed_{seed}.npy",
            1,
        ),
        "Two, No Patch": (
            "Test Two No Patch Path",
            "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/alexnet_mnist_finetune_dlp/fc2_activations_test_two_org_alexnet_mnist_finetune_dlp_seed_{seed}.npy",
            1,
        ),
        "Zero, Patch": (
            "Test Zero Patch Path",
            "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/alexnet_mnist_finetune_dlp/fc2_activations_test_zero_dlp_alexnet_mnist_finetune_dlp_seed_{seed}.npy",
            0,
        ),
        "Zero, No Patch": (
            "Test Zero No Patch Path",
            "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/alexnet_mnist_finetune_dlp/fc2_activations_test_zero_org_alexnet_mnist_finetune_dlp_seed_{seed}.npy",
            0,
        ),
    }

    # Output folder for results
    folder_name = "/content/drive/MyDrive/Masterthesis/Datasets/mnist/activations/difference_analysis/alexnet_mnist_finetune_dlp"
    os.makedirs(folder_name, exist_ok=True)

    # The device does not depend on the seed, so pick it once.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    all_results = []

    # Loop through seeds/models/autoencoders
    for seed in seeds:

        print(f"\nProcessing Seed {seed}")
        model_path = model_path_template.format(seed=seed)
        sae_path = autoencoder_path_template.format(seed=seed)
        val_patch_path = val_patch_template.format(seed=seed)
        val_no_patch_path = val_no_patch_template.format(seed=seed)
        test_paths = {
            group: template.format(seed=seed)
            for group, (_, template, _) in test_groups.items()
        }

        print(f"Model Path: {model_path}")
        print(f"Autoencoder Path: {sae_path}")
        print(f"Validation Patch Path: {val_patch_path}")
        print(f"Validation No Patch Path: {val_no_patch_path}")
        for group, (path_label, _, _) in test_groups.items():
            print(f"{path_label}: {test_paths[group]}")

        # Load the model and autoencoder
        model = load_model(model_path, device)
        autoencoder = load_autoencoder(device, sae_path)

        # Load pre-saved activations.
        # NOTE(review): allow_pickle=True executes pickled payloads on load;
        # only safe because these files are produced by this project.
        val_activations_patch = np.load(val_patch_path, allow_pickle=True)
        val_activations_no_patch = np.load(val_no_patch_path, allow_pickle=True)

        # Validation accuracy before muting (every validation sample is labelled 0)
        accuracy_val_patch_before = _constant_label_accuracy(
            classify_with_alexnet(model, val_activations_patch), 0
        )
        accuracy_val_no_patch_before = _constant_label_accuracy(
            classify_with_alexnet(model, val_activations_no_patch), 0
        )

        # Project validation activations into sparse space
        projected_val_patch = project_activations(autoencoder, val_activations_patch, device)
        projected_val_no_patch = project_activations(autoencoder, val_activations_no_patch, device)

        # Calculate differences and save neuron indexes
        avg_val_patch = np.mean(projected_val_patch, axis=0)
        avg_val_no_patch = np.mean(projected_val_no_patch, axis=0)
        abs_diff = np.abs(avg_val_patch - avg_val_no_patch)
        csv_filename = f"val_neuron_differences_seed_{seed}.csv"
        save_all_neurons_to_csv(abs_diff, folder_name, filename=csv_filename)

        # Select the units to mute once, from the CSV written just above, and
        # reuse the same selection for the validation and the test phase.
        neurons_to_mute = load_top_neurons_from_csv(
            folder_name, filename=csv_filename, percentage=mute_percentage
        )

        accuracy_val_patch_after = _constant_label_accuracy(
            _classify_after_muting(model, autoencoder, projected_val_patch, neurons_to_mute, device), 0
        )
        accuracy_val_no_patch_after = _constant_label_accuracy(
            _classify_after_muting(model, autoencoder, projected_val_no_patch, neurons_to_mute, device), 0
        )

        # Print Validation Results
        print(f"Validation Accuracy (Patch, Before Muting): {accuracy_val_patch_before:.4f}")
        print(f"Validation Accuracy (No Patch, Before Muting): {accuracy_val_no_patch_before:.4f}")
        print(f"Validation Accuracy (Patch, After Muting): {accuracy_val_patch_after:.4f}")
        print(f"Validation Accuracy (No Patch, After Muting): {accuracy_val_no_patch_after:.4f}")

        # ------------------------------------------------------------------
        # Test Phase
        # ------------------------------------------------------------------
        test_accuracy_before = {}
        test_accuracy_after = {}
        for group, (_, _, label) in test_groups.items():
            test_activations = np.load(test_paths[group], allow_pickle=True)

            # Test classification before muting
            test_accuracy_before[group] = _constant_label_accuracy(
                classify_with_alexnet(model, test_activations), label
            )

            # Project into sparse space, mute, decode and classify
            projected_test = project_activations(autoencoder, test_activations, device)
            test_accuracy_after[group] = _constant_label_accuracy(
                _classify_after_muting(model, autoencoder, projected_test, neurons_to_mute, device), label
            )

        # Worst group = lowest accuracy among the four groups, whichever it is.
        worst_group_accuracy_before = min(test_accuracy_before.values())
        worst_group_accuracy_after = min(test_accuracy_after.values())

        # Average group accuracy = unweighted mean over the four groups.
        avg_group_accuracy_before = sum(test_accuracy_before.values()) / len(test_accuracy_before)
        avg_group_accuracy_after = sum(test_accuracy_after.values()) / len(test_accuracy_after)

        # Print Test Results
        for group in test_groups:
            print(f"Test Accuracy ({group}, Before Muting): {test_accuracy_before[group]:.4f}")
            print(f"Test Accuracy ({group}, After Muting): {test_accuracy_after[group]:.4f}")
        print(f"Worst Group Accuracy (Before Muting): {worst_group_accuracy_before:.4f}")
        print(f"Worst Group Accuracy (After Muting): {worst_group_accuracy_after:.4f}")
        print(f"Average Group Accuracy (Before Muting): {avg_group_accuracy_before:.4f}")
        print(f"Average Group Accuracy (After Muting): {avg_group_accuracy_after:.4f}")

        all_results.append({
            "seed": seed,
            "mute_percentage": mute_percentage,
            "val_accuracy_before": {
                "Patch": accuracy_val_patch_before,
                "No Patch": accuracy_val_no_patch_before,
            },
            "val_accuracy_after": {
                "Patch": accuracy_val_patch_after,
                "No Patch": accuracy_val_no_patch_after,
            },
            "test_accuracy_before": test_accuracy_before,
            "test_accuracy_after": test_accuracy_after,
            "worst_group_accuracy_before": worst_group_accuracy_before,
            "worst_group_accuracy_after": worst_group_accuracy_after,
            "avg_group_accuracy_before": avg_group_accuracy_before,
            "avg_group_accuracy_after": avg_group_accuracy_after,
        })

    return all_results


# Entry point: run the full muting experiment when this code is executed as
# the main module (which includes running the cell in a notebook kernel).
if __name__ == "__main__":
    main()
