In [19]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Function
import math

class adder2_0(Function):
    """
    Autograd function implementing the adder similarity

        out[o, l] = -sum_k |W_col[o, k] - X_col[k, l]|

    together with a hand-written backward pass (see ``backward``).
    """

    @staticmethod
    def forward(ctx, W_col, X_col):
        """
        Negative L1 distance between every filter row and every input column.

        Args:
            W_col: 2-D tensor of flattened filters, (out_channels, K).
            X_col: 2-D tensor of unfolded input patches, (K, n_columns).

        Returns:
            Tensor of shape (out_channels, n_columns).
        """
        ctx.save_for_backward(W_col, X_col)

        # Broadcast to (out_channels, K, n_columns), then reduce over K.
        pairwise = W_col.unsqueeze(2) - X_col.unsqueeze(0)
        return -pairwise.abs().sum(1)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Surrogate gradients for the adder operation.

        * Filters receive the full-precision difference ``X - W`` (not its
          sign), rescaled to an L2 norm of ``sqrt(out_channels * K) / 5``.
        * Inputs receive the negated difference clipped to [-1, 1].

        Returns:
            (grad_W_col, grad_X_col) with the shapes of the forward inputs.
        """
        W_col, X_col = ctx.saved_tensors

        upstream = grad_output.unsqueeze(1)                # (out_channels, 1, n_columns)
        delta = X_col.unsqueeze(0) - W_col.unsqueeze(2)    # X - W, (out_channels, K, n_columns)

        grad_W_col = (delta * upstream).sum(2)
        # The clamp guards against division by zero when the gradient vanishes.
        norm = grad_W_col.norm(p=2).clamp(min=1e-12)
        target_norm = math.sqrt(W_col.size(1) * W_col.size(0))
        grad_W_col = grad_W_col / norm * target_norm / 5

        grad_X_col = (-delta.clamp(-1, 1) * upstream).sum(0)

        return grad_W_col, grad_X_col

class adder2d2_0(nn.Module):
    """
    2-D adder layer: a convolution-shaped layer whose response at each
    location is ``-sum |filter - patch|`` (computed by ``adder2_0``) instead
    of a dot product.

    Args:
        input_channel: number of input channels.
        output_channel: number of filters / output channels.
        kernel_size: side length of the square kernel (int).
        stride: stride of the sliding window.
        padding: zero padding applied on every side.
        bias: when truthy, a learnable per-output-channel offset ``b`` is added.

    Attributes:
        adder: filter tensor of shape (output_channel, input_channel, k, k).
        b: per-channel offset; only created when ``bias`` is truthy.
    """

    def __init__(self, input_channel, output_channel, kernel_size,
                 stride=1, padding=0, bias=False):
        super(adder2d2_0, self).__init__()
        self.stride = stride
        self.padding = padding
        self.input_channel = input_channel
        self.output_channel = output_channel
        self.kernel_size = kernel_size

        # Weight parameter: (out_channels, in_channels, k, k).
        # The randn + normal_ sequence is kept as-is so that seeded
        # initialisation consumes the RNG exactly as before.
        self.adder = torch.nn.Parameter(
            nn.init.normal_(
                torch.randn(output_channel, input_channel, kernel_size, kernel_size)
            )
        )

        # `bias` stays a plain flag; the learnable offset lives in `self.b`.
        self.bias = bias
        if bias:
            self.b = torch.nn.Parameter(nn.init.uniform_(torch.zeros(output_channel)))

    def forward(self, x):
        """
        Apply the adder filters to ``x``.

        Args:
            x: tensor of shape (batch, in_channels, H, W). It does not need to
               be contiguous.

        Returns:
            Tensor of shape (batch, out_channels, h_out, w_out).
        """
        n_x, _, h_x, w_x = x.size()
        n_filters = self.output_channel

        # Output spatial size (same formula as a convolution with dilation 1).
        h_out = (h_x - self.kernel_size + 2 * self.padding) // self.stride + 1
        w_out = (w_x - self.kernel_size + 2 * self.padding) // self.stride + 1

        # Fold the batch into the channel axis so a single unfold call extracts
        # the patches of every sample. `reshape` (rather than `view`) is needed
        # because batch and channel strides of a transposed / channels_last
        # input cannot be merged without a copy.
        X_col = torch.nn.functional.unfold(
            x.reshape(1, -1, h_x, w_x),
            self.kernel_size,
            dilation=1,
            padding=self.padding,
            stride=self.stride
        ).reshape(n_x, -1, h_out * w_out)

        # (batch, C*k*k, L) -> (C*k*k, L*batch)
        X_col = X_col.permute(1, 2, 0).contiguous().view(X_col.size(1), -1)

        # (out_channels, C*k*k)
        W_col = self.adder.view(n_filters, -1)

        # Core adder operation; no quantisation happens inside the layer.
        out = adder2_0.apply(W_col, X_col)

        # (out_channels, L*batch) -> (batch, out_channels, h_out, w_out)
        out = out.view(n_filters, h_out, w_out, n_x)
        out = out.permute(3, 0, 1, 2).contiguous()

        if self.bias:
            out = out + self.b.view(1, -1, 1, 1)

        return out

In [20]:
import torch.nn as nn
#from Adder2_0 import adder2d2_0

class ResidualBlock2_0(nn.Module):
    """
    Two-stage residual block built from adder layers:

        adder1 -> bn1 -> ReLU -> adder2 -> bn2 -> (+ shortcut) -> ReLU

    Plain ``nn.BatchNorm2d`` modules are used. ``downsample`` is an optional
    ``(adder, batchnorm)`` pair applied to the block input to build the
    shortcut; when it is ``None`` the input itself is the shortcut. The pair
    is stored as a tuple, so its modules are not registered by this block.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3,
                 stride=1, padding=1, downsample=None):
        super().__init__()

        # Only the first stage may change the spatial resolution.
        self.adder1 = adder2d2_0(in_channels, out_channels, kernel_size,
                                 stride=stride, padding=padding, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

        self.adder2 = adder2d2_0(out_channels, out_channels, kernel_size,
                                 stride=1, padding=padding, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.downsample = downsample

    def forward(self, x):
        """Run the block on ``x`` and return the activated sum of both branches."""
        # Main branch.
        main = self.relu(self.bn1(self.adder1(x)))
        main = self.bn2(self.adder2(main))

        # Shortcut branch: identity unless a projection pair was supplied.
        if self.downsample is None:
            shortcut = x
        else:
            project, project_bn = self.downsample
            shortcut = project_bn(project(x))

        main += shortcut
        return self.relu(main)

In [21]:
import torch.nn as nn
#from block2_0 import ResidualBlock2_0
#from Adder2_0 import adder2d2_0

class Layer2_0(nn.Module):
    """
    A stage of ``num_blocks`` residual adder blocks applied one after another.

    When ``in_channels != out_channels`` the first block runs with stride 2
    and receives a 1x1, stride-2 adder + BatchNorm pair for its shortcut.
    That pair is registered on this module as ``downsample_adder`` and
    ``downsample_bn``. Otherwise the first block uses stride 1 and an identity
    shortcut. All remaining blocks keep the channel count and use stride 1.
    """

    def __init__(self, in_channels, out_channels, num_blocks=3):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        width_changes = in_channels != out_channels

        shortcut = None
        if width_changes:
            # Registered here (before `blocks`) so the parameter names are
            # `downsample_adder.*` / `downsample_bn.*` on this module.
            self.downsample_adder = adder2d2_0(in_channels, out_channels, kernel_size=1,
                                               stride=2, padding=0, bias=False)
            self.downsample_bn = nn.BatchNorm2d(out_channels)
            shortcut = (self.downsample_adder, self.downsample_bn)
        first_stride = 2 if width_changes else 1

        # Entry block: the only one that may downsample.
        entry_block = ResidualBlock2_0(in_channels=in_channels,
                                       out_channels=out_channels,
                                       stride=first_stride,
                                       downsample=shortcut,
                                       )
        # Follow-up blocks: same width in and out, default stride.
        follow_ups = [
            ResidualBlock2_0(in_channels=out_channels,
                             out_channels=out_channels,
                             padding=1,
                             )
            for _ in range(num_blocks - 1)
        ]
        self.blocks = nn.ModuleList([entry_block, *follow_ups])

    def forward(self, x):
        """Feed ``x`` through every block in order and return the final output."""
        features = x
        for stage in self.blocks:
            features = stage(features)
        return features

In [22]:
import torch.nn as nn
import torch
from torch.nn import functional as F
#from layer2_0 import Layer2_0
class AdderNet2_0(nn.Module):
    """
    AdderNet 2.0 classifier with Fusion Bias Removal (FBR).

    Topology: conv1 -> bn1 -> ReLU -> layer1 (16) -> layer2 (32) -> layer3 (64)
    -> 8x8 average pool -> 1x1 conv "fc" -> bn2 -> flatten.

    The network itself contains no quantisation logic; it expects weights
    that were preprocessed offline (see ``quantization_objective`` in this
    notebook):

    - Adder weights -> rounded and clamped integers W_clip. The preprocessing
      visible in this notebook clamps to [0, 2^bits - 1].
    - BatchNorm running_mean -> mu' = round(mu / delta) + sum|W_q - W_clip|
    - BatchNorm bias -> beta' = beta / delta
    - Final FC weights -> W_fc' = W_fc * delta

    During inference:
    - Adder layers output -sum|X - W_clip|
    - BatchNorm computes Y = gamma * (X - mu') / sqrt(sigma^2 + eps) + beta'
    - The clipping error of the weights is absorbed by the adjusted running_mean.

    Args:
        num_classes: number of output logits.
        load_weights: optional ``{name: tensor}`` mapping passed to
            ``load_manual_weights`` at construction time.
    """

    def __init__(self, num_classes=10, load_weights=None):
        super(AdderNet2_0, self).__init__()

        # Stem: ordinary convolution (not an adder layer).
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU()

        # Adder stages.
        self.layer1 = Layer2_0(16, 16, num_blocks=3)
        self.layer2 = Layer2_0(16, 32, num_blocks=3)
        self.layer3 = Layer2_0(32, 64, num_blocks=3)

        # Classifier head, expressed as a 1x1 convolution.
        self.fc = nn.Conv2d(64, num_classes, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(num_classes)

        if load_weights is not None:
            self.load_manual_weights(load_weights)

        # Filled by forward(..., save_activations=True).
        self.activations = {}

    def load_manual_weights(self, weights_dict):
        """
        Copy tensors from ``weights_dict`` into same-named parameters and buffers.

        Expected preprocessing:
        1. Adder weights: quantized integers W_clip
        2. BN running_mean: adjusted with weight bias (mu' = round(mu/delta) + bias_sum)
        3. BN bias: quantized by delta (beta' = beta/delta)
        4. BN weight: quantized by delta (gamma' = gamma/delta) for bn1 only
        5. FC weights: scaled by delta (W_fc' = W_fc * delta)

        Loading is best-effort. Parameters/buffers without an entry in
        ``weights_dict`` keep their current value, and entries whose shape
        does not match are skipped. Skipped entries and keys that match
        nothing in the model are reported in a single ``UserWarning``, so a
        broken key mapping does not silently evaluate an untrained model.

        Args:
            weights_dict: mapping from parameter/buffer name to tensor.
        """
        import warnings

        matched_names = set()
        shape_mismatches = []

        with torch.no_grad():
            # Parameters first, then buffers (e.g. BatchNorm running stats).
            targets = list(self.named_parameters()) + list(self.named_buffers())
            for name, target in targets:
                if name not in weights_dict:
                    continue
                matched_names.add(name)
                value = weights_dict[name]
                if value.shape != target.shape:
                    shape_mismatches.append(
                        f"{name} (expected {tuple(target.shape)}, got {tuple(value.shape)})"
                    )
                    continue
                target.copy_(value.to(target.device))

        unexpected = [key for key in weights_dict if key not in matched_names]
        if shape_mismatches or unexpected:
            problems = []
            if shape_mismatches:
                problems.append("skipped shape mismatches: " + ", ".join(shape_mismatches))
            if unexpected:
                problems.append("keys not present in the model: " + ", ".join(map(str, unexpected)))
            warnings.warn("load_manual_weights: " + "; ".join(problems), stacklevel=2)

    def forward(self, x, save_activations=False):
        """
        Compute class logits.

        Args:
            x: image batch of shape (batch, 3, H, W). The fixed 8x8 pooling
               window reduces the feature map to 1x1 only when layer3 outputs
               an 8x8 map (e.g. 32x32 inputs).
            save_activations: when true, clones of the input and of each
               stage output are stored in ``self.activations``.

        Returns:
            Tensor of logits reshaped to (batch, -1).
        """
        def record(key, tensor):
            if save_activations:
                self.activations[key] = tensor.clone()

        record('input_activation_2.0', x)

        # Stem. With preprocessed weights, bn1.weight and bn1.bias are already
        # divided by delta.
        out = self.relu(self.bn1(self.conv1(x)))
        record('prelayer_activation_2.0', out)

        # Adder stages.
        out = self.layer1(out)
        record('layer1_activation_2.0', out)

        out = self.layer2(out)
        record('layer2_activation_2.0', out)

        out = self.layer3(out)
        record('layer3_activation_2.0', out)

        # Average pooling with a fixed 8x8 window.
        out = F.avg_pool2d(out, 8)

        # Head (fc weights are scaled by delta in preprocessing).
        out = self.fc(out)
        out = self.bn2(out)
        out = out.view(out.size(0), -1)

        return out

    def classification(self, x):
        """Return class probabilities (softmax over the logits of ``x``)."""
        out = self(x)
        return F.softmax(out, dim=1)

In [23]:
import pandas as pd
import numpy as np

class EvolutionAlgorithmBase:
    """
    Shared state for evolutionary optimisers.

    Stores the objective and the search hyper-parameters, and creates the
    (initially empty) history lists that subclasses append to. ``run`` is a
    no-op placeholder for subclasses to override.
    """

    def __init__(self, func, n_dim, size_pop, max_iter, prob_mut):
        # Objective and search configuration.
        self.func, self.n_dim = func, n_dim
        self.size_pop, self.max_iter = size_pop, max_iter
        self.prob_mut = prob_mut

        # Per-generation history; each attribute gets its own list object.
        self.generation_best_X = list()
        self.generation_best_Y = list()
        self.all_history_Y = list()
        self.all_history_FitV = list()

    def run(self):
        """Placeholder; does nothing and returns ``None``."""
        return None


class DE(EvolutionAlgorithmBase):
    """
    Differential Evolution (DE) Algorithm.

    This class implements Differential Evolution for activation cutoff
    optimization. It uses a loop-based approach to ensure distinct candidate
    selection for mutation. The objective is MAXIMIZED (higher is better,
    e.g. accuracy in percent).

    Parameters:
    -----------
    func: callable
        The objective function to maximize. Called with one candidate vector
        of length ``n_dim`` (or with a ``(1, n_dim)`` array when it exposes a
        truthy ``batch_mode`` attribute).
    F: float
        The mutation factor (differential weight).
    lb: scalar or array
        Lower bounds for the activation values (broadcast to ``n_dim``).
    ub: scalar or array
        Upper bounds for the activation values (broadcast to ``n_dim``).
    size_pop: int
        The size of the population (at least 4: the target plus three
        distinct donors).
    n_dim: int
        The dimension of the search space (Layers*blocks*2 + 2) = 20.
    max_iter: int
        The maximum number of iterations/generations.
    prob_mut: float
        The crossover rate (CR). Required; there is no default.
    """
    def __init__(self, func, F, lb, ub,
                 size_pop, n_dim, max_iter, prob_mut):
        # Note: 'prob_mut' is the DE crossover rate (CR).
        super().__init__(func, n_dim, size_pop, max_iter, prob_mut)

        # Fail before the (potentially expensive) initial evaluation: every
        # mutation needs three donors different from the target vector.
        if self.size_pop < 4:
            raise ValueError("size_pop must be at least 4 for DE mutation")

        self.F = F
        self.lb = np.array(lb) * np.ones(self.n_dim)
        self.ub = np.array(ub) * np.ones(self.n_dim)

        # Initialize population
        self.crtbp()
        # Evaluate initial population
        self.Y = np.array([self.func(x) for x in self.X])

    def crtbp(self):
        """
        Create the initial population and store it in ``self.X``.

        When ``(size_pop, n_dim)`` equals the shape of the built-in warm-start
        matrix (50 x 20), that matrix is used unchanged. For any other shape
        the population is drawn uniformly between ``lb`` and ``ub`` and
        rounded to two decimals, so ``self.X`` always has ``size_pop`` rows
        and ``n_dim`` columns.

        Returns:
            The population array, shape (size_pop, n_dim).
        """
        # Warm-start population (50 candidates x 20 dimensions).
        warm_start = np.array([
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.67, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.59, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.61, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.64, 2.4, 2.61, 2.45, 2.62, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.65, 2.4, 2.61, 2.45, 2.69, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.63, 2.4, 2.61, 2.45, 2.59, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.66, 2.4, 2.61, 2.45, 2.65, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.67, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.65, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.6, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.66, 2.4, 2.61, 2.45, 2.65, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.7, 2.4, 2.61, 2.45, 2.53, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.73, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.69, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.66, 2.4, 2.61, 2.45, 2.68, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.63, 2.4, 2.61, 2.45, 2.64, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.63, 2.4, 2.61, 2.45, 2.59, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.63, 2.4, 2.61, 2.45, 2.65, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.63, 2.4, 2.61, 2.45, 2.64, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.53, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.67, 2.4, 2.61, 2.45, 2.56, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.62, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.62, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.62, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.65, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.59, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.63, 2.4, 2.61, 2.45, 2.64, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.62, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.6, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.6, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.69, 2.4, 2.61, 2.45, 2.53, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.6, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.56, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.67, 2.4, 2.61, 2.45, 2.67, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.64, 2.4, 2.61, 2.45, 2.69, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.67, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.65, 2.4, 2.61, 2.45, 2.67, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.67, 2.4, 2.61, 2.45, 2.53, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.63, 2.4, 2.61, 2.45, 2.67, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.66, 2.4, 2.61, 2.45, 2.67, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.66, 2.4, 2.61, 2.45, 2.65, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.65, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.65, 2.4, 2.61, 2.45, 2.54, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.69, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.73, 2.68, 2.4, 2.61, 2.45, 2.67, 2.48, 3.09, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.65, 2.4, 2.61, 2.45, 2.69, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.68, 2.4, 2.61, 2.45, 2.62, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.66, 2.4, 2.61, 2.45, 2.58, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.6, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
            [2.4, 2.74, 2.64, 2.4, 2.61, 2.45, 2.61, 2.48, 3.1, 2.5, 2.13, 2.36, 2., 2.52, 2.18, 2., 2.29, 2.21, 2., 2.38],
        ])

        if warm_start.shape == (self.size_pop, self.n_dim):
            self.X = warm_start
        else:
            # The warm start does not fit the requested search; sample a fresh
            # population inside the bounds instead.
            sampled = np.random.uniform(self.lb, self.ub, (self.size_pop, self.n_dim))
            self.X = np.round(sampled, 2)
        return self.X

    def mutation_op(self, x, F):
        """
        Mutation operation: x[0] + F * (x[1] - x[2])
        x is a list/array of 3 vectors [a, b, c]
        """
        return x[0] + F * (x[1] - x[2])

    def check_bounds(self, mutated):
        """Clip ``mutated`` element-wise into [lb, ub]."""
        return np.clip(mutated, self.lb, self.ub)

    def crossover_op(self, mutated, target, cr):
        """
        Binomial crossover between the mutant and the target vector.

        Each dimension takes the mutant value with probability ``cr``; one
        randomly chosen dimension always takes it, so the trial differs from
        the target in at least one position before rounding. The result is
        rounded to two decimals.
        """
        # One uniform draw per dimension.
        p = np.random.rand(self.n_dim)

        # Index that is forced to come from the mutant.
        j_rand = np.random.randint(0, self.n_dim)

        take_mutant = (p <= cr) | (np.arange(self.n_dim) == j_rand)
        trial_vector = np.where(take_mutant, mutated, target)

        # Round to hundredths place (2 decimal places).
        return np.round(trial_vector, 2)

    def run(self, max_iter=None):
        """
        Run the optimisation loop.

        Args:
            max_iter: optional override for the number of generations. Only
                ``None`` means "keep the configured value", so 0 is honoured.

        Returns:
            ``(best_x, best_y)``: the best vector seen and its objective value.
        """
        if max_iter is not None:
            self.max_iter = max_iter
        print("start run")

        # Best of the initial population.
        best_idx = np.argmax(self.Y)
        self.best_x = self.X[best_idx].copy()
        self.best_y = self.Y[best_idx]

        for i in range(self.max_iter):
            # The population is updated in place, so later targets in the same
            # generation may already use donors accepted earlier in it.
            for j in range(self.size_pop):
                # Three distinct donors, none of them the current target.
                candidates = [idx for idx in range(self.size_pop) if idx != j]
                a_idx, b_idx, c_idx = np.random.choice(candidates, 3, replace=False)

                a = self.X[a_idx]
                b = self.X[b_idx]
                c = self.X[c_idx]

                mutated = self.mutation_op([a, b, c], self.F)
                mutated = self.check_bounds(mutated)
                trial = self.crossover_op(mutated, self.X[j], self.prob_mut)
                print(f"trial: {trial}")

                # Evaluate the trial vector; objectives flagged with
                # `batch_mode` receive a (1, n_dim) array and return a sequence.
                if hasattr(self.func, 'batch_mode') and self.func.batch_mode:
                    obj_trial = self.func(trial.reshape(1, -1))[0]
                else:
                    obj_trial = self.func(trial)

                print(f"Trial {j} accuracy: {obj_trial:.2f}%")

                obj_target = self.Y[j]

                # Greedy selection (maximisation): keep the trial only if it
                # is strictly better than the target.
                if obj_trial > obj_target:
                    self.X[j] = trial
                    self.Y[j] = obj_trial

            # Record the best individual of this generation.
            generation_best_index = np.argmax(self.Y)
            current_best_y = self.Y[generation_best_index]

            self.generation_best_X.append(self.X[generation_best_index, :].copy())
            self.generation_best_Y.append(current_best_y)
            self.all_history_Y.append(self.Y.copy())

            print(f"Generation {i+1}: Best Accuracy = {current_best_y:.2f}%")

            # Update global best.
            if current_best_y > self.best_y:
                self.best_y = current_best_y
                self.best_x = self.X[generation_best_index].copy()

            # Elitism: copy the top 3 solutions with accuracy > 10% over the
            # worst solutions before the next generation starts.
            if i < self.max_iter - 1:  # nothing to prepare after the last generation
                good_indices = np.where(self.Y > 10.0)[0]

                if len(good_indices) > 0:
                    # Sort by accuracy (descending) and take the top 3.
                    sorted_good_indices = good_indices[np.argsort(self.Y[good_indices])[::-1]]
                    elite_count = min(3, len(sorted_good_indices))
                    elite_indices = sorted_good_indices[:elite_count]

                    # Copy first so overlapping elite/worst indices are safe.
                    elite_X = self.X[elite_indices].copy()
                    elite_Y = self.Y[elite_indices].copy()

                    worst_indices = np.argsort(self.Y)[:elite_count]
                    self.X[worst_indices] = elite_X
                    self.Y[worst_indices] = elite_Y

        return self.best_x, self.best_y

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import os
import sys
import numpy as np

def load_data(batch_size=100):
    """
    Build a DataLoader over the CIFAR-10 test split.

    Images are converted to tensors and normalised per channel. The dataset
    is stored under ``./data`` and downloaded if missing. Batches are served
    in dataset order (no shuffling) from the main process.

    Args:
        batch_size: number of images per batch.

    Returns:
        ``torch.utils.data.DataLoader`` yielding ``(images, labels)`` batches.
    """
    print('Preparing data..')

    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])

    testset = torchvision.datasets.CIFAR10(
        root='./data',
        train=False,
        download=True,
        transform=transform_test,
    )
    return torch.utils.data.DataLoader(
        testset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

def get_adder_layer_keys(state_dict):
    """
    Return, in iteration order, the keys of ``state_dict`` that start with
    ``'layer'`` and end with ``'adder'``.
    """
    return [
        name
        for name in state_dict.keys()
        if name.startswith('layer') and name.endswith('adder')
    ]

def quantization_objective(x, fixed_state_dict, adder_keys, bits, device, testloader):
    """
    Objective function for DE: test accuracy (in percent) of the network
    after quantising a copy of ``fixed_state_dict``.

    Each entry of ``x`` is paired positionally with an adder key and yields
    that layer's step size ``delta = x[i] / (2**bits - 1)``.

    Preprocessing applied to the copy (``fixed_state_dict`` is not modified):
      * ``bn1.weight`` / ``bn1.bias`` are divided by the delta of ``x[0]``.
      * Every adder weight is divided by its delta, rounded and clamped to
        ``[0, 2**bits - 1]``. The per-output-channel sum of ``|rounded - clamped|``
        is kept as that layer's FBR correction.
      * For keys starting with ``'layer'`` that follow in the dict:
        ``running_mean`` becomes ``round(mean / delta) + correction`` and BN
        ``bias`` becomes ``bias / delta``, using the most recently seen adder.
        This relies on the dict listing BN entries after their adder weight.
      * ``fc.weight`` is multiplied by the delta of ``x[-1]``.

    Args:
        x: sequence of max-activation scalars, one per entry of ``adder_keys``.
        fixed_state_dict: ``{name: tensor}`` with names matching ``AdderNet2_0``.
        adder_keys: adder weight keys, ordered like ``x``.
        bits: bit width of the unsigned quantisation grid.
        device: device used for evaluation.
        testloader: iterable of ``(inputs, targets)`` batches.

    Returns:
        Accuracy in percent as a float.

    Raises:
        ValueError: if ``x`` and ``adder_keys`` differ in length.
    """
    if len(x) != len(adder_keys):
        # zip() below would silently drop the surplus entries.
        raise ValueError(
            f"expected one max-activation value per adder layer: "
            f"got {len(x)} values for {len(adder_keys)} layers"
        )

    quantized_state_dict = {k: v.clone() for k, v in fixed_state_dict.items()}

    Max_A = 2**(bits) - 1   # largest representable level
    Max_B = 0               # smallest representable level (unsigned grid)

    def get_delta(max_val):
        """Step size that maps ``max_val`` onto the top quantisation level."""
        return max_val / Max_A

    delta_first = get_delta(x[0])
    delta_last = get_delta(x[-1])

    # Quantize the stem BatchNorm.
    quantized_state_dict['bn1.weight'] = quantized_state_dict['bn1.weight'] / delta_first
    quantized_state_dict['bn1.bias'] = quantized_state_dict['bn1.bias'] / delta_first

    # One step size per adder layer.
    layer_deltas = {key: get_delta(val) for key, val in zip(adder_keys, x)}

    # Running state: delta / correction of the most recently seen adder layer.
    current_delta = delta_first
    current_bias_sum = 0

    # Only values of existing keys are replaced, so iterating the keys is safe.
    for name in quantized_state_dict.keys():
        if name in layer_deltas:
            # Quantize the adder weights.
            w_tensor = quantized_state_dict[name]
            current_delta = layer_deltas[name]
            wq = torch.round(w_tensor / current_delta)
            wq_clamp = torch.clamp(wq, max=Max_A, min=Max_B)
            quantized_state_dict[name] = wq_clamp

            # FBR: per-output-channel magnitude lost by clamping.
            bias_tensor = (wq - wq_clamp).abs()
            current_bias_sum = torch.sum(bias_tensor, dim=(1, 2, 3))

        elif name.startswith('layer'):
            # BatchNorm entries of the adder stages; assumed to come AFTER
            # their corresponding adder weight in the dict.
            if name.endswith('running_mean'):
                m_tensor = quantized_state_dict[name]
                mq = torch.round(m_tensor / current_delta)
                quantized_state_dict[name] = mq + current_bias_sum

            elif name.endswith('bias') and 'bn' in name:
                quantized_state_dict[name] = quantized_state_dict[name] / current_delta

    # Scale the classifier weights.
    quantized_state_dict['fc.weight'] = quantized_state_dict['fc.weight'] * delta_last

    # Evaluate a fresh model loaded with the quantized tensors.
    quant_model = AdderNet2_0(num_classes=10).to(device)
    quant_model.load_manual_weights(quantized_state_dict)
    quant_model.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = quant_model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    accuracy = 100. * correct / total
    return accuracy

def run_optimization():
    """
    Search per-layer activation cut-offs for the quantized AdderNet 2.0 with
    Differential Evolution.

    Steps: load the CIFAR-10 test loader, load the checkpoint
    ``AdderNet_model.pth`` from the working directory, rename its keys to the
    module names used by ``AdderNet2_0``, collect the adder weight keys, then
    run ``DE`` once per entry of ``bit_array`` with test accuracy as the
    objective. Results are printed; nothing is returned.

    The function prints an error and returns early when no adder layers are
    found, or when their number differs from the dimensionality of the
    search space (one cut-off per adder layer is required).
    """
    print("-+" * 25)
    print("Starting DE Optimization for AdderNet2.0 Quantization")
    print("-+" * 25)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load Data
    testloader = load_data(batch_size=200)

    # Load Pretrained Model Weights
    print('Loading pretrained weights...')
    #model_dir = '/kaggle/input/state-dictionary'
    #model_path = os.path.join(model_dir, 'AdderNet_model.pth')
    model_path = 'AdderNet_model.pth'

    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints from a trusted source (consider
    # weights_only=True on PyTorch versions that support it).
    checkpoint = torch.load(model_path, map_location=device)
    # Checkpoints may wrap the weights under a 'net' entry.
    state_dict_raw = checkpoint['net'] if 'net' in checkpoint else checkpoint

    def remap_key(key):
        """Map original checkpoint keys to the correct AdderNet naming."""
        new_key = key.replace('module.', '')

        # Stem and head names are already correct.
        if new_key.startswith(('conv1.', 'bn1.', 'fc.', 'bn2.')):
            return new_key

        # Residual stages: 'layerN.<block>.<rest>'
        for layer_num in [1, 2, 3]:
            prefix = f'layer{layer_num}.'
            if not new_key.startswith(prefix):
                continue

            rest = new_key[len(prefix):]  # everything after 'layerN.'

            # Expect a block index next.
            if len(rest) > 0 and rest[0].isdigit():
                dot_idx = rest.find('.')
                if dot_idx != -1:
                    block_num = rest[:dot_idx]
                    rest_after_block = rest[dot_idx+1:]

                    # Shortcut projection: 'downsample.0' is the adder,
                    # 'downsample.1' its BatchNorm. Both are registered on the
                    # stage itself rather than on the block.
                    if rest_after_block.startswith('downsample.'):
                        ds_rest = rest_after_block[len('downsample.'):]
                        if ds_rest.startswith('0.'):
                            return f'layer{layer_num}.downsample_adder.{ds_rest[2:]}'
                        elif ds_rest.startswith('1.'):
                            return f'layer{layer_num}.downsample_bn.{ds_rest[2:]}'

                    # Regular block entries: conv1 -> adder1, conv2 -> adder2;
                    # BatchNorm names are kept.
                    if 'conv1.' in rest_after_block:
                        rest_after_block = rest_after_block.replace('conv1.', 'adder1.')
                    elif 'conv2.' in rest_after_block:
                        rest_after_block = rest_after_block.replace('conv2.', 'adder2.')

                    return f'layer{layer_num}.blocks.{block_num}.{rest_after_block}'

        return new_key

    # Apply remapping (insertion order of the checkpoint is preserved).
    fixed_state_dict = {remap_key(k): v for k, v in state_dict_raw.items()}

    # Prepare keys
    print("Debug: First 10 keys in fixed_state_dict:")
    for key in list(fixed_state_dict.keys())[:10]:
        print(key)

    adder_keys = get_adder_layer_keys(fixed_state_dict)
    print(f"Found {len(adder_keys)} adder layers to optimize.")
    if len(adder_keys) == 0:
        print("Error: No adder layers found. Check model keys or filtering logic.")
        return

    # DE Parameters
    n_dim = 20
    size_pop = 50
    max_iter = 50
    prob_mut = 0.85 # Also called CR
    F = 0.4

    # Each search dimension is paired positionally with one adder layer, so
    # a different count would quantize layers with the wrong cut-offs.
    if len(adder_keys) != n_dim:
        print(f"Error: search space has {n_dim} dimensions but {len(adder_keys)} adder layers were found.")
        return

    #lb = [2.0] * 20
    #ub = [4.0] * 20

    # Activation value range: using best results from previous generations
    lb = [2.4, 2.73, 2.63, 2.4, 2.61, 2.45, 2.53, 2.48, 3.09, 2.5, 2.13, 2.36, 2.0, 2.52, 2.18, 2.0, 2.29, 2.21, 2.0, 2.38]
    ub = [2.4, 2.74, 2.67, 2.4, 2.61, 2.45, 2.68, 2.48, 3.10, 2.5, 2.13, 2.36, 2.0, 2.52, 2.18, 2.0, 2.29, 2.21, 2.0, 2.38]

    bit_array = [4] # Bits to be tested (unsigned integer 4)

    for bits in bit_array:
        print(f"\nOptimizing for {bits}-bit quantization...")

        # Objective wrapper; `bits` is bound as a default so the closure does
        # not depend on the loop variable's later values.
        def objective(x, bits=bits):
            acc = quantization_objective(x, fixed_state_dict, adder_keys, bits, device, testloader)
            return acc # maximizing accuracy

        de = DE(objective, F, lb, ub, size_pop, n_dim, max_iter, prob_mut)

        best_x, best_acc = de.run()

        print(f"Best Max Vals for {bits}-bit: {best_x}")
        print(f"Best Accuracy: {best_acc:.2f}%")
        print("-+" * 25)

# Entry point: start the DE search when this code runs as the main module
# (the recorded output below shows it executing as a notebook cell).
if __name__ == "__main__":
    run_optimization()


-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Starting DE Optimization for AdderNet2.0 Quantization
-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Using device: cuda
Preparing data..
Loading pretrained weights...
Debug: First 10 keys in fixed_state_dict:
conv1.weight
bn1.weight
bn1.bias
bn1.running_mean
bn1.running_var
bn1.num_batches_tracked
layer1.blocks.0.adder1.adder
layer1.blocks.0.bn1.weight
layer1.blocks.0.bn1.bias
layer1.blocks.0.bn1.running_mean
Found 20 adder layers to optimize.

Optimizing for 4-bit quantization...
start run
trial: [2.4  2.73 2.66 2.4  2.61 2.45 2.68 2.48 3.1  2.5  2.13 2.36 2.   2.52
 2.18 2.   2.29 2.21 2.   2.38]
Trial 0 accuracy: 89.19%
trial: [2.4  2.74 2.64 2.4  2.61 2.45 2.59 2.48 3.1  2.5  2.13 2.36 2.   2.52
 2.18 2.   2.29 2.21 2.   2.38]
Trial 1 accuracy: 89.18%
trial: [2.4  2.74 2.65 2.4  2.61 2.45 2.53 2.48 3.1  2.5  2.13 2.36 2.   2.52
 2.18 2.   2.29 2.21 2.   2.38]
Trial 2 accuracy: 88.99%
trial: [2.4  2.73 2.67 2.4  2.61 2.45 2.65 2

KeyboardInterrupt: 