## Dataset

In [12]:
import numpy as np
import struct
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
def load_mnist_images(filename):
    with open(filename, 'rb') as f:
        # Leggi intestazione: magic number, numero immagini, righe, colonne
        magic, num_images, rows, cols = struct.unpack(">IIII", f.read(16))
        # Leggi tutti i pixel e convertili in array numpy
        images = np.frombuffer(f.read(), dtype=np.uint8)
        # Ridimensiona l'array in (num_images, rows, cols)
        images = images.reshape((num_images, rows, cols))
    return images

def load_mnist_labels(filename):
    with open(filename, 'rb') as f:
        magic, num_labels = struct.unpack(">II", f.read(8))
        labels = np.frombuffer(f.read(), dtype=np.uint8)
    return labels
#-------------- Data Extraction ---------------------------

train_images = load_mnist_images('MNIST/train-images-idx3-ubyte')
train_labels = load_mnist_labels('MNIST/train-labels-idx1-ubyte')

test_images = load_mnist_images('MNIST/t10k-images.idx3-ubyte')
test_labels = load_mnist_labels('MNIST/t10k-labels.idx1-ubyte')

#--------------- Train data manipulation ------------------
print(train_images.shape)  # (60000, 28, 28)
print(train_labels.shape)  # (60000,)
one_hot_labels = np.zeros(train_labels.shape[0]*10).reshape((train_labels.shape[0]),10)
for i in range(len(train_labels)):
    one_hot_labels[i][train_labels[i]]=1
train_labels = one_hot_labels
print(train_labels.shape) # (60000,10)

#--------------- Test data manipulation -------------------
print(test_images.shape)  # (10000, 28, 28)
print(test_labels.shape)  # (10000,)
one_hot_labels = np.zeros(test_labels.shape[0]*10).reshape((test_labels.shape[0]),10)
for i in range(len(test_labels)):
    one_hot_labels[i][test_labels[i]]=1
test_labels = one_hot_labels
print(test_labels.shape) # (10000,10)

(60000, 28, 28)
(60000,)
(60000, 10)
(10000, 28, 28)
(10000,)
(10000, 10)


## CNN - PyTorch

The PyTorch model will be used as a reference to compute the weights since it's the fastest in training and the least prone to errors. If everything is written well, both slow and fast implementations of a CNN in numpy will give the same result, since the weights are the same.

### Model and Dataset Declaration with Training

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import time
from tqdm import tqdm 

# 1.------------------ CNN declaration -------------------

class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()

        # --------- Convolutional Layers ------------
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=2, stride=2, padding=0)
        self.relu1 = nn.ReLU()

        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=2, stride=2, padding=1)
        self.relu2 = nn.ReLU()

        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=2, stride=2, padding=0)
        self.relu3 = nn.ReLU()
        # ---------- Flatten to become MLP's input -----------
        self.flatten = nn.Flatten()
        fc_input_size = 128 * 4 * 4
        # ---------- Multi Layer Perceptron ---------------
        # Only one hidden layer for classification
        self.fc1 = nn.Linear(in_features=fc_input_size, out_features=250)
        self.relu4 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=250, out_features=num_classes)

    def forward(self, x):
        # First convolution: from 1x1x28x28 to 1x32x14x14
        x = self.conv1(x)
        x = self.relu1(x)
        # Second Convolution: from 1x32x14x14 to 1x64x8x8
        x = self.conv2(x)
        x = self.relu2(x)
        # Third Convolution: from 1x64x8x8 to 1x128x4x4
        x = self.conv3(x)
        x = self.relu3(x)
        # Flatten
        x = self.flatten(x)
        # MLP
        x = self.fc1(x)
        x = self.relu4(x)
        out_fc1_relu = x.clone() # Salva
        x = self.fc2(x)

        return x, out_fc1_relu

# # 2.------------------ CNN's Dataset declaration ----------------------

# class CNNDataset(Dataset):
#     def __init__(self, digits, labels, transform=None):
#         assert len(digits) == len(labels), "Number of digits and labels doesn't match"
#         self.digits = digits
#         self.labels = labels

#     def __len__(self):
#         return len(self.digits)

#     def __getitem__(self, idx):
#         digit = self.digits[idx]
#         label = self.labels[idx]
#         digit = digit.unsqueeze(0) # Needed operation to add the dimension of greyscale images (28,28) -> (1,28,28)
#         return digit, label

# tri = torch.from_numpy(train_images).float() / 255
# trl = torch.from_numpy(train_labels).float()
# tsi = torch.from_numpy(test_images).float() / 255
# tsl = torch.from_numpy(test_labels).float()

# train_dataset = CNNDataset(tri,trl)
# test_dataset = CNNDataset(tsi,tsl)

# batch_size = 128
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# # 3.------ Training Setup ---------------

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# print(f"device: {device}")

# model = SimpleCNN(num_classes=10).to(device)

# # Loss definition
# criterion = nn.CrossEntropyLoss() 

# # Optimisation definition
# learning_rate = 0.001
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# num_epochs = 5 

# # 4.------- cycle training ------

# print("\nStarting Training...")
# for epoch in range(num_epochs):

#     model.train() 

#     running_loss = 0.0
#     start_time = time.time()
#     #tqdm is module used to have a progress bar
#     progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

#     for inputs, labels in progress_bar:

#         # move data on the device
#         inputs, labels = inputs.to(device), labels.to(device)

#         # make all gradients zero to avoid learning on gradients of previous steps
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(inputs) 
#         # loss computation
#         loss = criterion(outputs, labels)

#         # Backward pass: compute the gradients
#         loss.backward()

#         # Weights update
#         optimizer.step()

#         # Update the loss
#         running_loss += loss.item() * inputs.size(0) # multiply for batch size to obtain the correct mean

#         # Update the progress bar
#         progress_bar.set_postfix(loss=f"{loss.item():.4f}")

#     # Epochs' mean loss computation
#     epoch_loss = running_loss / len(train_loader.dataset)
#     epoch_time = time.time() - start_time

#     print(f"Epoch {epoch+1}/{num_epochs} - Tempo: {epoch_time:.2f}s - Training Loss: {epoch_loss:.4f}")

#     # --- Test evaluation (after every epoch) ---
#     model.eval()
#     test_loss = 0.0
#     correct = 0
#     total = 0

#     with torch.no_grad(): # Disable gradient computation (we don't need gradients since we don't want to update the model in this phase)
#         i=0
#         for inputs, labels in test_loader:
#             if i >= 1:
#                 continue
#             inputs, labels = inputs.to(device), labels.to(device)
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             test_loss += loss.item() * inputs.size(0)
#             _, predicted = torch.max(outputs.data, 1) # Obtain index with the maximum probability (it is our result)
#             _,labels = torch.max(labels,1) # same for the test labels
#             total += labels.size(0)
#             correct += (predicted==labels).sum().item()
#             i+=1

#     avg_test_loss = test_loss / len(test_loader.dataset)
#     accuracy = 100 * correct / total
#     print(f"Epoch {epoch+1}/{num_epochs} - Test Loss: {avg_test_loss:.4f} - Test Accuracy: {accuracy:.2f}%")


# print("\nTraining Complete.")
# #2m 9.4 secondi per avere un'epoca con cuda
# # save the model
# torch.save(model.state_dict(), 'simple_cnn_mnist.pth')

### Weights extraction

In [14]:
model = SimpleCNN(num_classes=10)
model.load_state_dict(torch.load('simple_cnn_mnist.pth', map_location=torch.device('cpu'),weights_only=True)) # Carica su CPU

model.eval() # good practice is to set model in evaluation when you want to extract weights

# --- Parameters Extraction ⛏️ and Numpy Conversion ---

# Weights container
numpy_weights = {}

# Move model on cpu
model.to('cpu')

print("⛏️ Weights and Bias Extraction ⛏️\n")

# Layer Conv1
# PyTorch weight shape: (out_channels, in_channels, kernel_height, kernel_width)
# NumPy expected: (in_channels, out_channels, kernel_width, kernel_height) -> (1, 32, 3, 3)
pyt_k1_w = model.conv1.weight.data.detach().numpy()
# Transpose: (out, in, kH, kW) -> (in, out, kW, kH)
numpy_weights['k1'] = pyt_k1_w

# PyTorch bias shape: (out_channels,)
numpy_weights['b_conv1'] = model.conv1.bias.data.detach().numpy() # Shape (32,)
print(f"k1: PyTorch Shape={pyt_k1_w.shape}, NumPy Shape={numpy_weights['k1'].shape}")
print(f"b_conv1: NumPy Shape={numpy_weights['b_conv1'].shape}")

# Layer Conv2
# PyTorch weight shape: (64, 32, 3, 3)
# NumPy expected: (32, 64, 3, 3)
pyt_k2_w = model.conv2.weight.data.detach().numpy()
numpy_weights['k2'] = pyt_k2_w
numpy_weights['b_conv2'] = model.conv2.bias.data.detach().numpy() # Shape (64,)
print(f"k2: PyTorch Shape={pyt_k2_w.shape}, NumPy Shape={numpy_weights['k2'].shape}")
print(f"b_conv2: NumPy Shape={numpy_weights['b_conv2'].shape}")

# Layer Conv3
# PyTorch weight shape: (128, 64, 3, 3)
# NumPy expected: (64, 128, 3, 3)
pyt_k3_w = model.conv3.weight.data.detach().numpy()
numpy_weights['k3'] = pyt_k3_w
numpy_weights['b_conv3'] = model.conv3.bias.data.detach().numpy() # Shape (128,)
print(f"k3: PyTorch Shape={pyt_k3_w.shape}, NumPy Shape={numpy_weights['k3'].shape}")
print(f"b_conv3: NumPy Shape={numpy_weights['b_conv3'].shape}")

# Layer FC1
# PyTorch weight shape: (out_features, in_features) -> (250, 2048)
# NumPy expected (per input @ W): (in_features, out_features) -> (2048, 250)
pyt_w1 = model.fc1.weight.data.detach().numpy()
numpy_weights['w1'] = pyt_w1.T # Trasponi
# PyTorch bias shape: (out_features,) -> (250,)
# NumPy expected (per aggiunta diretta): (1, out_features) -> (1, 250)
pyt_b1 = model.fc1.bias.data.detach().numpy()
numpy_weights['b1'] = pyt_b1.reshape(1, -1) # Rendi (1, 250)
print(f"w1: PyTorch Shape={pyt_w1.shape}, NumPy Shape={numpy_weights['w1'].shape}")
print(f"b1: PyTorch Shape={pyt_b1.shape}, NumPy Shape={numpy_weights['b1'].shape}")

# Layer FC2
# PyTorch weight shape: (num_classes, 250) -> (10, 250)
# NumPy expected: (250, num_classes) -> (250, 10)
pyt_w2 = model.fc2.weight.data.detach().numpy()
numpy_weights['w2'] = pyt_w2.T # Trasponi
# PyTorch bias shape: (num_classes,) -> (10,)
# NumPy expected: (1, num_classes) -> (1, 10)
pyt_b2 = model.fc2.bias.data.detach().numpy()
numpy_weights['b2'] = pyt_b2.reshape(1, -1) # Rendi (1, 10)
print(f"w2: PyTorch Shape={pyt_w2.shape}, NumPy Shape={numpy_weights['w2'].shape}")
print(f"b2: PyTorch Shape={pyt_b2.shape}, NumPy Shape={numpy_weights['b2'].shape}")

print("\nExtraction complete. Numpy weights are in the dictionary 'numpy_weights'.")

# Access Example:
np_k1 = numpy_weights['k1']
np_b_conv1 = numpy_weights['b_conv1']
np_k2 = numpy_weights['k2']
np_b_conv2 = numpy_weights['b_conv2']
np_k3 = numpy_weights['k3']
np_b_conv3 = numpy_weights['b_conv3']
np_w1 = numpy_weights['w1']
np_b1 = numpy_weights['b1']
np_w2 = numpy_weights['w2']
np_b2 = numpy_weights['b2']



# [[[[-0.06239345  0.16331542  0.28573602]
#    [ 0.299534    0.48019555  0.25194943]
#    [-0.24432278  0.3191273  -0.06802213]]]


#  [[[ 0.10294101 -0.14240074  0.01178457]
#    [ 0.3072691  -0.06823204  0.30347323]
#    [-0.06327374  0.3396498   0.07433306]]]



#    [[[[-0.06239345  0.16331542  0.28573602]
#    [ 0.299534    0.48019555  0.25194943]
#    [-0.24432278  0.3191273  -0.06802213]]

#   [[ 0.10294101 -0.14240074  0.01178457]
#    [ 0.3072691  -0.06823204  0.30347323]
#    [-0.06327374  0.3396498   0.07433306]]

⛏️ Weights and Bias Extraction ⛏️

k1: PyTorch Shape=(32, 1, 2, 2), NumPy Shape=(32, 1, 2, 2)
b_conv1: NumPy Shape=(32,)
k2: PyTorch Shape=(64, 32, 2, 2), NumPy Shape=(64, 32, 2, 2)
b_conv2: NumPy Shape=(64,)
k3: PyTorch Shape=(128, 64, 2, 2), NumPy Shape=(128, 64, 2, 2)
b_conv3: NumPy Shape=(128,)
w1: PyTorch Shape=(250, 2048), NumPy Shape=(2048, 250)
b1: PyTorch Shape=(250,), NumPy Shape=(1, 250)
w2: PyTorch Shape=(10, 250), NumPy Shape=(250, 10)
b2: PyTorch Shape=(10,), NumPy Shape=(1, 10)

Extraction complete. Numpy weights are in the dictionary 'numpy_weights'.


## CNN - NumPy

### Padding

`np.pad()` takes as first argument the matrix to pad and as second argument a set of specification: for every dimension (in our case 4) it takes the number of paddings to add before and after the end of the dimension. If the objective is to pad only the image itself, which is found in the last two dimension, we should write:

`np.pad(img9,((0,0),(0,0),(pad,pad),(pad,pad)))` 

since dimensions are: BATCH, CHANNELS, HEIGHT, WIDTH

In [15]:
img9 = np.arange(1,37).reshape(2,2,3,3)
pad_img9 = np.pad(img9,((0,0),(0,0),(1,1),(1,1)))
print(img9)
print(pad_img9)

[[[[ 1  2  3]
   [ 4  5  6]
   [ 7  8  9]]

  [[10 11 12]
   [13 14 15]
   [16 17 18]]]


 [[[19 20 21]
   [22 23 24]
   [25 26 27]]

  [[28 29 30]
   [31 32 33]
   [34 35 36]]]]
[[[[ 0  0  0  0  0]
   [ 0  1  2  3  0]
   [ 0  4  5  6  0]
   [ 0  7  8  9  0]
   [ 0  0  0  0  0]]

  [[ 0  0  0  0  0]
   [ 0 10 11 12  0]
   [ 0 13 14 15  0]
   [ 0 16 17 18  0]
   [ 0  0  0  0  0]]]


 [[[ 0  0  0  0  0]
   [ 0 19 20 21  0]
   [ 0 22 23 24  0]
   [ 0 25 26 27  0]
   [ 0  0  0  0  0]]

  [[ 0  0  0  0  0]
   [ 0 28 29 30  0]
   [ 0 31 32 33  0]
   [ 0 34 35 36  0]
   [ 0  0  0  0  0]]]]


### Dilating

`dilateOne` adds one zero between each element in the matrix given in input. this is done to be able to do the backward phase with stride 1 even in the forward it was 2, by modifying the gradient of the output. Motivations will be better analyzed in the next sections

In [16]:
def dilateOne(matrix):
    indix = np.arange(1,matrix.shape[3])
    matrix = np.insert(matrix,indix,0,3)
    indix = np.arange(-(matrix.shape[-2]-1),0)
    matrix = np.insert(matrix,indix,0,-2)
    return matrix

### Slow Convolution Layer: Forward

In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# This is a PyTorch Convolution example to be used to check if the convolution implemented in both slow and fast approaches are correct

class CustomConv(nn.Module):
    def __init__(self, kernel: torch.Tensor, bias: torch.Tensor = None, 
                 stride=1, padding=0):
        super().__init__()
        out_ch, in_ch, k_h, k_w = kernel.shape
        self.stride = stride
        self.padding = padding
        
        self.conv = nn.Conv2d(in_channels=in_ch,
                              out_channels=out_ch,
                              kernel_size=(k_h, k_w),
                              stride=stride,
                              padding=padding,
                              bias=(bias is not None))
        with torch.no_grad():
            self.conv.weight.copy_(kernel)
            if bias is not None:
                self.conv.bias.copy_(bias)

        self.conv.weight.requires_grad_(False)
        if bias is not None:
            self.conv.bias.requires_grad_(False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.relu(self.conv(x))

def Slow_ReLU_Conv(img,ker,bias=np.array(0),pad=0,stride=1,applyReLU=True):
    if applyReLU: # Forward case
        out_ch, in_ch, k_width, k_height = ker.shape
        nk_channel = out_ch
    else: # Backward case
        in_ch, out_ch, k_width, k_height = ker.shape
        nk_channel = in_ch

    # bias has shape out_ch, 1, 1. It's a scalar value for each channel broadcasted to the kernel's width and height
    # number of channels taken in input by the kernel 'in_ch' 
    # must be the same as the number of channels of the image 'channels'

    img = np.pad(img,((0,0),(0,0),(pad,pad),(pad,pad)))
    n_images, channels, i_height, i_width  = img.shape
    ni_height = int(((i_height - k_height) / stride) + 1) # new image height # Padding is already added
    ni_width = int(((i_width - k_width) / stride) + 1) # new image width
    ni = np.zeros((n_images, out_ch, ni_height, ni_width)).astype(np.float32) # new image

    if in_ch != channels:
        raise ValueError(f"number of channels taken in input by the kernel ({in_ch}) must be the same as the number of channels of the image ({channels})")

    for one_img in range(n_images):
        for one_k_channel in range(nk_channel):
            for i_nih in range(ni_height): # which cycles row by row of the new image
                for i_niw in range(ni_width): # which cycles column by column of the new image
                    current_sum = 0.0 # convolution sum for the specific output cell
                    # Convolution cycles
                    for channel in range(channels): # channels == in_ch
                        for i_kh in range(k_height):
                            input_y = (i_nih * stride) + i_kh # get the y location, the height
                            for i_kw in range(k_width):
                                input_x = (i_niw * stride) + i_kw # get the x location, the width
                                # check that everything stays in the measures
                                if 0 <= input_y < i_height and 0 <= input_x < i_width:
                                    input_val = img[one_img, channel, input_y, input_x]
                                    kernel_val = ker[one_k_channel, channel, i_kh, i_kw]
                                    current_sum += (input_val * kernel_val).astype(np.float32)
                    ni[one_img, one_k_channel, i_nih, i_niw] = current_sum
    if bias.all() != 0:
        bias = bias.reshape(bias.shape[0],1,1)
        if bias.shape[0] != out_ch:
            raise ValueError(f"bias dimension ({bias.shape[0]}) doesn't match kernel's number of channels ({out_ch})")
        ni = ni + bias
    ni = ni.astype(np.float32)
    if applyReLU:
        ni = np.maximum(0, ni)
        mask = ni.copy()
        mask[mask > 0] = 1
        return ni,mask
    else:
        return ni
#-------------------------------------------- Examples --------------------------------------------------------
img = np.arange(1,18+1).reshape(2,1,3,3).astype(np.float32)
print("-------img-------")
print(img)
ker = np.array([1,2,3,4]).reshape(1,1,2,2)
print("-------ker-------")
print(ker)
bias = np.array([0]).reshape(1,1,1)
res,mask = Slow_ReLU_Conv(img,ker,bias,pad=0,stride=1)
print("-------Conv Slow-------")
print(res)
# print("------mask-------")
# print(mask)


my_kernel = torch.from_numpy(ker).float()

my_bias = torch.from_numpy(np.array([1,2])).float()

modelC = CustomConv(kernel=my_kernel,bias=my_bias, stride=1, padding=0)

# input di prova (batch=1, canali=1, H=5, W=5)
x = torch.from_numpy(img)
y = modelC(x)
print("-------Conv PyTorch-------")
print(y)

-------img-------
[[[[ 1.  2.  3.]
   [ 4.  5.  6.]
   [ 7.  8.  9.]]]


 [[[10. 11. 12.]
   [13. 14. 15.]
   [16. 17. 18.]]]]
-------ker-------
[[[[1 2]
   [3 4]]]]
-------Conv Slow-------
[[[[ 37.  47.]
   [ 67.  77.]]]


 [[[127. 137.]
   [157. 167.]]]]


RuntimeError: output with shape [1] doesn't match the broadcast shape [2]

### Slow Convolution Layer: Backward

**Actors:**
1. W is the kernel
2. $\delta$ is the gradient
3. x is the input to the convolution layer during forward
4. b is the bias

**Steps:**

- **Derive delta**

Deriving delta with respect to ReLU activation consists in the hadamard product (element-wise product) of the gradient ($\delta$) and the mask obtained at the forward step, that is, all the elements in the convolved image greater than zero are put to one, the rest is zero.
$$
\delta^{(i)} = \delta_{\text{flat reshaped}} \cdot \text{mask}
$$

- **Gradient with respect to W**:

$$
\frac{\partial L}{\partial W^{(i)}} = \text{Convolution}(x^{(i)}, \delta)
$$
This convolution creates a matrix for every channel of input image $x^{i}$ and for every channel of output image $\delta$, thus resulting in the correct number of channels

- **Gradient w.r.t. the input \( x \)** (To go to the preceding layer):

$$
\delta^{(i-1)} = \text{Full\_Convolution}(\delta^{(i)}, W^{(i)})
$$

- **Gradient w.r.t the bias**

Since the bias is added equally across the spatial dimensions of each output channel, the gradient is the sum of all elements in each output channel:

$$
\frac{\partial L}{\partial b^{(i)}_c} = \sum_{h,w} \delta^{(i)}_{c,h,w}
$$

For batched inputs, sum also across the batch dimension:

$$
\frac{\partial L}{\partial b^{(i)}_c} = \sum_{n,h,w} \delta^{(i)}_{n,c,h,w}
$$

In [18]:
def Slow_ReLU_Gradient(img,d_img,ker,mask,pad=0,stride=1):
    """
    NEW APPROACH !
    Performs the backward pass of the convolution layer. It takes the original image, 
    the gradient image, and then the kernel, padding and stride used in the convolution. Also the mask is needed to perform the ReLU operation.
    It returns the gradient w.r.t. the Original Image to back propagate and the gradient of the kernel
    """ 
    ############################################# Gradient of Input Image ####################################
    # The computation consists in a convolution where the image is the gradient of the output image delated (zeros between matrix elements) of stride-1
    # and padded of kernel-1 dimensions 
    # and the kernel 180 degrees rotation (flipped vertically and then horizontally)
    # FullConvolution(d_imgDelated, Rotated180Deg(kernel)) with stride 1
    out_ch, in_ch, k_height, k_width = ker.shape
    batch_s, in_ch, img_height, img_width = img.shape

    # backward ReLU
    d_img = np.multiply(d_img,mask)

    # Delating the gradient of output
    if stride == 2:
        d_img = dilateOne(d_img)
    elif stride > 2:
        raise ValueError(f"Stride greater than 2 is not acceptable")
    d_imgPadded = np.pad(d_img,((0,0),(0,0),(k_height-1-pad,k_height-1-pad),(k_width-1-pad,k_width-1-pad)))
    batch_s, out_ch, dimg_height, dimg_width = d_img.shape
    
    # flipping the kernel
    ker180 = np.rot90(ker,2,(-2,-1))

    # Computation
    gi = np.zeros_like(img)
    current_sum = 0.0
    for bs in range(batch_s):
        for i_gih in range(img_height):
            for i_giw in range(img_width):
                for i_outch in range(out_ch):
                    for i_inch in range(in_ch):
                        for i_kh in range(k_height):
                            y = i_gih + i_kh
                            for i_kw in range(k_width):
                                x = i_giw + i_kw

                                if 0 <= y < d_imgPadded.shape[-2] and 0 <= x < d_imgPadded.shape[-1]:
                                    input_val = d_imgPadded[bs,i_outch,y,x]
                                    ker_val = ker180[i_outch,i_inch,i_kh,i_kw] 
                                else:
                                    break
                                current_sum += input_val*ker_val
                    gi[bs,i_inch,i_gih,i_giw] = current_sum
                    current_sum = 0.0

    ############################################# Gradient of Kernel ####################################
    # The computation consists in a convolution between the original image and the delated gradient of the output image in order to
    # find the kernel
    gk = np.zeros_like(ker)
    img = np.pad(img,((0,0),(0,0),(pad,pad),(pad,pad)))
    current_sum = 0.0
    for bs in range(batch_s):
        for i_gih in range(k_height):
            for i_giw in range(k_width):
                for i_inch in range(in_ch):
                    for i_outch in range(out_ch):
                        for i_kh in range(dimg_height):
                            y = i_gih + i_kh
                            for i_kw in range(dimg_width):
                                x = i_gih + i_kw
                                if 0 <= y < img_height and 0 <= x < img_width:
                                    input_val = img[bs,i_inch,y,x]
                                    ker_val = d_img[bs,i_outch,i_kh,i_kw] 
                                    current_sum += input_val*ker_val
                                else:
                                    break
                        gk[i_outch,i_inch,i_gih,i_giw] = current_sum
                        current_sum = 0.0

    ############################################# Gradient of Bias ####################################
    # The computation consists in summing the gradient of the output image together to find the bias for every channel
    gb = d_img.sum((-1,-2)) # sum over height and width
    
    ################################################### Return Results ###############################################
    return gi,gk,gb

in_ch = 1
out_ch = 2
idim = 7
kdim = 2
s = 2
p = 1
imAge = np.arange(1,1*in_ch*idim*idim+1).reshape(1,in_ch,idim,idim)
kerNel = np.arange(1,out_ch*in_ch*(kdim**2)+1).reshape(out_ch,in_ch,kdim,kdim)
dimAge,mask = Slow_ReLU_Conv(imAge,kerNel,stride=s,pad=p) 
dimAge = dimAge/np.mean(dimAge)
ggi,ggk,ggb = Slow_ReLU_Gradient(imAge,dimAge,kerNel,mask,stride=s,pad=p)
print(f"imAge: {imAge.shape}")
print(f"kerNel: {kerNel.shape}")
print(f"dimAge: {dimAge.shape}")
print(f"ggi: {ggi.shape}")
print(f"ggk: {ggk.shape}")

imAge: (1, 1, 7, 7)
kerNel: (2, 1, 2, 2)
dimAge: (1, 2, 4, 4)
ggi: (1, 1, 7, 7)
ggk: (2, 1, 2, 2)


In [19]:
import numpy as np

def Slow_ReLU_Gradient(img_fwd, d_out_values, ker, mask_relu_fwd, pad_fwd=0, stride_fwd=1):
    """
    Calcola i gradienti per un layer Conv+ReLU.
    img_fwd: input del forward pass.
    d_out_values: gradiente dell'output del layer (dopo ReLU).
    ker: kernel del forward pass.
    mask_relu_fwd: maschera della ReLU del forward (attivazioni > 0).
    pad_fwd: padding usato nel forward.
    stride_fwd: stride usato nel forward.
    Restituisce: grad_input, grad_kernel, grad_bias.
    """
    batch_s, in_ch, img_height, img_width = img_fwd.shape
    out_ch, _, k_height, k_width = ker.shape
    _, _, out_h_fwd, out_w_fwd = d_out_values.shape # Dimensioni dell'output del forward

    # 0. Normalizza pad e stride se sono interi
    if isinstance(pad_fwd, int):
        pad_fwd_h, pad_fwd_w = pad_fwd, pad_fwd
    else:
        pad_fwd_h, pad_fwd_w = pad_fwd

    if isinstance(stride_fwd, int):
        stride_fwd_h, stride_fwd_w = stride_fwd, stride_fwd
    else:
        stride_fwd_h, stride_fwd_w = stride_fwd

    # 1. Backward pass attraverso la ReLU
    # Moltiplica il gradiente in arrivo per la maschera della derivata della ReLU.
    # dL/d(output_conv) = dL/d(output_relu) * ReLU'(input_relu)
    # d_conv_out è il gradiente rispetto all'output della convoluzione (prima della ReLU)
    d_conv_out = np.multiply(d_out_values, mask_relu_fwd)

    # 2. Calcolo del gradiente rispetto ai BIAS (dL/db)
    # Somma d_conv_out (gradiente dopo backward ReLU) su batch, altezza e larghezza per ogni canale di output.
    # grad_bias ha shape (out_ch,)
    grad_bias = np.sum(d_conv_out, axis=(0, 2, 3))

    # 3. Calcolo del gradiente rispetto al KERNEL (dL/dk)
    # Questo è una convoluzione tra l'input del forward (img_fwd, paddato come nel forward)
    # e il gradiente d_conv_out (dilatato se stride_fwd > 1).
    # PyTorch lo fa come: correlate(input_paddato, grad_output_dilatato)
    
    # Dilatazione di d_conv_out se stride > 1 (per il calcolo di dL/dk e dL/din)
    # d_conv_out_dilated = d_conv_out # Inizializza
    # if stride_fwd_h > 1 or stride_fwd_w > 1:
        # Qui è necessaria una funzione di dilatazione che inserisca stride-1 zeri
        # tra gli elementi di d_conv_out. Per semplicità, se la tua 'delateOne'
        # funziona per stride=2, la usiamo. Altrimenti, va implementata.
        # Questa dilatazione è usata specificamente per il calcolo del gradiente dell'input.
        # Per il gradiente del kernel, PyTorch usa l'input originale e il d_conv_out non dilatato,
        # ma con uno "stride" nella patch di input (o "dilation" nel kernel della convoluzione per dL/dk).

    # Usiamo un approccio standard: convolvere l'input (paddato) con d_conv_out (non dilatato)
    # L'input al layer convoluzionale, img_fwd, deve essere paddato come nel forward.
    img_fwd_padded = np.pad(img_fwd,
                            ((0, 0), (0, 0), (pad_fwd_h, pad_fwd_h), (pad_fwd_w, pad_fwd_w)),
                            mode='constant', constant_values=0)

    grad_kernel = np.zeros_like(ker)
    for n in range(batch_s): # Somma i contributi di ogni campione del batch
        for c_out in range(out_ch):
            for c_in in range(in_ch):
                for r_k in range(k_height):
                    for c_k in range(k_width):
                        val = 0.0
                        for r_o in range(out_h_fwd): # Altezza dell'output del forward
                            for c_o in range(out_w_fwd): # Larghezza dell'output del forward
                                # Coordinate nell'input paddato che hanno contribuito a output[n, c_out, r_o, c_o]
                                # usando kernel[c_out, c_in, r_k, c_k]
                                img_r = r_o * stride_fwd_h + r_k
                                img_c = c_o * stride_fwd_w + c_k
                                val += img_fwd_padded[n, c_in, img_r, img_c] * d_conv_out[n, c_out, r_o, c_o]
                        grad_kernel[c_out, c_in, r_k, c_k] += val


    # 4. Calcolo del gradiente rispetto all'INPUT (dL/din)
    # Questo è concettualmente una "full convolution" o "transposed convolution".
    # Si può implementare come una convoluzione standard:
    #   Input della convoluzione: d_conv_out (dilatato e paddato in modo speciale)
    #   Kernel della convoluzione: ker (ruotato di 180 gradi)
    
    # Dilatazione di d_conv_out: inserire (stride - 1) zeri
    d_conv_out_dilated = d_conv_out # Default per stride=1
    if stride_fwd_h > 1 or stride_fwd_w > 1:
        # Creazione di un array più grande per la dilatazione
        dil_h = out_h_fwd + (out_h_fwd - 1) * (stride_fwd_h - 1)
        dil_w = out_w_fwd + (out_w_fwd - 1) * (stride_fwd_w - 1)
        d_conv_out_dilated = np.zeros((batch_s, out_ch, dil_h, dil_w), dtype=d_conv_out.dtype)
        d_conv_out_dilated[:, :, ::stride_fwd_h, ::stride_fwd_w] = d_conv_out

    # Padding per la "full convolution" (convoluzione trasposta)
    # p' = k - 1 - p_fwd
    pad_tp_h = k_height - 1 - pad_fwd_h
    pad_tp_w = k_width - 1 - pad_fwd_w
    
    # Assicurati che il padding non sia negativo (può succedere se p_fwd è grande)
    # In tal caso, si potrebbe dover "croppare" invece di paddare, o aggiustare.
    # PyTorch gestisce questo con output_padding nella ConvTranspose.
    # Per ora, assumiamo pad_tp_h/w >= 0. Se sono negativi, la logica di np.pad non va bene.
    # Considera il padding necessario per ottenere le dimensioni di img_fwd.
    # Le dimensioni dell'output di questa convoluzione devono essere img_height, img_width.
    # Se out_dil = (H_in - K + 2P_tp)/S_tp + 1, con S_tp=1.  H_out_dil = H_img_fwd
    # H_img_fwd = H_d_conv_out_dilated - K + 2P_tp + 1
    # 2P_tp = H_img_fwd - H_d_conv_out_dilated + K - 1
    # P_tp = (H_img_fwd - H_d_conv_out_dilated + K - 1) / 2
    # Questo è più complesso. Il padding p' = K-1-p_fwd è un modo comune di pensarlo.
    # Se p' è negativo, significa che dopo la convoluzione, il risultato è più grande dell'input originale
    # e andrebbe croppato. np.pad non gestisce padding negativo.

    # Per semplicità, se pad_tp_h o pad_tp_w sono negativi, impostali a 0 e si dovrà croppare dopo.
    # Questo è un punto delicato.
    effective_pad_tp_h_pre = max(0, pad_tp_h)
    effective_pad_tp_w_pre = max(0, pad_tp_w)
    effective_pad_tp_h_post = max(0, pad_tp_h) # Simmetrico per ora
    effective_pad_tp_w_post = max(0, pad_tp_w) # Simmetrico per ora
    
    # Se usi la formula del padding per conv transpose p' = k - 1 - p_fwd, e il risultato
    # della convoluzione è più grande, allora bisogna fare un crop.
    # Calcoliamo le dimensioni attese dell'output della convoluzione (senza crop/pad finale)
    # H_conv_out = d_conv_out_dilated.shape[2] - k_height + 2*effective_pad_tp_h + 1
    # W_conv_out = d_conv_out_dilated.shape[3] - k_width + 2*effective_pad_tp_w + 1
    # Se queste non corrispondono a img_height/img_width, è necessario un aggiustamento (crop o padding aggiuntivo).

    d_conv_out_dilated_padded = np.pad(d_conv_out_dilated,
                                       ((0,0), (0,0), (effective_pad_tp_h_pre, effective_pad_tp_h_post), (effective_pad_tp_w_pre, effective_pad_tp_w_post)),
                                       mode='constant', constant_values=0)

    # Kernel ruotato di 180 gradi
    # ker [out_ch, in_ch, k_h, k_w] -> ker_flipped [in_ch, out_ch, k_h, k_w] per conv_transpose
    # O ker_flipped [out_ch, in_ch, k_h, k_w] e si itera diversamente.
    # PyTorch usa il kernel originale ma riarrangia i canali.
    # Per una convoluzione standard, il kernel è [out_channels_conv, in_channels_conv, kh, kw]
    # L'input è d_conv_out_dilated_padded (in_channels_conv = out_ch originali)
    # L'output è grad_input (out_channels_conv = in_ch originali)
    # Quindi il kernel per questa convoluzione deve avere shape [in_ch, out_ch, k_h, k_w]
    # e i suoi valori sono quelli di ker originale, ma flippati e con i canali scambiati.
    
    kernel_for_grad_input = np.zeros((in_ch, out_ch, k_height, k_width), dtype=ker.dtype)
    for c_o in range(out_ch):
        for c_i in range(in_ch):
            kernel_for_grad_input[c_i, c_o, :, :] = ker[c_o, c_i, ::-1, ::-1] # Flip spaziale

    grad_input = np.zeros_like(img_fwd)
    # Convoluzione standard: grad_input = conv(d_conv_out_dilated_padded, kernel_for_grad_input)
    # (stride di questa convoluzione è 1)
    for n in range(batch_s):
        for c_i_gi in range(in_ch): # Canale di output di questa convoluzione
            for r_gi in range(img_height): # Riga nell'output grad_input
                for c_gi in range(img_width): # Colonna nell'output grad_input
                    val = 0.0
                    for c_o_dcnv in range(out_ch): # Canale di input di questa convoluzione
                        for r_k in range(k_height):
                            for c_k in range(k_width):
                                # Coordinate nell'input d_conv_out_dilated_padded
                                # Stride qui è 1
                                dcnv_r = r_gi + r_k
                                dcnv_c = c_gi + c_k
                                # Verifica confini per d_conv_out_dilated_padded
                                if 0 <= dcnv_r < d_conv_out_dilated_padded.shape[2] and \
                                   0 <= dcnv_c < d_conv_out_dilated_padded.shape[3]:
                                    val += d_conv_out_dilated_padded[n, c_o_dcnv, dcnv_r, dcnv_c] * \
                                           kernel_for_grad_input[c_i_gi, c_o_dcnv, r_k, c_k]
                    grad_input[n, c_i_gi, r_gi, c_gi] = val
    
    # Eventuale cropping di grad_input se il padding ha prodotto dimensioni maggiori
    # Questo è un punto complesso da generalizzare senza conoscere le esatte dimensioni
    # risultanti dal padding. Se il padding `p' = k-1-p_fwd` è stato usato correttamente,
    # le dimensioni dovrebbero corrispondere. Se sono stati usati `max(0, p')`,
    # allora `grad_input` potrebbe essere più piccolo e non necessitare crop, oppure
    # il calcolo del padding deve essere più preciso per garantire la dimensione corretta.
    # Se `img_height = (d_conv_out_dilated_padded.shape[2] - k_height)/1 + 1`, allora ok.

    return grad_input, grad_kernel, grad_bias

in_ch = 1
out_ch = 2
idim = 7
kdim = 2
s = 2
p = 1
imAge = np.arange(1,1*in_ch*idim*idim+1).reshape(1,in_ch,idim,idim)
kerNel = np.arange(1,out_ch*in_ch*(kdim**2)+1).reshape(out_ch,in_ch,kdim,kdim)
dimAge,mask = Slow_ReLU_Conv(imAge,kerNel,stride=s,pad=p) 
dimAge = dimAge/np.mean(dimAge)
ggi,ggk,ggb = Slow_ReLU_Gradient(imAge,dimAge,kerNel,mask,stride_fwd=s,pad_fwd=p)
print(f"imAge: {imAge.shape}")
print(f"kerNel: {kerNel.shape}")
print(f"dimAge: {dimAge.shape}")
print(f"ggi: {ggi.shape}")
print(f"ggk: {ggk.shape}")

imAge: (1, 1, 7, 7)
kerNel: (2, 1, 2, 2)
dimAge: (1, 2, 4, 4)
ggi: (1, 1, 7, 7)
ggk: (2, 1, 2, 2)


### Fast Convolution Layer: Forward

In [20]:
def Fast_ReLU_Conv(batch_of_images,kernel,bias=np.array(0),pad=0,stride=1,applyReLU=True):
    kc, ac, kw, kh = kernel.shape # number of kernels, number of input channels, kernel width and kernel height
    # im2col: Window creation
    batch_of_images = np.pad(batch_of_images,((0,0),(0,0),(pad,pad),(pad,pad)))
    bs, nc, iw, ih = batch_of_images.shape # batch of images' number of images, number of channels, single image's width, single images's height
    window_m = np.lib.stride_tricks.sliding_window_view(batch_of_images,(1,nc,kw,kh))[:,:,::stride,::stride].reshape((-1,(kw*kh*nc))) # window matrix
    # Convolution
    kernel = kernel.reshape((-1,(kw*kh*nc))).transpose(1,0)
    c_m = (window_m @ kernel).astype(np.float32) # convolved image matrix
    # ReLU activation
    nih = int(((ih-kh) / stride) + 1) # new image height # Padding is already added
    niw = int(((iw-kw) / stride) + 1) # new image width
    # First operate a reshape keeping spatial ordering, which has channels at the end
    output_temp = c_m.reshape(bs, nih, niw, kc)
    # Transpose to have input in shapes (batch, output_channel, height, width)
    reshaped_correct_order = output_temp.transpose(0,3,1,2).astype(np.float32)
    if bias.any() != 0:
        reshaped_correct_order = (reshaped_correct_order + bias.reshape(1,-1,1,1))
    if applyReLU:
        reshaped_correct_order = np.maximum(0,reshaped_correct_order)
    mask = np.copy(reshaped_correct_order)
    mask[mask>0]=1
    return reshaped_correct_order,mask



img = np.arange(1,2*3*3+1).reshape(1,2,3,3).astype(np.float32)
# print("-------img-------")
# print(img)
ker = np.arange(1,16+1).reshape(2,2,2,2)
# print("-------ker-------")
# print(ker)
bias = np.array([1,2]).reshape(2,1,1)
res,mask = Slow_ReLU_Conv(img,ker,bias,pad=0,stride=1)
print("-------Conv Slow-------")
print(res)
X_c,mask = Fast_ReLU_Conv(img,ker,bias,pad = 0,stride=1)
print("-------Conv Fast-------")
print(X_c)
res,mask = Slow_ReLU_Conv(res,ker,bias,pad=0,stride=1)
print("-------Conv Slow-------")
print(res)
X_c,mask = Fast_ReLU_Conv(X_c,ker,bias,pad = 0,stride=1)
print("-------Conv Fast-------")
print(X_c)

-------Conv Slow-------
[[[[ 357.  393.]
   [ 465.  501.]]

  [[ 838.  938.]
   [1138. 1238.]]]]
-------Conv Fast-------
[[[[ 357.  393.]
   [ 465.  501.]]

  [[ 838.  938.]
   [1138. 1238.]]]]
-------Conv Slow-------
[[[[32231.]]

  [[79176.]]]]
-------Conv Fast-------
[[[[32231.]]

  [[79176.]]]]


Dimensional reshape test to see if everything goes into the right place (it does)

In [21]:
import numpy as np
s = 1
p = 0
in_ch = 3
out_ch = 4
i_dim = 3
k_dim = 2
img = np.arange(1,i_dim*in_ch*i_dim+1).reshape(1,in_ch,i_dim,i_dim)
ker = np.arange(1,out_ch*k_dim*in_ch*k_dim+1).reshape(out_ch,in_ch,k_dim,k_dim)
d_img,mask = Fast_ReLU_Conv(img,ker,stride=s,pad=p)
print(d_img)
_,_,dimg_height,dimg_width = d_img.shape
windom_pimg = np.lib.stride_tricks.sliding_window_view(img,(1,1,dimg_height,dimg_width)).reshape(-1,dimg_height*dimg_width)
print(windom_pimg)
print(d_img.reshape(-1,dimg_height*dimg_width))
d_img = d_img.reshape(-1,dimg_height*dimg_width).transpose(1,0)
print(d_img)
iop = windom_pimg @ d_img
print(iop)
print(iop.transpose(1,0).reshape(out_ch,in_ch,k_dim,kdim))

[[[[1245. 1323.]
   [1479. 1557.]]

  [[2973. 3195.]
   [3639. 3861.]]

  [[4701. 5067.]
   [5799. 6165.]]

  [[6429. 6939.]
   [7959. 8469.]]]]
[[ 1  2  4  5]
 [ 2  3  5  6]
 [ 4  5  7  8]
 [ 5  6  8  9]
 [10 11 13 14]
 [11 12 14 15]
 [13 14 16 17]
 [14 15 17 18]
 [19 20 22 23]
 [20 21 23 24]
 [22 23 25 26]
 [23 24 26 27]]
[[1245. 1323. 1479. 1557.]
 [2973. 3195. 3639. 3861.]
 [4701. 5067. 5799. 6165.]
 [6429. 6939. 7959. 8469.]]
[[1245. 2973. 4701. 6429.]
 [1323. 3195. 5067. 6939.]
 [1479. 3639. 5799. 7959.]
 [1557. 3861. 6165. 8469.]]
[[ 17592.  43224.  68856.  94488.]
 [ 23196.  56892.  90588. 124284.]
 [ 34404.  84228. 134052. 183876.]
 [ 40008.  97896. 155784. 213672.]
 [ 68028. 166236. 264444. 362652.]
 [ 73632. 179904. 286176. 392448.]
 [ 84840. 207240. 329640. 452040.]
 [ 90444. 220908. 351372. 481836.]
 [118464. 289248. 460032. 630816.]
 [124068. 302916. 481764. 660612.]
 [135276. 330252. 525228. 720204.]
 [140880. 343920. 546960. 750000.]]
[[[[ 17592.  23196.]
   [ 34404.  4

### Fast Convolution Layer: Backward

In [None]:
def Fast_ReLU_Gradient(img,d_img,ker,mask,pad=0,stride=1):
    """
    NEW APPROACH !
    Performs the backward pass of the convolution layer. It takes the original image, 
    the gradient image, and then the kernel, padding and stride used in the convolution. Also the mask is needed to perform the ReLU operation.
    It returns the gradient w.r.t. the Original Image to back propagate and the gradient of the kernel
    """ 
    ############################################# Gradient of Input Image ####################################
    # The computation consists in a convolution where the image is the gradient of the output image delated (zeros between matrix elements) of stride-1
    # and padded of kernel-1 dimensions 
    # and the kernel 180 degrees rotation (flipped vertically and then horizontally)
    # FullConvolution(d_imgDelated, Rotated180Deg(kernel)) with stride 1
    out_ch, in_ch, k_height, k_width = ker.shape
    batch_s, in_ch, img_height, img_width = img.shape

    # backward ReLU
    d_img = np.multiply(d_img,mask)

    # Delating the gradient of output
    if stride == 2:
        d_img = dilateOne(d_img)
    elif stride > 2:
        raise ValueError(f"Stride greater than 2 is not acceptable")
    d_imgPadded = np.pad(d_img,((0,0),(0,0),(k_height-1-pad,k_height-1-pad),(k_width-1-pad,k_width-1-pad)))

    batch_s, out_ch, dimg_height, dimg_width = d_img.shape
    
    # flipping the kernel
    ker180 = np.rot90(ker,2,(-2,-1))

    window_dPad = np.lib.stride_tricks.sliding_window_view(d_imgPadded,(1,out_ch,k_width,k_height)).reshape(-1,(k_width*k_height*out_ch)) # window matrix
    
    # Convolution
    ker = ker180.reshape((-1,(k_width*k_height*out_ch))).transpose(1,0)

    gi = (window_dPad @ ker).astype(np.float32).transpose(1,0).reshape(img.shape)
    
    ############################################# Gradient of Kernel ####################################
    # The computation consists in a convolution between the original image and the delated gradient of the output image in order to
    # find the kernel
    imgPad = np.pad(img,((0,0),(0,0),(pad,pad),(pad,pad)))
    windom_pimg = np.lib.stride_tricks.sliding_window_view(imgPad,(1,1,dimg_height,dimg_width)).reshape(-1,dimg_height*dimg_width)
    d_img = d_img.reshape(-1,dimg_height*dimg_width).transpose(1,0)
    gk = (windom_pimg @ d_img).astype(np.float32).transpose(1,0).reshape(out_ch,in_ch,k_height,k_width)
    ############################################# Gradient of Bias ####################################
    # The computation consists in summing the gradient of the output image together to find the bias for every channel
    gb = d_img.sum((-1,-2)) # sum over height and width
    
    ################################################### Return Results ###############################################
    return gi,gk,gb
in_ch = 3
out_ch = 4
idim = 5
kdim = 3
s = 2
p = 0
imAge = np.arange(1,1*in_ch*idim*idim+1).reshape(1,in_ch,idim,idim)
kerNel = np.arange(1,out_ch*in_ch*(kdim**2)+1).reshape(out_ch,in_ch,kdim,kdim)
dimAge,mask = Fast_ReLU_Conv(imAge,kerNel,stride=s,pad=p) 
dimAge = dimAge/np.mean(dimAge)
print(f"dimAge: {dimAge.shape}")

ggi,ggk,ggb = Fast_ReLU_Gradient(imAge,dimAge,kerNel,mask,stride=s,pad=p)
print(f"imAge: {imAge.shape}")
print(f"kerNel: {kerNel.shape}")
print(f"dimAge: {dimAge.shape}")
print(f"ggi: {ggi.shape}")
print(f"ggk: {ggk.shape}")

# in_ch = 1
# out_ch = 2
# idim = 7
# kdim = 2
# s = 1
# p = 0

# imAge: (1, 1, 7, 7)
# kerNel: (2, 1, 2, 2)
# dimAge: (1, 2, 6, 6)
# ggi: (1, 1, 7, 7)
# ggk: (2, 1, 2, 2)

# in_ch = 1
# out_ch = 2
# idim = 7
# kdim = 2
# s = 2
# p = 1

# imAge: (1, 1, 7, 7)
# kerNel: (2, 1, 2, 2)
# dimAge: (1, 2, 4, 4)
# ggi: (1, 1, 7, 7)
# ggk: (2, 1, 2, 2)

dimAge: (1, 4, 2, 2)


NameError: name 'delateOne' is not defined

### MLP Layer: Forward

In [None]:
def softmax(x):
    e_x = np.exp(x-np.max(x,axis=-1,keepdims=True))  # for numerical stability
    return e_x / np.sum(e_x,axis=-1,keepdims=True)

def ReLU_SoftMax_FullyConnected(input_array,w1,b1,w2,b2):
    fl = (input_array @ w1)+b1 # first layer
    fa = np.maximum(0,fl) # first activation: ReLU
    sl = (fa @ w2)+b2 # second layer
    sa = softmax(sl) # second activation: SoftMax
    return fl,fa,sl,sa

[0.00235563 0.04731416 0.95033021]


### MLP Layer: Backward

In [None]:
def ReLU_SoftMax_FC_Backward(bs,pred,labels,w1,w2,fa,fl,i_mlp):
    dL_dz2 = pred-labels
    dL_dw2 = fa.T @ dL_dz2
    dL_db2 = np.sum(dL_dz2, axis=0)
    dL_dfa = dL_dz2 @ w2.T
    dReLU = (fl > 0).astype(float)
    dL_dfl = dL_dfa * dReLU
    dL_dw1 = i_mlp.reshape(bs, -1).T @ dL_dfl
    dL_db1 = np.sum(dL_dfl, axis=0)
    dL_i_mlp = dL_dfl @ w1.T
    return dL_i_mlp,dL_dw1,dL_db1,dL_dw2,dL_db2

### Loss Function: Categorical Cross-Entropy

In [None]:
def softmax(x):
    e_x = np.exp(x-np.max(x,axis=-1,keepdims=True))  # for numerical stability
    return e_x / np.sum(e_x,axis=-1,keepdims=True)
def crossEntropy(probabilities, true_labels_one_hot):
    # probabilities: (N, C), output della softmax
    # true_labels_one_hot: (N, C), etichette one-hot
    
    N = probabilities.shape[0] # Dimensione del batch
    epsilon = 1e-9 # Piccolo valore per stabilità

    # Calcola la cross-entropy per ogni campione
    # log_likelihood = -np.sum(true_labels_one_hot * np.log(probabilities + epsilon), axis=1)
    # Oppure, se true_labels_one_hot è veramente one-hot, puoi selezionare direttamente la probabilità della classe corretta
    # Questo è più efficiente se hai gli indici delle classi vere:
    # log_probs = np.log(probabilities + epsilon)
    # true_class_indices = np.argmax(true_labels_one_hot, axis=1)
    # log_likelihood = -log_probs[np.arange(N), true_class_indices]

    # Metodo generale con etichette one-hot:
    # Moltiplica elemento per elemento e somma lungo l'asse delle classi (axis=1)
    # Questo seleziona efficacemente log(p_k) per la classe corretta k, dato che t_j è 0 per j!=k e t_k=1
    log_likelihood_per_sample = -np.sum(true_labels_one_hot * np.log(probabilities + epsilon), axis=1)
    
    # La loss totale è solitamente la media delle loss per campione
    mean_loss = np.sum(log_likelihood_per_sample) / N
    
    # Se vuoi solo la somma (come fa PyTorch di default se reduction='sum')
    # sum_loss = np.sum(log_likelihood_per_sample)
    
    return mean_loss # o sum_loss, a seconda di cosa vuoi confrontare con PyTorch

#c = [1,1000000000000000,1,1]
#c = softmax(c)
#print(c)
#c = crossEntropy(c,[0,1,0,0])
#print(c)

## Inference

In this section the three implementations will be compared in terms of time. Recall that all the predictions should be the same since the weights are the same.

In [None]:
import time
from tqdm import tqdm

np_k1 = numpy_weights['k1'].astype(np.float32)
np_b_conv1 = numpy_weights['b_conv1'].astype(np.float32)
np_k2 = numpy_weights['k2'].astype(np.float32)
np_b_conv2 = numpy_weights['b_conv2'].astype(np.float32)
np_k3 = numpy_weights['k3'].astype(np.float32)
np_b_conv3 = numpy_weights['b_conv3'].astype(np.float32)
np_w1 = numpy_weights['w1'].astype(np.float32)
np_b1 = numpy_weights['b1'].astype(np.float32)
np_w2 = numpy_weights['w2'].astype(np.float32)
np_b2 = numpy_weights['b2'].astype(np.float32)

dict_times={}
dict_times["ctorch"]=[]
dict_times["cslow"]=[]
dict_times["cfast"]=[]

dict_pred={}
dict_pred["ctorch"]=[]
dict_pred["cslow"]=[]
dict_pred["cfast"]=[]

#length = test_labels.shape[0]
# length = 100
length = 1
correct = 0
skip = True
step = 2
count = 0
loop = tqdm(range(0,length,step),desc=" Inferring...")
for i in loop:
    c0 = test_images[i:i+step].reshape(step,1,28,28).astype(np.float32)
    torch_c0 = torch.from_numpy(c0).float()
    ############### CNN PyTorch Implementation ##################
    start_time = time.time()
    outputs = model(torch_c0)
    end_time = time.time()
    _, predicted1 = torch.max(outputs.data, 1)
    dict_times["ctorch"].append(end_time-start_time)
    dict_pred["ctorch"].append(np.array(predicted1))
    ############### CNN Slow Implementation #####################
    start_time = time.time()
    c1s,mask1s = Slow_ReLU_Conv(c0.astype(np.float32),np_k1,np_b_conv1,pad=0,stride=2)
    c2s,mask2s = Slow_ReLU_Conv(c1s.astype(np.float32),np_k2,np_b_conv2,pad=1,stride=2)
    c3s,mask3s = Slow_ReLU_Conv(c2s.astype(np.float32),np_k3,np_b_conv3,pad=0,stride=2)
    imlps = c3s.reshape(step,-1)
    _,_,_,res = ReLU_SoftMax_FullyConnected(imlps,np_w1,np_b1,np_w2,np_b2)
    predicted2 = np.argmax(res,1)
    end_time = time.time()
    dict_times["cslow"].append(end_time-start_time)
    dict_pred["cslow"].append(np.array(predicted2))
    ############### CNN Fast Implementation #####################
    start_time = time.time()
    c1f,mask1f = Fast_ReLU_Conv(c0.astype(np.float32),np_k1,np_b_conv1,pad=0,stride=2)
    c2f,mask2f = Fast_ReLU_Conv(c1f.astype(np.float32),np_k2,np_b_conv2,pad=1,stride=2)
    c3f,mask3f = Fast_ReLU_Conv(c2f.astype(np.float32),np_k3,np_b_conv3,pad=0,stride=2)
    imlpf = c3f.reshape(step,-1)
    _,_,_,res = ReLU_SoftMax_FullyConnected(imlpf,np_w1,np_b1,np_w2,np_b2)
    predicted3 = np.argmax(res,1)
    end_time = time.time()
    dict_times["cfast"].append(end_time-start_time)
    dict_pred["cfast"].append(np.array(predicted3))
    #####################################################################################
    #### Check that outputs of Slow Approach and Fast Approach have the same results ###
    t = np.array(predicted1)
    s = np.array(predicted2)
    f = np.array(predicted3)
    if t.all() == s.all() and t.all() == f.all():
        correct+=1
    count+=1
    #####################################################################################
    ### Keep track of the times #########################################################
    tat = round(sum(dict_times['ctorch'])/(i+1),4)
    sat = round(sum(dict_times['cslow'])/(i+1),4)
    fat = round(sum(dict_times['cfast'])/(i+1),4)
    loop.set_postfix(average_times =f"t: {tat} s, s: {sat} s, f: {fat} s" , correct_predictions=f"{100*correct/count}%")
tat = round(sum(dict_times['ctorch'])/length,4)
sat = round(sum(dict_times['cslow'])/length,4)
fat = round(sum(dict_times['cfast'])/length,4)
print(f"Average forward execution time in seconds: \nPyTorch: {tat} s, \nSlow: {sat} s, \nFast: {fat} s")
print("Verifica pesi conv1:")
print("  np_k1 vs model.conv1.weight:", np.allclose(np_k1, model.conv1.weight.data.cpu().numpy()))
print("  np_bc1 vs model.conv1.bias:",  np.allclose(np_b_conv1, model.conv1.bias.data.cpu().numpy()))
print("  np_k1 vs model.conv1.weight:", np.allclose(np_k2, model.conv2.weight.data.cpu().numpy()))
print("  np_bc1 vs model.conv1.bias:",  np.allclose(np_b_conv2, model.conv2.bias.data.cpu().numpy()))
print("  np_k1 vs model.conv1.weight:", np.allclose(np_k3, model.conv3.weight.data.cpu().numpy()))
print("  np_bc1 vs model.conv1.bias:",  np.allclose(np_b_conv3, model.conv3.bias.data.cpu().numpy()))
print("  np_w1.T vs model.fc1.weight:", np.allclose(np_w1.T, model.fc1.weight.data.cpu().numpy()))
print("  np_b1 vs model.fc1.bias:",     np.allclose(np_b1.reshape(-1), model.fc1.bias.data.cpu().numpy()))
print("  np_w1.T vs model.fc1.weight:", np.allclose(np_w2.T, model.fc2.weight.data.cpu().numpy()))
print("  np_b1 vs model.fc1.bias:",     np.allclose(np_b2.reshape(-1), model.fc2.bias.data.cpu().numpy()))

 Inferring...: 100%|██████████| 1/1 [00:05<00:00,  5.67s/it, average_times=t: 0.0027 s, s: 5.6619 s, f: 0.001 s, correct_predictions=100.0%]

Average forward execution time in seconds: 
PyTorch: 0.0027 s, 
Slow: 5.6619 s, 
Fast: 0.001 s
Verifica pesi conv1:
  np_k1 vs model.conv1.weight: True
  np_bc1 vs model.conv1.bias: True
  np_k1 vs model.conv1.weight: True
  np_bc1 vs model.conv1.bias: True
  np_k1 vs model.conv1.weight: True
  np_bc1 vs model.conv1.bias: True
  np_w1.T vs model.fc1.weight: True
  np_b1 vs model.fc1.bias: True
  np_w1.T vs model.fc1.weight: True
  np_b1 vs model.fc1.bias: True





## Training

### Test for Slow approach

In this panel the approach is tested to see if it learns or not. the test uses first just one image, then the first 100 for each eopch, in order to see if the loss descends during the training

#### Weights Initialization

In [None]:
k1 = np.random.rand(int(numpy_weights['k1'].flatten().shape[0])).reshape(numpy_weights['k1'].shape)
bc1 = np.random.rand(int(numpy_weights['b_conv1'].flatten().shape[0])).reshape(numpy_weights['b_conv1'].shape)
k2 = np.random.rand(int(numpy_weights['k2'].flatten().shape[0])).reshape(numpy_weights['k2'].shape)
bc2 = np.random.rand(int(numpy_weights['b_conv2'].flatten().shape[0])).reshape(numpy_weights['b_conv2'].shape)
k3 = np.random.rand(int(numpy_weights['k3'].flatten().shape[0])).reshape(numpy_weights['k3'].shape)
bc3 = np.random.rand(int(numpy_weights['b_conv3'].flatten().shape[0])).reshape(numpy_weights['b_conv3'].shape)
w1 = np.random.rand(int(numpy_weights['w1'].flatten().shape[0])).reshape(numpy_weights['w1'].shape)
b1 = np.random.rand(int(numpy_weights['b1'].flatten().shape[0])).reshape(numpy_weights['b1'].shape)
w2 = np.random.rand(int(numpy_weights['w2'].flatten().shape[0])).reshape(numpy_weights['w2'].shape)
b2 = np.random.rand(int(numpy_weights['b2'].flatten().shape[0])).reshape(numpy_weights['b2'].shape)

In [None]:
def avgList(listA):
    sum_li = sum(listA)
    length_li = len(listA)
    return round(sum_li/length_li,4)

#### Same Image

In [None]:
import matplotlib.pyplot as plt
ToBeTrained = True
if ToBeTrained:
    avg_loss = []
    forward_time = []
    backward_time = []
    numEpochs = 20
    bs = 1
    lr = 0.001
    loop = tqdm(range(numEpochs))
    for i in loop:
        c0 = train_images[0].reshape(1,1,28,28).astype(np.float32)
        
        # Forward
        sfts = time.time() # slow forward time start
        c1s,mask1s = Slow_ReLU_Conv(c0.astype(np.float32),k1,bc1,pad=0,stride=2)
        c2s,mask2s = Slow_ReLU_Conv(c1s.astype(np.float32),k2,bc2,pad=1,stride=2)
        c3s,mask3s = Slow_ReLU_Conv(c2s.astype(np.float32),k3,bc3,pad=0,stride=2)

        imlps = c3s.reshape(1,-1)
        fl,fa,sl,sa = ReLU_SoftMax_FullyConnected(imlps,w1,b1,w2,b2)
        sfte = time.time() # slow forward time end
        sft = sfte - sfts
        forward_time.append(sft)
        
        # Loss
        loss = crossEntropy(sa,train_labels[0])
        avg_loss.append(loss)

        # Backward
        sbts = time.time() # slow backward time start
        dL_i_mlp,dL_dw1,dL_db1,dL_dw2,dL_db2 = ReLU_SoftMax_FC_Backward(bs,sa,train_labels[0],w1,w2,fa,fl,imlps)
        dL_i_mlp = dL_i_mlp.reshape(c3s.shape)

        gi3,gk3,gb3 = Slow_ReLU_Gradient(c2s,dL_i_mlp,k3,mask3s,pad=0,stride=2)

        gi2,gk2,gb2 = Slow_ReLU_Gradient(c1s,gi3,k2,mask2s,pad=1,stride=2)
        gi1,gk1,gb1 = Slow_ReLU_Gradient(c0,gi2,k1,mask1s,pad=0,stride=2)
        sbte = time.time() # slow backward time end
        sbt = sbte - sbts
        backward_time.append(sbt)

        # Weights update
        w1 -= lr*dL_dw1
        b1 -= lr*dL_db1
        w2 -= lr*dL_dw2
        b2 -= lr*dL_db2
        k3 -= lr*gk3
        k2 -= lr*gk2
        k1 -= lr*gk1
        bc3 -= lr*gb3.reshape(-1)
        bc2 -= lr*gb2.reshape(-1)
        bc1 -= lr*gb1.reshape(-1)
        
        if len(avg_loss) >= 2:
            loop.set_postfix(pendence=f" {avg_loss[i]-avg_loss[i-1]}",avgForward=f"{avgList(forward_time)} s", avgBackward=f"{avgList(backward_time)} s" )

    plt.plot(avg_loss)
    plt.show()
# 2.64135 <-> 2.64095
# 2.64055 <-> 2.64020
# 2.64015 <-> 2.63980
# 2.63910 <-> 2.63840

These are the results for 20 epochs of one image:
- average forward time : 3.6265 s
- average backward time : 9.8262 s

Plot of the loss:

<img src="IMAGES\Slow Approach.png">


### PyTorch Comparison

#### Weights initialization

In [127]:
k1 = np.random.rand(int(numpy_weights['k1'].flatten().shape[0])).reshape(numpy_weights['k1'].shape)
bc1 = np.random.rand(int(numpy_weights['b_conv1'].flatten().shape[0])).reshape(numpy_weights['b_conv1'].shape)
k2 = np.random.rand(int(numpy_weights['k2'].flatten().shape[0])).reshape(numpy_weights['k2'].shape)
bc2 = np.random.rand(int(numpy_weights['b_conv2'].flatten().shape[0])).reshape(numpy_weights['b_conv2'].shape)
k3 = np.random.rand(int(numpy_weights['k3'].flatten().shape[0])).reshape(numpy_weights['k3'].shape)
bc3 = np.random.rand(int(numpy_weights['b_conv3'].flatten().shape[0])).reshape(numpy_weights['b_conv3'].shape)
w1 = np.random.rand(int(numpy_weights['w1'].flatten().shape[0])).reshape(numpy_weights['w1'].shape)
b1 = np.random.rand(int(numpy_weights['b1'].flatten().shape[0])).reshape(numpy_weights['b1'].shape)
w2 = np.random.rand(int(numpy_weights['w2'].flatten().shape[0])).reshape(numpy_weights['w2'].shape)
b2 = np.random.rand(int(numpy_weights['b2'].flatten().shape[0])).reshape(numpy_weights['b2'].shape)
p_k1 = np.copy(k1)
p_bc1 = np.copy(bc1)
p_k2 = np.copy(k2)
p_bc2 = np.copy(bc2)
p_k3 = np.copy(k3)
p_bc3 = np.copy(bc3)
p_w1 = np.copy(w1)
p_b1 = np.copy(b1)
p_w2 = np.copy(w2)
p_b2 = np.copy(b2)
modelComp = SimpleCNN()
with torch.no_grad():
    modelComp.conv1.weight.copy_(torch.from_numpy(p_k1))
    modelComp.conv1.bias.copy_(torch.from_numpy(p_bc1))
    modelComp.conv2.weight.copy_(torch.from_numpy(p_k2))
    modelComp.conv2.bias.copy_(torch.from_numpy(p_bc2))
    modelComp.conv3.weight.copy_(torch.from_numpy(p_k3))
    modelComp.conv3.bias.copy_(torch.from_numpy(p_bc3))
    modelComp.fc1.weight.copy_(torch.from_numpy(p_w1.T))
    modelComp.fc1.bias.copy_(torch.from_numpy(p_b1).reshape(-1))
    modelComp.fc2.weight.copy_(torch.from_numpy(p_w2.T))
    modelComp.fc2.bias.copy_(torch.from_numpy(p_b2).reshape(-1))

print("Verifica pesi conv1:")
print("  p_k1 vs modelComp.conv1.weight:", np.allclose(k1, modelComp.conv1.weight.data.cpu().numpy()))
print("  p_bc1 vs modelComp.conv1.bias:", np.allclose(bc1, modelComp.conv1.bias.data.cpu().numpy()))
print("Verifica pesi conv2:")
print("  p_k1 vs modelComp.conv1.weight:", np.allclose(k2, modelComp.conv2.weight.data.cpu().numpy()))
print("  p_bc1 vs modelComp.conv1.bias:", np.allclose(bc2, modelComp.conv2.bias.data.cpu().numpy()))
print("Verifica pesi conv3:")
print("  p_k1 vs modelComp.conv1.weight:", np.allclose(k3, modelComp.conv3.weight.data.cpu().numpy()))
print("  p_bc1 vs modelComp.conv1.bias:", np.allclose(bc3, modelComp.conv3.bias.data.cpu().numpy()))
print("Verifica pesi fc1:")
print("  p_w1.T vs modelComp.fc1.weight:", np.allclose(w1.T, modelComp.fc1.weight.data.cpu().numpy()))
print("  p_b1 vs modelComp.fc1.bias:", np.allclose(b1.reshape(-1), modelComp.fc1.bias.data.cpu().numpy()))
print("Verifica pesi fc2:")
print("  p_w1.T vs modelComp.fc1.weight:", np.allclose(w2.T, modelComp.fc2.weight.data.cpu().numpy()))
print("  p_b1 vs modelComp.fc1.bias:", np.allclose(b2.reshape(-1), modelComp.fc2.bias.data.cpu().numpy()))


Verifica pesi conv1:
  p_k1 vs modelComp.conv1.weight: True
  p_bc1 vs modelComp.conv1.bias: True
Verifica pesi conv2:
  p_k1 vs modelComp.conv1.weight: True
  p_bc1 vs modelComp.conv1.bias: True
Verifica pesi conv3:
  p_k1 vs modelComp.conv1.weight: True
  p_bc1 vs modelComp.conv1.bias: True
Verifica pesi fc1:
  p_w1.T vs modelComp.fc1.weight: True
  p_b1 vs modelComp.fc1.bias: True
Verifica pesi fc2:
  p_w1.T vs modelComp.fc1.weight: True
  p_b1 vs modelComp.fc1.bias: True


In [192]:
def softmax(x):
    e_x = np.exp(x-np.max(x,axis=-1,keepdims=True))  # for numerical stability
    return e_x / np.sum(e_x,axis=-1,keepdims=True)

def ReLU_SoftMax_FullyConnected(input_array,w1,b1,w2,b2):
    fl = (input_array @ w1)+b1 # first layer
    fa = np.maximum(0,fl) # first activation: ReLU
    sl = (fa @ w2)+b2 # second layer
    sa = softmax(sl) # second activation: SoftMax
    return fl,fa,sl,sa

def ReLU_SoftMax_FC_Backward(bs,pred,labels,w1,w2,fa,fl,i_mlp):
    dL_dz2 = pred-labels[0:bs]
    dL_dw2 = fa.T @ dL_dz2
    dL_db2 = np.sum(dL_dz2, axis=0)
    dL_dfa = dL_dz2 @ w2.T
    dReLU = (fl > 0).astype(float)
    dL_dfl = dL_dfa * dReLU
    dL_dw1 = i_mlp.reshape(bs, -1).T @ dL_dfl
    dL_db1 = np.sum(dL_dfl, axis=0)
    dL_i_mlp = dL_dfl @ w1.T
    return dL_i_mlp,dL_dw1,dL_db1,dL_dw2,dL_db2

def log_softmax_numpy(x):
    # x sono i logit, shape (N, C)
    max_x = np.max(x, axis=-1, keepdims=True)
    stable_x = x - max_x
    # Aggiungi epsilon a np.sum per evitare log(0) se tutti gli exp sono 0 (improbabile con max sottratto)
    log_sum_exp = np.log(np.sum(np.exp(stable_x), axis=-1, keepdims=True) + 1e-9) 
    return stable_x - log_sum_exp

def nll_loss_numpy(log_probs, true_class_indices_flat):
    # log_probs: output di log_softmax_numpy, shape (N, C)
    # true_class_indices_flat: indici delle classi vere, shape (N,) es. np.array([idx1, idx2...])
    N = log_probs.shape[0]
    # Assicurati che true_class_indices_flat sia un array di interi per l'indicizzazione
    return -log_probs[np.arange(N), true_class_indices_flat.astype(int)].mean()

k1  = (np.random.rand(int(numpy_weights['k1'].flatten().shape[0])).reshape(numpy_weights['k1'].shape)-0.5).astype(np.float32)
bc1 = (np.random.rand(int(numpy_weights['b_conv1'].flatten().shape[0])).reshape(numpy_weights['b_conv1'].shape)-0.5).astype(np.float32)
k2  = (np.random.rand(int(numpy_weights['k2'].flatten().shape[0])).reshape(numpy_weights['k2'].shape)-0.5).astype(np.float32)
bc2 = (np.random.rand(int(numpy_weights['b_conv2'].flatten().shape[0])).reshape(numpy_weights['b_conv2'].shape)-0.5).astype(np.float32)
k3  = (np.random.rand(int(numpy_weights['k3'].flatten().shape[0])).reshape(numpy_weights['k3'].shape)-0.5).astype(np.float32)
bc3 = (np.random.rand(int(numpy_weights['b_conv3'].flatten().shape[0])).reshape(numpy_weights['b_conv3'].shape)-0.5).astype(np.float32)
w1  = (np.random.rand(int(numpy_weights['w1'].flatten().shape[0])).reshape(numpy_weights['w1'].shape)-0.5).astype(np.float32)
b1  = (np.random.rand(int(numpy_weights['b1'].flatten().shape[0])).reshape(numpy_weights['b1'].shape)-0.5).astype(np.float32)
w2  = (np.random.rand(int(numpy_weights['w2'].flatten().shape[0])).reshape(numpy_weights['w2'].shape)-0.5).astype(np.float32)
b2  = (np.random.rand(int(numpy_weights['b2'].flatten().shape[0])).reshape(numpy_weights['b2'].shape)-0.5).astype(np.float32)
p_k1 = np.copy(k1)
p_bc1 = np.copy(bc1)
p_k2 = np.copy(k2)
p_bc2 = np.copy(bc2)
p_k3 = np.copy(k3)
p_bc3 = np.copy(bc3)
p_w1 = np.copy(w1)
p_b1 = np.copy(b1)
p_w2 = np.copy(w2)
p_b2 = np.copy(b2)
modelComp = SimpleCNN()
modelComp = modelComp.float()
with torch.no_grad():
    modelComp.conv1.weight.copy_(torch.from_numpy(p_k1))
    modelComp.conv1.bias.copy_(torch.from_numpy(p_bc1))
    modelComp.conv2.weight.copy_(torch.from_numpy(p_k2))
    modelComp.conv2.bias.copy_(torch.from_numpy(p_bc2))
    modelComp.conv3.weight.copy_(torch.from_numpy(p_k3))
    modelComp.conv3.bias.copy_(torch.from_numpy(p_bc3))
    modelComp.fc1.weight.copy_(torch.from_numpy(p_w1.T))
    modelComp.fc1.bias.copy_(torch.from_numpy(p_b1).reshape(-1))
    modelComp.fc2.weight.copy_(torch.from_numpy(p_w2.T))
    modelComp.fc2.bias.copy_(torch.from_numpy(p_b2).reshape(-1))

# Loss definition
criterion = nn.CrossEntropyLoss() 

# Optimisation definition
lr = learning_rate = 0.001
# SOSTITUISCI ADAM CON SGD
optimizer = torch.optim.SGD(
    modelComp.parameters(),
    lr=learning_rate,
    momentum=0,
    dampening=0,
    weight_decay=0,
    nesterov=False
)

n_epochs = 2

for i in range(n_epochs):

    c0 = (train_images[0].reshape(1,1,28,28)/255).astype(np.float32)
    l0 = train_labels[0]
    torch_c0 = torch.from_numpy(c0).float()
    torch_l0 = torch.from_numpy(np.array([np.argmax(l0)])).long()
    #########################################################################
    ########################### ColFast #####################################
    #########################################################################
        
    c1f,mask1f = Fast_ReLU_Conv(c0.astype(np.float32),k1,bc1,pad=0,stride=2)
    c2f,mask2f = Fast_ReLU_Conv(c1f.astype(np.float32),k2,bc2,pad=1,stride=2)
    c3f,mask3f = Fast_ReLU_Conv(c2f.astype(np.float32),k3,bc3,pad=0,stride=2)

    imlpf = c3f.reshape(1,-1)
    fl_f,fa_f,sl_f,sa_f = ReLU_SoftMax_FullyConnected(imlpf,w1,b1,w2,b2)
    print(sl_f)
    # Loss
    lossCF = crossEntropy(sa_f,l0)

    #########################################################################
    ########################### ColSlow #####################################
    #########################################################################

    c1s,mask1s = Slow_ReLU_Conv(c0.astype(np.float32),k1,bc1,pad=0,stride=2)
    c2s,mask2s = Slow_ReLU_Conv(c1s.astype(np.float32),k2,bc2,pad=1,stride=2)
    c3s,mask3s = Slow_ReLU_Conv(c2s.astype(np.float32),k3,bc3,pad=0,stride=2)

    imlps = c3s.reshape(1,-1)
    fl_s,fa_s,sl_s,sa_s = ReLU_SoftMax_FullyConnected(imlps,w1,b1,w2,b2)
    print(sl_f)
    # Loss
    lossCS = crossEntropy(sa_s,l0)

    #########################################################################
    ########################### PyTorch #####################################
    #########################################################################

    # make all gradients zero to avoid learning on gradients of previous steps
    optimizer.zero_grad()
    
    # Forward pass
    outputs = modelComp(torch_c0)
    print(outputs)

    print("Verifica pesi conv1:")
    print("  p_k1 vs modelComp.conv1.weight:", np.allclose(k1, modelComp.conv1.weight.data.cpu().numpy()))
    print("  p_bc1 vs modelComp.conv1.bias:", np.allclose(bc1, modelComp.conv1.bias.data.cpu().numpy()))
    print("Verifica pesi conv2:")
    print("  p_k1 vs modelComp.conv1.weight:", np.allclose(k2, modelComp.conv2.weight.data.cpu().numpy()))
    print("  p_bc1 vs modelComp.conv1.bias:", np.allclose(bc2, modelComp.conv2.bias.data.cpu().numpy()))
    print("Verifica pesi conv3:")
    print("  p_k1 vs modelComp.conv1.weight:", np.allclose(k3, modelComp.conv3.weight.data.cpu().numpy()))
    print("  p_bc1 vs modelComp.conv1.bias:", np.allclose(bc3, modelComp.conv3.bias.data.cpu().numpy()))
    print("Verifica pesi fc1:")
    print("  p_w1.T vs modelComp.fc1.weight:", np.allclose(w1.T, modelComp.fc1.weight.data.cpu().numpy()))
    print("  p_b1 vs modelComp.fc1.bias:", np.allclose(b1.reshape(-1), modelComp.fc1.bias.data.cpu().numpy()))
    print("Verifica pesi fc2:")
    print("  p_w1.T vs modelComp.fc1.weight:", np.allclose(w2.T, modelComp.fc2.weight.data.cpu().numpy()))
    print("  p_b1 vs modelComp.fc1.bias:", np.allclose(b2.reshape(-1), modelComp.fc2.bias.data.cpu().numpy()))

    l0_idx_flat = np.array([np.argmax(l0)]) # Shape (1,)

    # Per ColFast (dopo aver calcolato sl_f)
    log_probs_f = log_softmax_numpy(sl_f)
    lossCF = nll_loss_numpy(log_probs_f, l0_idx_flat)
    # print(f"Loss ColFast (NLL): {lossCF}") # Debug

    # Per ColSlow (dopo aver calcolato sl_s)
    log_probs_s = log_softmax_numpy(sl_s)
    lossCS = nll_loss_numpy(log_probs_s, l0_idx_flat)

    loss = criterion(outputs, torch_l0)

    print(f"Losses: p:{loss} cf:{lossCF} cs:{lossCS}")

    loss.backward()
    optimizer.step()

    dL_i_mlp,dL_dw1,dL_db1,dL_dw2,dL_db2 = ReLU_SoftMax_FC_Backward(1,sa_s,train_labels[0],w1,w2,fa_s,fl_s,imlps)
    dL_i_mlp = dL_i_mlp.reshape(c3s.shape)

    gi3,gk3,gb3 = Slow_ReLU_Gradient(c2s,dL_i_mlp,k3,mask3s,pad_fwd=0,stride_fwd=2)

    gi2,gk2,gb2 = Slow_ReLU_Gradient(c1s,gi3,k2,mask2s,pad_fwd=1,stride_fwd=2)
    gi1,gk1,gb1 = Slow_ReLU_Gradient(c0,gi2,k1,mask1s,pad_fwd=0,stride_fwd=2)


    print("\n--- CONFRONTO GRADIENTI PRIMA EPOCA (dopo backward, prima di update) ---")

    # Layer FC2
    grad_w2_torch = modelComp.fc2.weight.grad.data.cpu().numpy()
    grad_b2_torch = modelComp.fc2.bias.grad.data.cpu().numpy()
    print(f"FC2 w2 close: {np.allclose(grad_w2_torch, dL_dw2.T, atol=1e-5)}") # dL_dw2 è (F_mid, C_out), PyTorch è (C_out, F_mid)
    print(f"FC2 b2 close: {np.allclose(grad_b2_torch, dL_db2.reshape(-1), atol=1e-5)}")
    if not np.allclose(grad_w2_torch, dL_dw2.T, atol=1e-5):
        print("PyTorch fc2.weight.grad (sample):\n", grad_w2_torch)
        print("NumPy dL_dw2.T (sample):\n", dL_dw2.T)

    # Layer FC1
    grad_w1_torch = modelComp.fc1.weight.grad.data.cpu().numpy()
    grad_b1_torch = modelComp.fc1.bias.grad.data.cpu().numpy()
    print(f"FC1 w1 close: {np.allclose(grad_w1_torch, dL_dw1.T, atol=1e-5)}")
    print(f"FC1 b1 close: {np.allclose(grad_b1_torch, dL_db1.reshape(-1), atol=1e-5)}")
    if not np.allclose(grad_w1_torch, dL_dw1.T, atol=1e-5):
        print("PyTorch fc1.weight.grad (sample):\n", grad_w1_torch)
        print("NumPy dL_dw1.T (sample):\n", dL_dw1.T)

    # Layer Conv3
    grad_k3_torch = modelComp.conv3.weight.grad.data.cpu().numpy()
    grad_bc3_torch = modelComp.conv3.bias.grad.data.cpu().numpy()
    print(f"Conv3 k3 close: {np.allclose(grad_k3_torch, gk3, atol=1e-5)}")
    print(f"Conv3 bc3 close: {np.allclose(grad_bc3_torch, gb3.reshape(-1), atol=1e-5)}")
    if not np.allclose(grad_k3_torch, gk3, atol=1e-5):
        print("PyTorch conv3.weight.grad (sample):\n", grad_k3_torch)
        print("NumPy gk3 (sample):\n", gk3)

    # Layer Conv2
    grad_k2_torch = modelComp.conv2.weight.grad.data.cpu().numpy()
    grad_bc2_torch = modelComp.conv2.bias.grad.data.cpu().numpy()
    print(f"Conv2 k2 close: {np.allclose(grad_k2_torch, gk2, atol=1e-5)}")
    print(f"Conv2 bc2 close: {np.allclose(grad_bc2_torch, gb2.reshape(-1), atol=1e-5)}")

    # Layer Conv1
    grad_k1_torch = modelComp.conv1.weight.grad.data.cpu().numpy()
    grad_bc1_torch = modelComp.conv1.bias.grad.data.cpu().numpy()
    print(f"Conv1 k1 close: {np.allclose(grad_k1_torch, gk1, atol=1e-5)}")
    print(f"Conv1 bc1 close: {np.allclose(grad_bc1_torch, gb1.reshape(-1), atol=1e-5)}")

    # Weights update
    w1 -= lr*dL_dw1
    b1 -= lr*dL_db1
    w2 -= lr*dL_dw2
    b2 -= lr*dL_db2
    k3 -= lr*gk3
    k2 -= lr*gk2
    k1 -= lr*gk1
    bc3 -= lr*gb3.reshape(-1)
    bc2 -= lr*gb2.reshape(-1)
    bc1 -= lr*gb1.reshape(-1)

[[ -32.344532   -24.180666   242.01834      3.14395    -97.3935
    -1.7314793 -150.55801    174.62186     47.483128   -12.162725 ]]
[[ -32.344532   -24.180666   242.01834      3.14395    -97.3935
    -1.7314793 -150.55801    174.62186     47.483128   -12.162725 ]]
tensor([[ -32.3445,  -24.1806,  242.0184,    3.1440,  -97.3935,   -1.7315,
         -150.5579,  174.6218,   47.4831,  -12.1627]],
       grad_fn=<AddmmBackward0>)
Verifica pesi conv1:
  p_k1 vs modelComp.conv1.weight: True
  p_bc1 vs modelComp.conv1.bias: True
Verifica pesi conv2:
  p_k1 vs modelComp.conv1.weight: True
  p_bc1 vs modelComp.conv1.bias: True
Verifica pesi conv3:
  p_k1 vs modelComp.conv1.weight: True
  p_bc1 vs modelComp.conv1.bias: True
Verifica pesi fc1:
  p_w1.T vs modelComp.fc1.weight: True
  p_b1 vs modelComp.fc1.bias: True
Verifica pesi fc2:
  p_w1.T vs modelComp.fc1.weight: True
  p_b1 vs modelComp.fc1.bias: True
Losses: p:243.74984741210938 cf:243.74981689453125 cs:243.7498321533203

--- CONFRONTO GRAD

### Test for Fast approach

In this panel the approach is tested to see if it learns or not. the test uses first just one image, then the first 100 for each eopch, in order to see if the loss descends during the training

#### Weights Initialization

In [None]:
k1 = np.random.rand(int(numpy_weights['k1'].flatten().shape[0])).reshape(numpy_weights['k1'].shape)
bc1 = np.random.rand(int(numpy_weights['b_conv1'].flatten().shape[0])).reshape(numpy_weights['b_conv1'].shape)
k2 = np.random.rand(int(numpy_weights['k2'].flatten().shape[0])).reshape(numpy_weights['k2'].shape)
bc2 = np.random.rand(int(numpy_weights['b_conv2'].flatten().shape[0])).reshape(numpy_weights['b_conv2'].shape)
k3 = np.random.rand(int(numpy_weights['k3'].flatten().shape[0])).reshape(numpy_weights['k3'].shape)
bc3 = np.random.rand(int(numpy_weights['b_conv3'].flatten().shape[0])).reshape(numpy_weights['b_conv3'].shape)
w1 = np.random.rand(int(numpy_weights['w1'].flatten().shape[0])).reshape(numpy_weights['w1'].shape)
b1 = np.random.rand(int(numpy_weights['b1'].flatten().shape[0])).reshape(numpy_weights['b1'].shape)
w2 = np.random.rand(int(numpy_weights['w2'].flatten().shape[0])).reshape(numpy_weights['w2'].shape)
b2 = np.random.rand(int(numpy_weights['b2'].flatten().shape[0])).reshape(numpy_weights['b2'].shape)

In [None]:
def avgList(listA):
    sum_li = sum(listA)
    length_li = len(listA)
    return round(sum_li/length_li,4)

#### Same Image

In [None]:
import matplotlib.pyplot as plt
avg_loss = []
forward_time = []
backward_time = []
numEpochs = 20
bs = 1
lr = 0.001
loop = tqdm(range(numEpochs))
for i in loop:
    c0 = train_images[0].reshape(1,1,28,28).astype(np.float32)
    
    # Forward
    sfts = time.time() # slow forward time start
    c1s,mask1s = Fast_ReLU_Conv(c0.astype(np.float32),k1,bc1,pad=0,stride=2)
    c2s,mask2s = Fast_ReLU_Conv(c1s.astype(np.float32),k2,bc2,pad=1,stride=2)
    c3s,mask3s = Fast_ReLU_Conv(c2s.astype(np.float32),k3,bc3,pad=0,stride=2)
    imlps = c3s.reshape(1,-1)
    fl,fa,sl,sa = ReLU_SoftMax_FullyConnected(imlps,w1,b1,w2,b2)
    sfte = time.time() # slow forward time end
    sft = sfte - sfts
    forward_time.append(sft)
    
    # Loss
    loss = crossEntropy(sa,train_labels[0])
    avg_loss.append(loss)

    # Backward
    sbts = time.time() # slow backward time start
    dL_i_mlp,dL_dw1,dL_db1,dL_dw2,dL_db2 = ReLU_SoftMax_FC_Backward(bs,sa,train_labels[0],w1,w2,fa,fl,imlps)
    dL_i_mlp = dL_i_mlp.reshape(c3s.shape)

    gi3,gk3,gb3 = Fast_ReLU_Gradient(c2s,dL_i_mlp,k3,mask3s,pad=0,stride=2)
    gi2,gk2,gb2 = Fast_ReLU_Gradient(c1s,gi3,k2,mask2s,pad=1,stride=2)
    gi1,gk1,gb1 = Fast_ReLU_Gradient(c0,gi2,k1,mask1s,pad=0,stride=2)
    sbte = time.time() # slow backward time end
    sbt = sbte - sbts
    backward_time.append(sbt)

    # Weights update
    w1 -= lr*dL_dw1
    b1 -= lr*dL_db1
    w2 -= lr*dL_dw2
    b2 -= lr*dL_db2
    k3 -= lr*gk3
    k2 -= lr*gk2
    k1 -= lr*gk1
    bc3 -= lr*gb3
    bc2 -= lr*gb2
    bc1 -= lr*gb1
    
    if len(avg_loss) > 2:
        loop.set_postfix(pendence=f" {avg_loss[i]-avg_loss[i-1]}",avgForward=f"{avgList(forward_time)} s", avgBackward=f"{avgList(backward_time)} s" )

plt.plot(avg_loss)
plt.show()

These are the results for 20 epochs of one image:
- average forward time : 0.0022 s
- average backward time : 0.0097 s

Plot of the loss:

<img src="IMAGES\Fast Approach.png">


# TEST

In [24]:
import numpy as np
import torch
import torch.nn.functional as F

# --- Parametri del Test ---
N = 1  # Batch size
C_in = 1 # Canali di input
H_in = 3 # Altezza input
W_in = 3 # Larghezza input

C_out = 1 # Canali di output
K_h = 2  # Altezza Kernel
K_w = 2  # Larghezza Kernel

stride = 1
padding = 0

# --- Dati di Input (NumPy) ---
np.random.seed(0) # Per riproducibilità
img_np = np.random.randn(N, C_in, H_in, W_in).astype(np.float32)
# img_np = np.arange(1, N*C_in*H_in*W_in+1).reshape(N, C_in, H_in, W_in).astype(np.float32) / 10.0
kernel_np = np.random.randn(C_out, C_in, K_h, K_w).astype(np.float32)
# kernel_np = np.arange(1, C_out*C_in*K_h*K_w+1).reshape(C_out, C_in, K_h, K_w).astype(np.float32) / 5.0
bias_np = np.random.randn(C_out).astype(np.float32)
# bias_np = np.array([0.5], dtype=np.float32)


# --- Calcolo Forward Pass (NumPy) ---
# 1. Convoluzione
out_conv_h_np = (H_in - K_h + 2 * padding) // stride + 1
out_conv_w_np = (W_in - K_w + 2 * padding) // stride + 1
out_conv_np = np.zeros((N, C_out, out_conv_h_np, out_conv_w_np), dtype=np.float32)

img_padded_np = np.pad(img_np, ((0,0), (0,0), (padding,padding), (padding,padding)), mode='constant')

for n in range(N):
    for c_o in range(C_out):
        for r_o in range(out_conv_h_np):
            for c_o_conv in range(out_conv_w_np):
                val = bias_np[c_o]
                for c_i in range(C_in):
                    for r_k in range(K_h):
                        for c_k in range(K_w):
                            img_r = r_o * stride + r_k
                            img_c = c_o_conv * stride + c_k
                            val += img_padded_np[n, c_i, img_r, img_c] * kernel_np[c_o, c_i, r_k, c_k]
                out_conv_np[n, c_o, r_o, c_o_conv] = val

# 2. ReLU
out_relu_np = np.maximum(0, out_conv_np)
mask_relu_np = (out_conv_np > 0).astype(np.float32)


# --- Gradiente dell'Output (NumPy) - Simula il gradiente dal layer successivo ---
# d_out_relu_np è il "d_out_values" per la tua funzione
d_out_relu_np = np.random.randn(N, C_out, out_conv_h_np, out_conv_w_np).astype(np.float32)
# d_out_relu_np = np.ones_like(out_relu_np).astype(np.float32) * 0.1


# --- Calcolo Backward Pass (Tua Implementazione NumPy) ---
grad_input_np, grad_kernel_np, grad_bias_np = Slow_ReLU_Gradient(
    img_np, d_out_relu_np, kernel_np, mask_relu_np, pad_fwd=padding, stride_fwd=stride
)

print("--- Risultati NumPy ---")
print("grad_input_np:\n", grad_input_np)
print("grad_kernel_np:\n", grad_kernel_np)
print("grad_bias_np:\n", grad_bias_np)


# --- Calcolo con PyTorch per Confronto ---
img_torch = torch.tensor(img_np, requires_grad=True)
kernel_torch = torch.tensor(kernel_np, requires_grad=True)
bias_torch = torch.tensor(bias_np, requires_grad=True)

# Forward pass PyTorch
out_conv_torch = F.conv2d(img_torch, kernel_torch, bias_torch, stride=stride, padding=padding)
out_relu_torch = F.relu(out_conv_torch)

# Backward pass PyTorch
# Passiamo lo stesso gradiente dell'output che abbiamo usato per NumPy
d_out_relu_torch = torch.tensor(d_out_relu_np)
out_relu_torch.backward(gradient=d_out_relu_torch)

print("\n--- Risultati PyTorch ---")
print("img_torch.grad:\n", img_torch.grad.data.numpy())
print("kernel_torch.grad:\n", kernel_torch.grad.data.numpy())
print("bias_torch.grad:\n", bias_torch.grad.data.numpy())


# --- Confronto ---
print("\n--- Confronto dei Risultati ---")
print("Confronto grad_input:", np.allclose(grad_input_np, img_torch.grad.data.numpy(), atol=1e-5))
print("Confronto grad_kernel:", np.allclose(grad_kernel_np, kernel_torch.grad.data.numpy(), atol=1e-5))
print("Confronto grad_bias:", np.allclose(grad_bias_np, bias_torch.grad.data.numpy(), atol=1e-5))

print("\n--- Verifica Dimensioni Output del Forward ---")
print("NumPy out_relu_np shape:", out_relu_np.shape)
print("PyTorch out_relu_torch shape:", out_relu_torch.shape)
assert out_relu_np.shape == out_relu_torch.shape, "Le dimensioni dell'output del forward non corrispondono!"

print("\n--- Verifica Valori Output del Forward ---")
print("Forward outputs close:", np.allclose(out_relu_np, out_relu_torch.detach().numpy(), atol=1e-5))
if not np.allclose(out_relu_np, out_relu_torch.detach().numpy(), atol=1e-5):
    print("NumPy out_relu_np:\n", out_relu_np)
    print("PyTorch out_relu_torch:\n", out_relu_torch.detach().numpy())


# Test con STRIDE = 2, PADDING = 1
print("\n\n--- TEST CON STRIDE=2, PADDING=1 ---")
stride = 2
padding = 1

# --- Dati di Input (NumPy) ---
img_np_s2p1 = np.random.randn(N, C_in, H_in+1, W_in+1).astype(np.float32) # Immagine leggermente più grande
kernel_np_s2p1 = np.random.randn(C_out, C_in, K_h, K_w).astype(np.float32)
bias_np_s2p1 = np.random.randn(C_out).astype(np.float32)

# --- Calcolo Forward Pass (NumPy) ---
out_conv_h_np_s2p1 = (img_np_s2p1.shape[2] - K_h + 2 * padding) // stride + 1
out_conv_w_np_s2p1 = (img_np_s2p1.shape[3] - K_w + 2 * padding) // stride + 1
out_conv_np_s2p1 = np.zeros((N, C_out, out_conv_h_np_s2p1, out_conv_w_np_s2p1), dtype=np.float32)
img_padded_np_s2p1 = np.pad(img_np_s2p1, ((0,0), (0,0), (padding,padding), (padding,padding)), mode='constant')
for n in range(N):
    for c_o in range(C_out):
        for r_o in range(out_conv_h_np_s2p1):
            for c_o_conv in range(out_conv_w_np_s2p1):
                val = bias_np_s2p1[c_o]
                for c_i in range(C_in):
                    for r_k in range(K_h):
                        for c_k in range(K_w):
                            img_r = r_o * stride + r_k
                            img_c = c_o_conv * stride + c_k
                            val += img_padded_np_s2p1[n, c_i, img_r, img_c] * kernel_np_s2p1[c_o, c_i, r_k, c_k]
                out_conv_np_s2p1[n, c_o, r_o, c_o_conv] = val
out_relu_np_s2p1 = np.maximum(0, out_conv_np_s2p1)
mask_relu_np_s2p1 = (out_conv_np_s2p1 > 0).astype(np.float32)
d_out_relu_np_s2p1 = np.random.randn(N, C_out, out_conv_h_np_s2p1, out_conv_w_np_s2p1).astype(np.float32)

# --- Calcolo Backward Pass (Tua Implementazione NumPy) ---
grad_input_np_s2p1, grad_kernel_np_s2p1, grad_bias_np_s2p1 = Slow_ReLU_Gradient(
    img_np_s2p1, d_out_relu_np_s2p1, kernel_np_s2p1, mask_relu_np_s2p1, pad_fwd=padding, stride_fwd=stride
)

# --- Calcolo con PyTorch per Confronto ---
img_torch_s2p1 = torch.tensor(img_np_s2p1, requires_grad=True)
kernel_torch_s2p1 = torch.tensor(kernel_np_s2p1, requires_grad=True)
bias_torch_s2p1 = torch.tensor(bias_np_s2p1, requires_grad=True)
out_conv_torch_s2p1 = F.conv2d(img_torch_s2p1, kernel_torch_s2p1, bias_torch_s2p1, stride=stride, padding=padding)
out_relu_torch_s2p1 = F.relu(out_conv_torch_s2p1)
d_out_relu_torch_s2p1 = torch.tensor(d_out_relu_np_s2p1)
out_relu_torch_s2p1.backward(gradient=d_out_relu_torch_s2p1)

print("--- Confronto dei Risultati (Stride=2, Padding=1) ---")
print("Confronto grad_input (s2p1):", np.allclose(grad_input_np_s2p1, img_torch_s2p1.grad.data.numpy(), atol=1e-5))
print("Confronto grad_kernel (s2p1):", np.allclose(grad_kernel_np_s2p1, kernel_torch_s2p1.grad.data.numpy(), atol=1e-5))
print("Confronto grad_bias (s2p1):", np.allclose(grad_bias_np_s2p1, bias_torch_s2p1.grad.data.numpy(), atol=1e-5))

if not np.allclose(grad_input_np_s2p1, img_torch_s2p1.grad.data.numpy(), atol=1e-5):
    print("NumPy grad_input_s2p1:\n", grad_input_np_s2p1)
    print("PyTorch grad_input_s2p1:\n", img_torch_s2p1.grad.data.numpy())
if not np.allclose(grad_kernel_np_s2p1, kernel_torch_s2p1.grad.data.numpy(), atol=1e-5):
    print("NumPy grad_kernel_s2p1:\n", grad_kernel_np_s2p1)
    print("PyTorch grad_kernel_s2p1:\n", kernel_torch_s2p1.grad.data.numpy())
if not np.allclose(grad_bias_np_s2p1, bias_torch_s2p1.grad.data.numpy(), atol=1e-5):
    print("NumPy grad_bias_s2p1:\n", grad_bias_np_s2p1)
    print("PyTorch grad_bias_s2p1:\n", bias_torch_s2p1.grad.data.numpy())

--- Risultati NumPy ---
grad_input_np:
 [[[[ 0.18224959  0.20094183  0.04806364]
   [ 1.2589653   0.9540252   0.22438703]
   [ 2.1727996   0.83869433 -0.15613317]]]]
grad_kernel_np:
 [[[[3.8814468  3.4949708 ]
   [3.0683658  0.29788435]]]]
grad_bias_np:
 [2.0664585]

--- Risultati PyTorch ---
img_torch.grad:
 [[[[ 0.18224959  0.20094183  0.04806364]
   [ 1.2589653   0.95402527  0.22438703]
   [ 2.1727996   0.83869433 -0.15613317]]]]
kernel_torch.grad:
 [[[[3.8814468  3.4949708 ]
   [3.0683656  0.29788435]]]]
bias_torch.grad:
 [2.0664585]

--- Confronto dei Risultati ---
Confronto grad_input: True
Confronto grad_kernel: True
Confronto grad_bias: True

--- Verifica Dimensioni Output del Forward ---
NumPy out_relu_np shape: (1, 1, 2, 2)
PyTorch out_relu_torch shape: torch.Size([1, 1, 2, 2])

--- Verifica Valori Output del Forward ---
Forward outputs close: True


--- TEST CON STRIDE=2, PADDING=1 ---
--- Confronto dei Risultati (Stride=2, Padding=1) ---
Confronto grad_input (s2p1): True
Co