##### 【Problem 1 】 Creation of 2D convolution layer

In [2]:
import numpy as np

class Conv2d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size  # (height, width)
        self.stride = stride
        self.padding = padding
        self.W = np.random.randn(kernel_size[0], kernel_size[1], in_channels, out_channels)
        self.b = np.zeros(out_channels)

    def forward(self, x):
        self.x = x
        N, C, H, W = x.shape
        FH, FW = self.kernel_size
        out_h = (H + 2*self.padding - FH) // self.stride + 1
        out_w = (W + 2*self.padding - FW) // self.stride + 1

        self.out_h, self.out_w = out_h, out_w

        if self.padding > 0:
            x = np.pad(x, ((0,0), (0,0), (self.padding,self.padding), (self.padding,self.padding)), 'constant')

        out = np.zeros((N, self.out_channels, out_h, out_w))

        for i in range(out_h):
            for j in range(out_w):
                h_start = i * self.stride
                w_start = j * self.stride
                region = x[:, :, h_start:h_start+FH, w_start:w_start+FW]
                for m in range(self.out_channels):
                    out[:, m, i, j] = np.sum(region * self.W[:, :, :, m], axis=(1, 2, 3)) + self.b[m]
        return out

    def backward(self, dout):
        N, C, H, W = self.x.shape
        FH, FW = self.kernel_size

        dx = np.zeros_like(self.x)
        dW = np.zeros_like(self.W)
        db = np.zeros_like(self.b)

        for i in range(self.out_h):
            for j in range(self.out_w):
                h_start = i * self.stride
                w_start = j * self.stride
                region = self.x[:, :, h_start:h_start+FH, w_start:w_start+FW]
                for m in range(self.out_channels):
                    db[m] += np.sum(dout[:, m, i, j])
                    for n in range(N):
                        dW[:, :, :, m] += region[n] * dout[n, m, i, j]
                        dx[n, :, h_start:h_start+FH, w_start:w_start+FW] += self.W[:, :, :, m] * dout[n, m, i, j]

        self.dW = dW
        self.db = db
        return dx


##### Problem 2 】 Experiment of 2D convolution layer in small array

In [3]:
# Input and weights
x = np.array([[[[ 1,  2,  3,  4],
                [ 5,  6,  7,  8],
                [ 9, 10, 11, 12],
                [13, 14, 15, 16]]]])  # shape (1,1,4,4)

w = np.array([[[[ 0.,  0.,  0.],
                [ 0.,  1.,  0.],
                [ 0., -1.,  0.]]],

              [[[ 0.,  0.,  0.],
                [ 0., -1.,  1.],
                [ 0.,  0.,  0.]]]])  # shape (2,1,3,3) NHWC → NCHW compatible

# Manually set weights to match shape
conv = Conv2d(in_channels=1, out_channels=2, kernel_size=(3, 3))
conv.W = w.transpose(2, 3, 1, 0)  # convert from (2,1,3,3) to (3,3,1,2)
conv.b = np.array([0, 0])

# Forward
out = conv.forward(x)
print(out)


[[[[ 0.  0.]
   [ 0.  0.]]

  [[12. 12.]
   [12. 12.]]]]


##### 【Problem 3 】 Output size after two-dimensional convolution

In [4]:
def calculate_output_size(H_in, W_in, FH, FW, SH, SW, PH, PW):
    H_out = (H_in + 2*PH - FH) // SH + 1
    W_out = (W_in + 2*PW - FW) // SW + 1
    return H_out, W_out


##### 【Problem 4 】 Creating a maximum pooling layer

In [5]:
class MaxPool2D:
    def __init__(self, kernel_size, stride):
        self.kernel_size = kernel_size
        self.stride = stride

    def forward(self, x):
        self.x = x
        N, C, H, W = x.shape
        FH, FW = self.kernel_size
        SH, SW = self.stride
        out_h = (H - FH) // SH + 1
        out_w = (W - FW) // SW + 1

        out = np.zeros((N, C, out_h, out_w))
        self.argmax = np.zeros_like(x)

        for i in range(out_h):
            for j in range(out_w):
                h_start = i * SH
                w_start = j * SW
                region = x[:, :, h_start:h_start+FH, w_start:w_start+FW]
                out[:, :, i, j] = np.max(region, axis=(2, 3))

                max_mask = (region == np.expand_dims(out[:, :, i, j], axis=(2, 3)))
                self.argmax[:, :, h_start:h_start+FH, w_start:w_start+FW] += max_mask

        return out

    def backward(self, dout):
        dx = np.zeros_like(self.x)
        FH, FW = self.kernel_size
        SH, SW = self.stride
        N, C, out_h, out_w = dout.shape

        for i in range(out_h):
            for j in range(out_w):
                h_start = i * SH
                w_start = j * SW
                max_mask = self.argmax[:, :, h_start:h_start+FH, w_start:w_start+FW]
                dx[:, :, h_start:h_start+FH, w_start:w_start+FW] += max_mask * np.expand_dims(dout[:, :, i, j], axis=(2, 3))

        return dx


##### 【Problem 5 】 (Advance Challenge) Create average pooling

In [6]:
class AveragePool2D:
    def __init__(self, kernel_size, stride):
        self.kernel_size = kernel_size
        self.stride = stride

    def forward(self, x):
        self.x = x
        N, C, H, W = x.shape
        FH, FW = self.kernel_size
        SH, SW = self.stride
        out_h = (H - FH) // SH + 1
        out_w = (W - FW) // SW + 1

        out = np.zeros((N, C, out_h, out_w))
        for i in range(out_h):
            for j in range(out_w):
                h_start = i * SH
                w_start = j * SW
                region = x[:, :, h_start:h_start+FH, w_start:w_start+FW]
                out[:, :, i, j] = np.mean(region, axis=(2, 3))
        return out

    def backward(self, dout):
        FH, FW = self.kernel_size
        SH, SW = self.stride
        dx = np.zeros_like(self.x)
        N, C, out_h, out_w = dout.shape

        for i in range(out_h):
            for j in range(out_w):
                h_start = i * SH
                w_start = j * SW
                dx[:, :, h_start:h_start+FH, w_start:w_start+FW] += (dout[:, :, i, j][:, :, None, None] / (FH * FW))

        return dx


##### 【Problem 6 】 Smoothing

In [7]:
class Flatten:
    def forward(self, x):
        self.orig_shape = x.shape
        return x.reshape(x.shape[0], -1)

    def backward(self, dout):
        return dout.reshape(self.orig_shape)


##### 【Problem 7 】 Learning and estimation

In [8]:
class LeNet:
    def __init__(self):
        self.conv1 = Conv2d(in_channels=1, out_channels=6, kernel_size=(5, 5), stride=1, padding=0)
        self.pool1 = AveragePool2D(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = Conv2d(in_channels=6, out_channels=16, kernel_size=(5, 5), stride=1, padding=0)
        self.pool2 = AveragePool2D(kernel_size=(2, 2), stride=(2, 2))
        self.flatten = Flatten()
        self.fc1 = FullyConnected(16 * 5 * 5, 120)
        self.fc2 = FullyConnected(120, 84)
        self.fc3 = FullyConnected(84, 10)

    def forward(self, x):
        x = self.conv1.forward(x)
        x = self.pool1.forward(x)
        x = self.conv2.forward(x)
        x = self.pool2.forward(x)
        x = self.flatten.forward(x)
        x = self.fc1.forward(x)
        x = self.fc2.forward(x)
        x = self.fc3.forward(x)
        return x

    def backward(self, dout):
        dout = self.fc3.backward(dout)
        dout = self.fc2.backward(dout)
        dout = self.fc1.backward(dout)
        dout = self.flatten.backward(dout)
        dout = self.pool2.backward(dout)
        dout = self.conv2.backward(dout)
        dout = self.pool1.backward(dout)
        dout = self.conv1.backward(dout)
        return dout


##### 【Problem 8 】 (Advance Challenge) LeNet

In [11]:
import numpy as np

# ReLU activation function
class ReLU:
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        return dout

# Softmax and Cross-Entropy Loss
class SoftmaxWithLoss:
    def forward(self, x, t):
        self.t = t
        self.y = self._softmax(x)
        self.loss = self._cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

    def _softmax(self, x):
        x = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(x)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def _cross_entropy_error(self, y, t):
        delta = 1e-7
        return -np.sum(t * np.log(y + delta)) / y.shape[0]

# 2D Convolution Layer
class Conv2D:
    def __init__(self, input_channels, output_channels, kernel_size, stride=1, padding=0):
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.W = np.random.randn(output_channels, input_channels, kernel_size, kernel_size) * 0.1
        self.b = np.zeros(output_channels)
    
    def forward(self, x):
        self.x = x
        batch_size, in_channels, height, width = x.shape
        out_height = (height - self.kernel_size + 2 * self.padding) // self.stride + 1
        out_width = (width - self.kernel_size + 2 * self.padding) // self.stride + 1
        
        self.output = np.zeros((batch_size, self.output_channels, out_height, out_width))
        
        for b in range(batch_size):
            for c in range(self.output_channels):
                for i in range(0, height - self.kernel_size + 1, self.stride):
                    for j in range(0, width - self.kernel_size + 1, self.stride):
                        self.output[b, c, i // self.stride, j // self.stride] = np.sum(
                            x[b, :, i:i + self.kernel_size, j:j + self.kernel_size] * self.W[c, :, :, :]
                        ) + self.b[c]
        return self.output

    def backward(self, dout):
        batch_size, out_channels, out_height, out_width = dout.shape
        dx = np.zeros_like(self.x)
        dW = np.zeros_like(self.W)
        db = np.zeros_like(self.b)
        
        for b in range(batch_size):
            for c in range(self.output_channels):
                for i in range(0, self.x.shape[2] - self.kernel_size + 1, self.stride):
                    for j in range(0, self.x.shape[3] - self.kernel_size + 1, self.stride):
                        region = self.x[b, :, i:i + self.kernel_size, j:j + self.kernel_size]
                        dW[c] += dout[b, c, i // self.stride, j // self.stride] * region
                        dx[b, :, i:i + self.kernel_size, j:j + self.kernel_size] += dout[b, c, i // self.stride, j // self.stride] * self.W[c]
                db[c] += np.sum(dout[b, c])

        return dx, dW, db

# Max Pooling Layer
class MaxPool2D:
    def __init__(self, pool_size=2, stride=2):
        self.pool_size = pool_size
        self.stride = stride
    
    def forward(self, x):
        self.x = x
        batch_size, channels, height, width = x.shape
        out_height = (height - self.pool_size) // self.stride + 1
        out_width = (width - self.pool_size) // self.stride + 1
        
        self.output = np.zeros((batch_size, channels, out_height, out_width))
        self.mask = np.zeros_like(self.output)
        
        for b in range(batch_size):
            for c in range(channels):
                for i in range(0, height - self.pool_size + 1, self.stride):
                    for j in range(0, width - self.pool_size + 1, self.stride):
                        region = x[b, c, i:i + self.pool_size, j:j + self.pool_size]
                        self.output[b, c, i // self.stride, j // self.stride] = np.max(region)
                        self.mask[b, c, i // self.stride, j // self.stride] = np.argmax(region)
        return self.output

    def backward(self, dout):
        dx = np.zeros_like(self.x)
        for b in range(dout.shape[0]):
            for c in range(dout.shape[1]):
                for i in range(dout.shape[2]):
                    for j in range(dout.shape[3]):
                        max_index = self.mask[b, c, i, j]
                        dx[b, c, i * self.pool_size + max_index // self.pool_size, j * self.pool_size + max_index % self.pool_size] = dout[b, c, i, j]
        return dx

# Flatten Layer
class Flatten:
    def forward(self, x):
        self.input_shape = x.shape
        return x.reshape(x.shape[0], -1)
    
    def backward(self, dout):
        return dout.reshape(self.input_shape)

# Fully Connected Layer
class FullyConnected:
    def __init__(self, input_size, output_size):
        self.W = np.random.randn(input_size, output_size) * 0.1
        self.b = np.zeros(output_size)
    
    def forward(self, x):
        self.x = x
        return np.dot(x, self.W) + self.b
    
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        dW = np.dot(self.x.T, dout)
        db = np.sum(dout, axis=0)
        return dx, dW, db

# LeNet Model
class LeNet:
    def __init__(self):
        self.conv1 = Conv2D(input_channels=1, output_channels=6, kernel_size=5, stride=1, padding=0)
        self.relu1 = ReLU()
        self.pool1 = MaxPool2D(pool_size=2, stride=2)

        self.conv2 = Conv2D(input_channels=6, output_channels=16, kernel_size=5, stride=1, padding=0)
        self.relu2 = ReLU()
        self.pool2 = MaxPool2D(pool_size=2, stride=2)

        self.flatten = Flatten()

        self.fc1 = FullyConnected(16 * 4 * 4, 120)  # Output size after Conv2 and Pool2 layers
        self.relu3 = ReLU()

        self.fc2 = FullyConnected(120, 84)
        self.relu4 = ReLU()

        self.fc3 = FullyConnected(84, 10)
        self.loss_fn = SoftmaxWithLoss()

        self.layers = [
            self.conv1, self.relu1, self.pool1,
            self.conv2, self.relu2, self.pool2,
            self.flatten,
            self.fc1, self.relu3,
            self.fc2, self.relu4,
            self.fc3
        ]

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def compute_loss(self, x, t):
        score = self.forward(x)
        return self.loss_fn.forward(score, t)

    def backward(self):
        dout = self.loss_fn.backward()
        for layer in reversed(self.layers):
            dout = layer.backward(dout)

    def update(self, lr=0.01):
        for layer in self.layers:
            if hasattr(layer, 'W'):
                layer.W -= lr * layer.dW
                layer.b -= lr * layer.db

# Training
def train(model, X_train, y_train, X_test, y_test, epochs=5, batch_size=64, lr=0.01):
    for epoch in range(epochs):
        perm = np.random.permutation(len(X_train))
        X_train = X_train[perm]
        y_train = y_train[perm]

        for i in range(0, len(X_train), batch_size):
            X_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            loss = model.compute_loss(X_batch, y_batch)
            model.backward()
            model.update(lr)

        # Simple accuracy check
        y_pred = model.forward(X_test)
        acc = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1))
        print(f"Epoch {epoch+1} - Loss: {loss:.4f} - Accuracy: {acc:.4f}")

# Example usage
# Assuming you have X_train, y_train, X_test, y_test loaded and preprocessed
# model = LeNet()
# train(model, X_train, y_train, X_test, y_test)


##### 【Problem 9 】 (Advance Challenge) Survey of famous image recognition models

# **Problem 9: Survey of Famous Image Recognition Models**

### 1. **AlexNet (2012)**

- **Overview**: AlexNet was a groundbreaking model that won the 2012 ImageNet competition by a significant margin. It popularized deep CNNs by using multiple layers of convolution and max-pooling. It also made use of ReLU activations to speed up training.
- **Key Features**:
  - **Convolutional Layers**: 5 convolutional layers followed by 3 fully connected layers.
  - **ReLU Activation**: Used to accelerate training.
  - **Dropout**: Introduced dropout to prevent overfitting in the fully connected layers.
  - **Data Augmentation**: Used to improve generalization by applying random transformations to the training data.
- **Impact**: It showed that deep networks with many layers and large datasets could lead to dramatic improvements in performance.

### 2. **VGG16 (2014)**

- **Overview**: VGG16 is a deeper architecture that uses very small 3x3 convolution filters across all layers. It proved that stacking small filters (3x3) repeatedly leads to better performance than using larger filters.
- **Key Features**:
  - **Convolutional Layers**: 16 layers (13 convolutional layers and 3 fully connected layers).
  - **Small Filters**: Used 3x3 filters for all convolutional layers.
  - **Fully Connected Layers**: 3 fully connected layers, with the last one being a softmax output.
  - **Max-Pooling**: After every few convolutional layers, a 2x2 max-pooling layer is used to reduce the spatial dimensions.
- **Impact**: VGG16 demonstrated the effectiveness of deep networks and is widely used as a backbone for various tasks in computer vision.

---


##### 【Problem 10 】 Output size and parameter count calculation

# **Problem 10: Output Size and Parameter Count Calculation**

### 1. **First Convolution Layer**:
- **Input Size**: 144 × 144, 3 channels
- **Filter Size**: 3 × 3, 6 channels
- **Stride**: 1
- **Padding**: None

  **Output Size**: 
  - Height: (144 - 3 + 0) / 1 + 1 = 142
  - Width: (144 - 3 + 0) / 1 + 1 = 142
  - Channels: 6
  - **Output Size** = 142 × 142 × 6

  **Number of Parameters**: 
  - Each filter has 3 × 3 weights (9) per input channel. For 6 filters and 3 input channels:
    - Weights: 3 × 3 × 3 × 6 = 162
    - Biases: 6 (one for each filter)
    - **Total Parameters** = 162 + 6 = 168

---

### 2. **Second Convolution Layer**:
- **Input Size**: 60 × 60, 24 channels
- **Filter Size**: 3 × 3, 48 channels
- **Stride**: 1
- **Padding**: None

  **Output Size**: 
  - Height: (60 - 3 + 0) / 1 + 1 = 58
  - Width: (60 - 3 + 0) / 1 + 1 = 58
  - Channels: 48
  - **Output Size** = 58 × 58 × 48

  **Number of Parameters**: 
  - Each filter has 3 × 3 weights per input channel. For 48 filters and 24 input channels:
    - Weights: 3 × 3 × 24 × 48 = 10368
    - Biases: 48 (one for each filter)
    - **Total Parameters** = 10368 + 48 = 10416

---

### 3. **Third Convolution Layer**:
- **Input Size**: 20 × 20, 10 channels
- **Filter Size**: 3 × 3, 20 channels
- **Stride**: 2
- **Padding**: None

  **Output Size**: 
  - Height: (20 - 3 + 0) / 2 + 1 = 9
  - Width: (20 - 3 + 0) / 2 + 1 = 9
  - Channels: 20
  - **Output Size** = 9 × 9 × 20

  **Number of Parameters**: 
  - Each filter has 3 × 3 weights per input channel. For 20 filters and 10 input channels:
    - Weights: 3 × 3 × 10 × 20 = 1800
    - Biases: 20 (one for each filter)
    - **Total Parameters** = 1800 + 20 = 1820

---

##### 【Problem 11 】 (Advance Challenge) Survey on filter size

# **Problem 11: Survey on Filter Size**

### Why 3×3 filters are commonly used, rather than larger ones like 7×7?

1. **Smaller Filters Capture Fine Details**: 
   - A **3x3 filter** is smaller and captures local features at a fine-grained level, making it suitable for learning smaller, more detailed features of images.
   
2. **Stacking 3x3 Filters to Capture Larger Receptive Fields**:
   - By stacking multiple 3x3 filters, a network can effectively capture a larger receptive field with fewer parameters. For instance, two stacked 3x3 filters can have a receptive field of 5x5, while still maintaining a smaller number of parameters compared to a single 5x5 or 7x7 filter.
   
3. **Efficient Parameter Usage**: 
   - Using 3x3 filters keeps the model efficient with fewer parameters. A single 7x7 filter has more parameters and is computationally more expensive. Using 3x3 filters in depth allows for more flexible and efficient networks.

4. **Empirical Success**: 
   - 3x3 filters have been empirically successful in models like VGG and ResNet. They strike a good balance between capturing local features and keeping the model relatively lightweight.

### Effect of 1x1 filter without height or width?

1. **Dimensionality Reduction**:
   - A **1x1 filter** has no spatial extent (height or width) and only operates on the depth of the input. It’s used for dimensionality reduction or increasing the depth of the feature maps without altering their spatial size.
   
2. **Channel-wise Transformation**:
   - It allows transforming the number of channels in the feature map, essentially working as a **pointwise** convolution. This is especially useful in architectures like **Inception** where 1x1 convolutions are used to reduce the computational burden by limiting the depth of intermediate layers.
   
3. **Increased Non-linearity**:
   - The use of 1x1 convolutions increases the ability of the network to model more complex relationships by introducing more non-linearity without changing the spatial resolution.

In summary, **3x3 filters** are widely used due to their computational efficiency and ability to capture detailed features, while **1x1 filters** provide an efficient way to manage depth and channel transformations in the network.