# unimib/DSIM 2025-2026: Task 2

Plant disease classifier implementation - E. Mosca 925279

This notebook presents the base architecture used for model training. The implementation and an analysis of its components follow.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import collections

The model architecture is quite simple: it is made of 6 Conv2d + BatchNorm + ReLU + MaxPool blocks, followed by a Linear + ReLU layer that is connected to the final classification layer (with 0.5 dropout applied during training).

Six conv layers were used because that was the number required to build a receptive field covering the entire input image (at least 256 pixels in both directions); that is to say, the elements of the final convolutional feature maps result from processing the entire input image.

In [10]:
class PlantClassifier(nn.Module):
    """Compact CNN image classifier (plant disease classes by default).

    The feature extractor stacks ``n_conv_layers`` blocks of
    ``Conv2d(3x3, stride 1, no padding) -> BatchNorm2d -> ReLU -> MaxPool2d(2x2, stride 2)``.
    Block ``i`` outputs ``base_filters * 2**i`` channels. The flattened feature
    map is fed to a ``Linear -> ReLU -> Dropout(0.5) -> Linear`` head.

    Args:
        num_classes: Number of output logits.
        in_features: Number of channels of the input image.
        base_filters: Output channels of the first conv block.
        n_conv_layers: Number of conv blocks in the feature extractor.
        num_fc_units: Width of the hidden fully connected layer.
        input_size: Spatial size of the images the head is built for, either an
            int (square images) or a ``(height, width)`` pair. Defaults to 256.

    Raises:
        ValueError: If the input is too small to pass through
            ``n_conv_layers`` blocks.
    """

    # conv stride stays 1, kernel size stays 3
    _CONV_KERNEL = 3
    _POOL_KERNEL = 2

    def __init__(self, num_classes=39, in_features=3, base_filters=16,
                 n_conv_layers=6, num_fc_units=256, input_size=256):
        super().__init__()

        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size

        # Smallest side for which an un-padded conv followed by the pooling
        # step still yields at least one output element.
        min_side = self._CONV_KERNEL - 1 + self._POOL_KERNEL

        blocks = []
        channels = in_features
        for i in range(n_conv_layers):
            if min(height, width) < min_side:
                raise ValueError(
                    f"feature map of size {height}x{width} is too small for conv block {i}; "
                    f"reduce n_conv_layers ({n_conv_layers}) or increase input_size ({input_size})"
                )
            out_channels = base_filters * (2 ** i)
            blocks.append(self._make_conv_block(channels, out_channels, layer_idx=str(i)))
            channels = out_channels
            height = self._block_output_side(height)
            width = self._block_output_side(width)
        self.features = nn.Sequential(*blocks)

        # The flattened size is derived arithmetically instead of pushing a
        # random dummy batch through ``self.features``: a forward pass in
        # training mode would update the BatchNorm running statistics (even
        # under ``torch.no_grad()``) and consume global RNG state.
        flattened_size = channels * height * width

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, num_fc_units),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(num_fc_units, num_classes)
        )

    @staticmethod
    def _make_conv_block(in_channels, out_channels, layer_idx=""):
        """Build one Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d block with indexed layer names."""
        return nn.Sequential(
            collections.OrderedDict([
                ('conv' + layer_idx, nn.Conv2d(in_channels, out_channels,
                                               kernel_size=PlantClassifier._CONV_KERNEL,
                                               stride=1, padding="valid")),
                ('bn' + layer_idx, nn.BatchNorm2d(out_channels)),
                ('relu' + layer_idx, nn.ReLU(inplace=True)),
                ('pool' + layer_idx, nn.MaxPool2d(kernel_size=PlantClassifier._POOL_KERNEL,
                                                  stride=PlantClassifier._POOL_KERNEL))
            ])
        )

    @classmethod
    def _block_output_side(cls, side):
        """Return the length of one spatial side after a single conv block."""
        after_conv = side - (cls._CONV_KERNEL - 1)   # un-padded conv, stride 1
        return after_conv // cls._POOL_KERNEL        # pooling floors odd sizes

    def forward(self, x):
        """Return class logits for a batch ``x`` of shape ``(N, in_features, H, W)``.

        ``H`` and ``W`` must match the ``input_size`` given at construction,
        because the first linear layer is sized for that feature map.
        """
        x = self.features(x)
        x = self.classifier(x)
        return x

In [11]:
# Show the module hierarchy (layer names and their hyper-parameters)
# of a model built with the default arguments.
print(str(PlantClassifier()))

PlantClassifier(
  (features): Sequential(
    (0): Sequential(
      (conv0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=valid)
      (bn0): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=valid)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=valid)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu2): ReLU(inplace=True)
      (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode

In [12]:
# Parameter budget: build the model, then report the overall number of
# parameters followed by a per-tensor breakdown.
model = PlantClassifier()
param_sizes = [(name, tensor.numel()) for name, tensor in model.named_parameters()]
total_params = sum(size for name, size in param_sizes)
print(f'Total parameters: {total_params}')
for name, size in param_sizes:
    print(name, size)

Total parameters: 2109351
features.0.conv0.weight 432
features.0.conv0.bias 16
features.0.bn0.weight 16
features.0.bn0.bias 16
features.1.conv1.weight 4608
features.1.conv1.bias 32
features.1.bn1.weight 32
features.1.bn1.bias 32
features.2.conv2.weight 18432
features.2.conv2.bias 64
features.2.bn2.weight 64
features.2.bn2.bias 64
features.3.conv3.weight 73728
features.3.conv3.bias 128
features.3.bn3.weight 128
features.3.bn3.bias 128
features.4.conv4.weight 294912
features.4.conv4.bias 256
features.4.bn4.weight 256
features.4.bn4.bias 256
features.5.conv5.weight 1179648
features.5.conv5.bias 512
features.5.bn5.weight 512
features.5.bn5.bias 512
classifier.1.weight 524288
classifier.1.bias 256
classifier.4.weight 9984
classifier.4.bias 39


In [7]:
# Print the tensor shape after every feature block and every classifier layer.
# The trace runs in eval mode under no_grad so that the random probe does not
# update the BatchNorm running statistics of `model`, is not affected by
# dropout, and does not build an autograd graph. The previous train/eval mode
# is restored afterwards.
# Re-run this cell after any change to PlantClassifier so that the printed
# shapes reflect the current architecture.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        x = torch.randn(1, 3, 256, 256)  # Example input tensor
        print(x.shape)
        for layer in model.features:
            x = layer(x)
            print(x.shape)
        for layer in model.classifier:
            x = layer(x)
            print(x.shape)
finally:
    model.train(was_training)

torch.Size([1, 3, 256, 256])
torch.Size([1, 16, 127, 127])
torch.Size([1, 32, 62, 62])
torch.Size([1, 64, 30, 30])
torch.Size([1, 128, 14, 14])
torch.Size([1, 256, 6, 6])
torch.Size([1, 256, 2, 2])
torch.Size([1, 1024])
torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 39])


### Dev notes

- The model architecture was kept parsimonious, with deployment on edge devices in mind.
- The initial concern was to obtain a full receptive field for the known input image size, so that each final output is the result of computations over the entire input.