# unimib/DSIM 2025-2026: Task 2

Model Compression - E. Mosca 925279

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset
import torch.utils.data as data
import time
import numpy as np
import os


from model import PlantClassifier, QuantizedPlantClassifier

In [None]:
# Location of the zipped dataset inside the mounted Google Drive folder.
path_on_gdrive = 'gdrive/My Drive/dsim/Plant_leave_diseases_dataset_without_augmentation.zip'
# Archive base name: '<name>.zip' is the local copy, '<name>' is the folder later passed to ImageFolder.
dataset_filename = 'Plant_leave_diseases_dataset_without_augmentation' # do not include the '.zip' extension in the name

In [None]:
from google.colab import drive
import zipfile
from shutil import copyfile

# Mount Google Drive, copy the dataset archive into the working directory and unpack it.
drive.mount('/content/gdrive')
copyfile(path_on_gdrive, dataset_filename+'.zip')
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(dataset_filename+'.zip') as zipf:
    zipf.extractall()

Mounted at /content/gdrive


In [None]:
# Index the extracted image folder; at this point only ToTensor is applied (no resizing).
dataset = ImageFolder(root=dataset_filename, transform=transforms.ToTensor())

In [None]:
# Copy the model definitions from Drive into the working directory so they can be imported.
copyfile("gdrive/MyDrive/dsim/task2/model.py", "model.py")
from model import PlantClassifier, QuantizedPlantClassifier

In [None]:
from shutil import copytree

# Copy the saved checkpoints from Drive. dirs_exist_ok lets the cell be re-run
# without raising FileExistsError once ./models already exists.
copytree("gdrive/MyDrive/dsim/task2/models", "models", dirs_exist_ok=True)

Training of the classifier has finished, now the goal is to reduce its size while maintaining performance so that it might be used in resource-constrained applications such as embedded systems.

The base model was trained for 10 epochs, but the best model (the one with the lowest validation loss) was saved after epoch 9.

On the test set, it achieved the following: Test Loss: 0.1145, Test Accuracy: 96.50%

Different techniques will now be adopted to maintain classification performance while improving speed and reducing the model's size in memory.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Restore the trained base model. map_location remaps the stored tensors onto the
# active device, so the load also works on a CPU-only runtime when the checkpoint
# was written from a CUDA device.
model = PlantClassifier().to(device)
model.load_state_dict(torch.load("models/best_model.pth", map_location=device))

<All keys matched successfully>

In [None]:
# Parameter count and on-disk checkpoint size (in MB) of the unmodified base model.
base_model_nparams = sum(tensor.numel() for tensor in model.parameters())
base_model_size = os.path.getsize('models/best_model.pth')
base_model_size /= 1024 * 1024  # bytes -> MB
print("--Best(base) model--", "\n")
print(f"Number of params: {base_model_nparams}, size in MBs: {base_model_size}")

--Best(base) model-- 

Number of params: 2109351, size in MBs: 8.06987476348877


In [None]:
# Print the overall parameter total, then the element count of every named parameter tensor.
total_params = sum(tensor.numel() for tensor in model.parameters())
print(f'Total parameters: {total_params}')
for tensor_name, tensor in model.named_parameters():
    print(tensor_name, tensor.numel())

Total parameters: 2109351
features.0.conv0.weight 432
features.0.conv0.bias 16
features.0.bn0.weight 16
features.0.bn0.bias 16
features.1.conv1.weight 4608
features.1.conv1.bias 32
features.1.bn1.weight 32
features.1.bn1.bias 32
features.2.conv2.weight 18432
features.2.conv2.bias 64
features.2.bn2.weight 64
features.2.bn2.bias 64
features.3.conv3.weight 73728
features.3.conv3.bias 128
features.3.bn3.weight 128
features.3.bn3.bias 128
features.4.conv4.weight 294912
features.4.conv4.bias 256
features.4.bn4.weight 256
features.4.bn4.bias 256
features.5.conv5.weight 1179648
features.5.conv5.bias 512
features.5.bn5.weight 512
features.5.bn5.bias 512
classifier.1.weight 524288
classifier.1.bias 256
classifier.4.weight 9984
classifier.4.bias 39


In [None]:
# Collect every convolutional and fully-connected layer of the model.
model_layers = [m for m in model.modules() if isinstance(m, (nn.Conv2d, nn.Linear))]
print("Number of layers: "+str(len(model_layers)))

Number of layers: 8


### Global Unstructured Pruning

First, an analysis on results of pruning will be done.

Pruning does not inherently reduce the model file size; instead, it applies a mask over the weights that turns some of them off by setting them to 0. Inference speed can be improved as a result.

The pruning technique that will be used is "Global Pruning", and it will be done in an unstructured way, that is to say that individual weights will be turned off throughout all layers of the network.

Pruning will be omitted for biases

In [None]:
# Report the share of exactly-zero entries in each layer's weight (and bias) before pruning.
print("BEFORE GLOBAL PRUNING:\n")

for layer in model_layers:
    weight_zero_count = float(torch.sum(layer.weight == 0))
    print(f"Layer: {layer}")
    print(f"  Weights - sparsity: {100. * weight_zero_count / float(layer.weight.nelement()):.2f}%")
    if layer.bias is None:
        continue
    bias_zero_count = float(torch.sum(layer.bias == 0))
    print(f"  Bias   - sparsity: {100. * bias_zero_count / float(layer.bias.nelement()):.2f}%")

BEFORE GLOBAL PRUNING:

Layer: Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 0.00%
  Bias   - sparsity: 0.00%
Layer: Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 0.00%
  Bias   - sparsity: 0.00%
Layer: Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 0.00%
  Bias   - sparsity: 0.00%
Layer: Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 0.00%
  Bias   - sparsity: 0.00%
Layer: Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 0.00%
  Bias   - sparsity: 0.00%
Layer: Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 0.00%
  Bias   - sparsity: 0.00%
Layer: Linear(in_features=2048, out_features=256, bias=True)
  Weights - sparsity: 0.00%
  Bias   - sparsity: 0.00%
Layer: Linear(in_features=256, out_features=39, bias=True)
  Weights - sparsity: 0.00%
  Bias 

The model is currently fully dense. Setting weights to 0 will make it more sparse.

As a goal, having around 1M params while still keeping the model's performance can be a satisfying achievement, so we will set the proportion of parameters to prune to 50% as the model has currently around 2M params.

In [None]:
# Prune only the `weight` tensor of each conv/linear layer (biases are left alone).
# The L1 criterion is applied across all listed tensors at once, removing 50% of them overall.
parameters_to_prune = []
for layer in model_layers:
    parameters_to_prune.append((layer, 'weight'))
prune.global_unstructured(parameters_to_prune, pruning_method=prune.L1Unstructured, amount=0.5)

In [None]:
# Per-layer weight sparsity after global unstructured pruning.
for layer in model_layers:
    zero_count = float(torch.sum(layer.weight == 0))
    weight_count = float(layer.weight.nelement())
    print(f"Layer: {layer}")
    print(f"  Weights - sparsity: {100. * zero_count / weight_count:.2f}%")

Layer: Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 21.30%
Layer: Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 37.00%
Layer: Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 40.00%
Layer: Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 41.35%
Layer: Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 40.84%
Layer: Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 39.80%
Layer: Linear(in_features=2048, out_features=256, bias=True)
  Weights - sparsity: 79.53%
Layer: Linear(in_features=256, out_features=39, bias=True)
  Weights - sparsity: 64.24%


It's noticeable that the layers most affected (on the basis of the L1 criterion, i.e. absolute value) are the final fully-connected ones. This is good, because the first fully-connected layer is one of the heaviest parts of the model, along with the `conv5` convolution.

The middle conv layers are also more affected than the initial conv layers; this might be because, by the time the input reaches them, the feature maps are smaller, so each filter element has less information to absorb.

In [None]:
# Make the pruning permanent by removing the pruning re-parametrization from every pruned tensor.
for pruned_module, pruned_param_name in parameters_to_prune:
    prune.remove(pruned_module, pruned_param_name)

In [None]:
# Count the surviving (non-zero) weights across the conv/linear layers.
n_nonzero_params = 0
for layer in model_layers:
    n_nonzero_params += int((layer.weight != 0).sum())
print(f'Number of non-zero parameters after pruning: {n_nonzero_params}')

Number of non-zero parameters after pruning: 1053016


Now, immediate performance changes can be analyzed by using this model on the test set

In [None]:
### From previous notebooks, splits done with same seeds...###

# Stratified split into train (80%), val (10%), test (10%)
from sklearn.model_selection import train_test_split

labels = [sample[1] for sample in dataset.samples]

#80% train, 20% temp
train_indices, temp_indices = train_test_split(
    range(len(dataset)),
    test_size=0.2,
    stratify=labels,
    random_state=42
)

# split the 20% into 50-50 for val (10%) and test (10%)
temp_labels = [labels[i] for i in temp_indices]
val_indices, test_indices = train_test_split(
    temp_indices,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42
)

train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.RandomRotation(15),
    transforms.RandomVerticalFlip(p=0.25),
    transforms.RandomHorizontalFlip(p=0.25)
])
val_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor()
])

# A Subset only stores indices and shares the wrapped dataset object. Assigning
# `subset.dataset.transform` on three subsets of the same ImageFolder therefore leaves
# all of them with whichever transform was assigned last. The augmented train split
# gets its own ImageFolder; val and test keep sharing `dataset` with the eval transform.
dataset.transform = val_transform
val_dataset = data.Subset(dataset, val_indices)
test_dataset = data.Subset(dataset, test_indices)
train_dataset = data.Subset(
    ImageFolder(root=dataset_filename, transform=train_transform),
    train_indices
)

In [None]:
# Test batches are served in a fixed order (no shuffling).
batch_size = 32
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Number of images in the test split.
len(test_dataset)

5545

In [None]:
# Loss used for both evaluation and fine-tuning; it is applied directly to the model outputs.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Evaluate the freshly pruned model on the test split and time the whole pass.
test_accuracies, test_losses, batch_sizes = [], [], []
model.eval()
with torch.no_grad():
    test_time_start = time.time()
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        hits = outputs.argmax(dim=1) == labels
        test_losses.append(loss_fn(outputs, labels).item())
        test_accuracies.append(hits.float().mean().item())
        batch_sizes.append(len(labels))

pruned_model_test_time = time.time() - test_time_start
print(f"Pruned model test time: {pruned_model_test_time:.2f} seconds")
# Per-batch means are re-weighted by batch size so a smaller last batch does not skew the averages.
test_accuracies_weighted = [acc * size for acc, size in zip(test_accuracies, batch_sizes)]
test_losses_weighted = [loss * size for loss, size in zip(test_losses, batch_sizes)]
print(f'Test Loss: {sum(test_losses_weighted)/sum(batch_sizes):.4f}, Test Accuracy: {sum(test_accuracies_weighted)/sum(batch_sizes)*100:.2f}%')

Pruned model test time: 63.33 seconds
Test Loss: 0.2371, Test Accuracy: 92.84%


The model's accuracy drops by almost 4 percentage points (96.50% → 92.84%), while the loss on the test set doubles (0.1145 → 0.2371).

While this still isn't too bad on 39 classes, it can be improved by fine-tuning the new architecture.

In [None]:
#torch.save(model.state_dict(), './models/pruned_model.pth')

In [None]:
# Reload the pruned weights into a fresh classifier. map_location keeps the load
# working on CPU-only runtimes even if the checkpoint was saved from a CUDA device.
pruned_model = PlantClassifier().to(device)
pruned_model.load_state_dict(torch.load("models/pruned_model.pth", map_location=device))

<All keys matched successfully>

In [None]:
# File size of the pruned checkpoint; zeroed weights are still stored, so it matches the base model's.
pruned_model_size = os.path.getsize('models/pruned_model.pth')
pruned_model_size /= 1024 * 1024  # bytes -> MB
print(f'Pruned model size in MBs: {pruned_model_size}')

Pruned model size in MBs: 8.06916332244873


In [None]:
# Fine-tuning hyper-parameters; Adam optimizes every parameter of the pruned model.
num_ft_epochs = 2
learning_rate = 0.001
optimizer = torch.optim.Adam(pruned_model.parameters(), lr=learning_rate)

In [None]:
# Validation loader with shuffling enabled; the cells below use it for fine-tuning and for calibration.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Fine-tune the pruned model while keeping the pruned (zero) weights frozen.
pruned_model_layers = [m for m in pruned_model.modules() if isinstance(m, (nn.Conv2d, nn.Linear))]


def _zeroing_hook(keep_mask):
    """Return a gradient hook that multiplies the gradient by the float version of `keep_mask`."""
    def hook(grad):
        return grad * keep_mask.float()
    return hook


for layer in pruned_model_layers:
    # True where a weight survived pruning; gradients reaching zeroed weights get zeroed too.
    mask = layer.weight.data != 0
    layer.weight.register_hook(_zeroing_hook(mask))

for epoch in range(num_ft_epochs):
    pruned_model.train()
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = pruned_model(images)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
    # The reported value is the loss of the last batch of the epoch.
    print(f'Finetune Epoch [{epoch+1}/{num_ft_epochs}], Loss: {loss.item():.4f}')

Finetune Epoch [1/2], Loss: 0.0720
Finetune Epoch [2/2], Loss: 0.0125


**Note**: fine-tuning on the *train* set instead of the validation set would probably have been the more responsible choice here...

In [None]:
# Verify that fine-tuning left the per-layer sparsity untouched.
for layer in pruned_model_layers:
    zero_count = float(torch.sum(layer.weight == 0))
    weight_count = float(layer.weight.nelement())
    print(f"Layer: {layer}")
    print(f"  Weights - sparsity: {100. * zero_count / weight_count:.2f}%")

Layer: Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 21.30%
Layer: Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 37.00%
Layer: Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 40.00%
Layer: Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 41.35%
Layer: Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 40.84%
Layer: Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  Weights - sparsity: 39.80%
Layer: Linear(in_features=2048, out_features=256, bias=True)
  Weights - sparsity: 79.53%
Layer: Linear(in_features=256, out_features=39, bias=True)
  Weights - sparsity: 64.24%


In [None]:
# The non-zero weight count should equal the one measured right after pruning.
n_nonzero_params_after_ft = 0
for layer in pruned_model_layers:
    n_nonzero_params_after_ft += int((layer.weight != 0).sum())
print(f'Number of non-zero parameters after finetuning: {n_nonzero_params_after_ft}')

Number of non-zero parameters after finetuning: 1053016


In [None]:
# Repeat the test-set evaluation, this time with the fine-tuned pruned model.
test_accuracies, test_losses, batch_sizes = [], [], []
pruned_model.eval()
with torch.no_grad():
    test_time_start = time.time()
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = pruned_model(images)
        batch_loss = loss_fn(outputs, labels)
        batch_accuracy = (outputs.argmax(dim=1) == labels).float().mean()
        test_losses.append(batch_loss.item())
        test_accuracies.append(batch_accuracy.item())
        batch_sizes.append(len(labels))
pruned_model_test_time = time.time() - test_time_start
print(f"Pruned model test time: {pruned_model_test_time:.2f} seconds")
# Weight each batch mean by its batch size before averaging over the whole split.
test_accuracies_weighted = [acc * size for acc, size in zip(test_accuracies, batch_sizes)]
test_losses_weighted = [loss * size for loss, size in zip(test_losses, batch_sizes)]
print(f'Test Loss: {sum(test_losses_weighted)/sum(batch_sizes):.4f}, Test Accuracy: {sum(test_accuracies_weighted)/sum(batch_sizes)*100:.2f}%')

Pruned model test time: 17.28 seconds
Test Loss: 0.0666, Test Accuracy: 97.69%


Reasonably, fine-tuning the pruned model for 2 epochs on the validation set results in slightly better test-set performance than the base model, in both loss (0.0666 vs. 0.1145) and accuracy (97.69% vs. 96.50%).

In [None]:
#torch.save(pruned_model.state_dict(), './models/pruned_finetuned_model.pth')

### Quantization

Quantization can help reduce model file size by going from full-precision weights (float32) to 8-bit integers. This will be done both on the best base model and on the pruned one, in order to compare performance and speed.

In [None]:
q_model = QuantizedPlantClassifier() # this is just plant classifier with added quantstubs
# q_model stays on the CPU, so load the checkpoint onto the CPU as well; this also
# avoids a failure on CPU-only runtimes when the checkpoint holds CUDA tensors.
q_model.load_state_dict(torch.load("models/best_model.pth", map_location="cpu"))

<All keys matched successfully>

In [None]:
# Display the module tree of the float model before fusion and quantization.
q_model

QuantizedPlantClassifier(
  (quant): QuantStub()
  (features): Sequential(
    (0): Sequential(
      (conv0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
      (bn0): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu2): ReLU(inplace=True)
      (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )

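The quantization technique applied will be static quantization, and the target dtypes will be 8-bit integers (with the default fbgemm configuration, `quint8` for activations and `qint8` for weights); doing this in PyTorch allows for quantizing conv layers as well.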

Static quantization involves a calibration step where representative data is run through the network. The layer weights can be scaled into the int8 range (-128 to 127) simply by finding a scale factor (divide the maximum absolute weight value by 127), so this part needs nothing beyond the pretrained model's weights.

Static quantization, however, also requires a calibration step, since a scale for the activation values cannot be derived from the weights alone (activations depend on the input data).

In [None]:
# Show which quantized backend engine is currently active.
print(torch.backends.quantized.engine)

fbgemm


Quantized models raised a few errors in PyTorch. For this reason, `torch.backends.quantized.engine` is explicitly set to `fbgemm`.

In [None]:
# Select the fbgemm quantized engine explicitly.
# NOTE(review): the original remark "on x86" presumably refers to the 'x86' engine; confirm.
torch.backends.quantized.engine = 'fbgemm' #encountered issues on x86

In [None]:
# Put the model in eval mode before the fusion, preparation and calibration steps.
q_model.eval()

QuantizedPlantClassifier(
  (quant): QuantStub()
  (features): Sequential(
    (0): Sequential(
      (conv0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
      (bn0): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu2): ReLU(inplace=True)
      (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )

In [None]:
# fuse modules
def fuse_model(model):
    """Fuse every Conv2d/Linear that is directly followed by a ReLU, in place.

    Each ``nn.Sequential`` inside ``model`` is scanned pairwise. Matching pairs are
    registered under the children's real qualified names, so containers with named
    children (e.g. ``conv0``/``relu0``) and a top-level ``nn.Sequential`` are handled
    as well as index-named ones.

    Only directly adjacent pairs are matched: a ``Conv2d -> BatchNorm2d -> ReLU``
    chain is left unfused.

    Args:
        model: the float module to fuse (modified in place).

    Returns:
        The same ``model`` object.
    """
    fusable_types = (nn.Conv2d, nn.Linear)
    fusion_list = []

    for name, module in model.named_modules():
        if not isinstance(module, nn.Sequential):
            continue
        # The root module is reported with an empty name and must not get a leading dot.
        prefix = f"{name}." if name else ""
        children = list(module.named_children())
        for (first_name, first), (second_name, second) in zip(children, children[1:]):
            if isinstance(first, fusable_types) and isinstance(second, nn.ReLU):
                fusion_list.append([prefix + first_name, prefix + second_name])

    # fuse_modules cannot handle an empty list, so skip the call when nothing matched.
    if fusion_list:
        torch.quantization.fuse_modules(model, fusion_list, inplace=True)
    return model

In [None]:
# Fuse the eligible layer pairs of the float model before preparing it for quantization.
q_model = fuse_model(q_model)

In [None]:
# Attaching observer modules, activation value mins and maxes will be recorded
# Choose the default fbgemm quantization configuration. This only stores the config;
# the observers that record activation ranges are inserted by the prepare() call that follows.
q_model.qconfig = torch.quantization.get_default_qconfig("fbgemm")

In [None]:
# Insert the observers declared by the qconfig (in place); the observers start out empty.
torch.quantization.prepare(q_model, inplace=True)



QuantizedPlantClassifier(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
  )
  (features): Sequential(
    (0): Sequential(
      (conv0): Conv2d(
        3, 16, kernel_size=(3, 3), stride=(1, 1)
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (bn0): BatchNorm2d(
        16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): Conv2d(
        16, 32, kernel_size=(3, 3), stride=(1, 1)
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (bn1): BatchNorm2d(
        32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (activation_post_process): HistogramObserver(min_val=in

In [None]:
# Calibration pass over the validation split (best_model was never trained on it):
# run the images through the prepared model so its observers can record activation ranges.
with torch.no_grad():
    for images, labels in val_loader:
        # Stay on the CPU: quantized models do not run on the GPU. The model is not
        # quantized yet at this point, but this avoids some errors.
        cpu_images = images.to("cpu")
        q_model(cpu_images)

**Note**: calibration should be done on data that is representative of the test data. The *validation* set was chosen because the model had already been trained on the train data, and the test data is reserved for evaluation.

In [None]:
# Replace the observed float modules with their quantized counterparts, using the calibrated ranges.
torch.quantization.convert(q_model, inplace=True) #now model is quantized

QuantizedPlantClassifier(
  (quant): Quantize(scale=tensor([0.0079]), zero_point=tensor([0]), dtype=torch.quint8)
  (features): Sequential(
    (0): Sequential(
      (conv0): QuantizedConv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), scale=0.040684640407562256, zero_point=48)
      (bn0): QuantizedBatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): QuantizedConv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), scale=0.3340418040752411, zero_point=75)
      (bn1): QuantizedBatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (conv2): QuantizedConv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.439125657081604, zer

As can be seen above, the layers have been converted to their quantized counterparts and now operate on 8-bit integer tensors throughout (e.g. `torch.quint8` activations).

In [None]:
#torch.save(q_model.state_dict(), './models/_fbgemm_quantized_model_dict.pth')

In [None]:
# On-disk size of the quantized state dict, converted from bytes to MB.
quantized_model_size = os.path.getsize('models/_fbgemm_quantized_model_dict.pth')
quantized_model_size /= 1024 * 1024
print(f'Quantized model size in MBs: {quantized_model_size}')

Quantized model size in MBs: 2.077643394470215


The model size was reduced by around 6 MB (from about 8.07 MB to about 2.08 MB).

In [None]:
# Confirm that the quantized engine is still fbgemm after conversion.
print(torch.backends.quantized.engine)

fbgemm


In [None]:
# List the class of every sub-module; layers keep their names but are now quantized versions.
print("--- Weight dtypes after static conversion ---")
for module_path, submodule in q_model.named_modules():
    module_class = type(submodule)
    print(f"{module_path}: {module_class}")

--- Weight dtypes after static conversion ---
: <class 'model.QuantizedPlantClassifier'>
quant: <class 'torch.ao.nn.quantized.modules.Quantize'>
features: <class 'torch.nn.modules.container.Sequential'>
features.0: <class 'torch.nn.modules.container.Sequential'>
features.0.conv0: <class 'torch.ao.nn.quantized.modules.conv.Conv2d'>
features.0.bn0: <class 'torch.ao.nn.quantized.modules.batchnorm.BatchNorm2d'>
features.0.relu0: <class 'torch.nn.modules.activation.ReLU'>
features.0.pool0: <class 'torch.nn.modules.pooling.MaxPool2d'>
features.1: <class 'torch.nn.modules.container.Sequential'>
features.1.conv1: <class 'torch.ao.nn.quantized.modules.conv.Conv2d'>
features.1.bn1: <class 'torch.ao.nn.quantized.modules.batchnorm.BatchNorm2d'>
features.1.relu1: <class 'torch.nn.modules.activation.ReLU'>
features.1.pool1: <class 'torch.nn.modules.pooling.MaxPool2d'>
features.2: <class 'torch.nn.modules.container.Sequential'>
features.2.conv2: <class 'torch.ao.nn.quantized.modules.conv.Conv2d'>
fea

In [None]:
# LOADING QUANTIZED
# Rebuild an empty quantized skeleton with the same fuse -> prepare -> convert pipeline
# that produced the checkpoint, so the saved quantized tensors load into modules of
# the same (fused) type instead of merely matching by key name.
q_model = QuantizedPlantClassifier()
q_model.cpu()
q_model.eval()
q_model = fuse_model(q_model)
q_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(q_model, inplace=True)
# No calibration here: scales and zero-points are placeholders until load_state_dict overwrites them.
torch.quantization.convert(q_model, inplace=True)



QuantizedPlantClassifier(
  (quant): Quantize(scale=tensor([1.]), zero_point=tensor([0]), dtype=torch.quint8)
  (features): Sequential(
    (0): Sequential(
      (conv0): QuantizedConv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0)
      (bn0): QuantizedBatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): QuantizedConv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0)
      (bn1): QuantizedBatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (conv2): QuantizedConv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0)
      (bn2): QuantizedBatchNorm2d(64, eps

In [None]:
# Load the saved quantized state dict into the freshly converted skeleton.
q_model.load_state_dict(torch.load("models/_fbgemm_quantized_model_dict.pth"))

<All keys matched successfully>

In [None]:
# Evaluate the quantized base model on the test split (CPU only) and time the pass.
q_test_accuracies, q_test_losses, q_batch_sizes = [], [], []
with torch.no_grad():
    test_time_start = time.time()
    for images, labels in test_loader:
        images, labels = images.cpu(), labels.cpu()
        outputs = q_model(images)
        correct = outputs.argmax(dim=1) == labels
        q_test_losses.append(loss_fn(outputs, labels).item())
        q_test_accuracies.append(correct.float().mean().item())
        q_batch_sizes.append(len(labels))

quantized_model_test_time = time.time() - test_time_start
print(f"Quantized model test time: {quantized_model_test_time:.2f} seconds")
# Weight each batch mean by its batch size before averaging over the whole split.
q_test_accuracies_weighted = [acc * size for acc, size in zip(q_test_accuracies, q_batch_sizes)]
q_test_losses_weighted = [loss * size for loss, size in zip(q_test_losses, q_batch_sizes)]
print(f'Test Loss: {sum(q_test_losses_weighted)/sum(q_batch_sizes):.4f}, Test Accuracy: {sum(q_test_accuracies_weighted)/sum(q_batch_sizes)*100:.2f}%')

Quantized model test time: 56.59 seconds
Test Loss: 0.1984, Test Accuracy: 93.71%


The reduction in accuracy is not big, just under 3 percentage points, while the loss almost doubles. A final result can be obtained by quantizing the pruned and fine-tuned model. As a reminder, this model was fine-tuned for 2 epochs on the validation dataset.

In [None]:
# Load the pruned and fine-tuned float weights into the quantization-ready architecture.
# The model lives on the CPU, so map the checkpoint there; this also keeps the load
# working on CPU-only runtimes when the checkpoint holds CUDA tensors.
q_pruned_model = QuantizedPlantClassifier()
q_pruned_model.load_state_dict(torch.load("models/pruned_finetuned_model.pth", map_location="cpu"))

<All keys matched successfully>

In [None]:
# Same preparation as for the base model: eval mode, layer fusion, default fbgemm qconfig.
q_pruned_model.eval()
q_pruned_model = fuse_model(q_pruned_model)
q_pruned_model.qconfig = torch.quantization.get_default_qconfig("fbgemm")

In [None]:
# Insert the observers declared by the qconfig (in place).
torch.quantization.prepare(q_pruned_model,inplace=True)

QuantizedPlantClassifier(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
  )
  (features): Sequential(
    (0): Sequential(
      (conv0): Conv2d(
        3, 16, kernel_size=(3, 3), stride=(1, 1)
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (bn0): BatchNorm2d(
        16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): Conv2d(
        16, 32, kernel_size=(3, 3), stride=(1, 1)
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (bn1): BatchNorm2d(
        32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (activation_post_process): HistogramObserver(min_val=in

In [None]:
# Calibration on the validation split: the model was mostly not trained on it
# (2 out of 11 epochs), and the test set cannot be used for this.
with torch.no_grad():
    for images, labels in val_loader:
        cpu_images = images.to("cpu")
        q_pruned_model(cpu_images)

In [None]:
# Convert the calibrated model to its quantized form (in place).
torch.quantization.convert(q_pruned_model, inplace=True)

QuantizedPlantClassifier(
  (quant): Quantize(scale=tensor([0.0079]), zero_point=tensor([0]), dtype=torch.quint8)
  (features): Sequential(
    (0): Sequential(
      (conv0): QuantizedConv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), scale=0.03866381198167801, zero_point=49)
      (bn0): QuantizedBatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): QuantizedConv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), scale=0.32406851649284363, zero_point=77)
      (bn1): QuantizedBatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (conv2): QuantizedConv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.4443858861923218, ze

In [None]:
#torch.save(q_pruned_model.state_dict(), './models/_fbgemm_quantized_pruned_model_dict.pth')

In [None]:
# LOADING QUANTIZED
# Rebuild an empty quantized skeleton with the same fuse -> prepare -> convert pipeline
# that produced the checkpoint, so the saved quantized tensors load into modules of
# the same (fused) type instead of merely matching by key name.
q_pruned_model = QuantizedPlantClassifier()
q_pruned_model.cpu()
q_pruned_model.eval()
q_pruned_model = fuse_model(q_pruned_model)
q_pruned_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(q_pruned_model, inplace=True)
# No calibration here: scales and zero-points are placeholders until load_state_dict overwrites them.
torch.quantization.convert(q_pruned_model, inplace=True)



QuantizedPlantClassifier(
  (quant): Quantize(scale=tensor([1.]), zero_point=tensor([0]), dtype=torch.quint8)
  (features): Sequential(
    (0): Sequential(
      (conv0): QuantizedConv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0)
      (bn0): QuantizedBatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu0): ReLU(inplace=True)
      (pool0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (conv1): QuantizedConv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0)
      (bn1): QuantizedBatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (conv2): QuantizedConv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0)
      (bn2): QuantizedBatchNorm2d(64, eps

In [None]:
# Load the saved quantized-pruned state dict into the freshly converted skeleton.
q_pruned_model.load_state_dict(torch.load("models/_fbgemm_quantized_pruned_model_dict.pth"))

<All keys matched successfully>

In [None]:
# On-disk size of the quantized-pruned state dict, converted from bytes to MB.
quantized_pruned_model_size = os.path.getsize('models/_fbgemm_quantized_pruned_model_dict.pth')
quantized_pruned_model_size /= 1024 * 1024
print(f'Quantized pruned model size in MBs: {quantized_pruned_model_size}')

Quantized pruned model size in MBs: 2.0553064346313477


Reasonably, a similar reduction happens. A storage format that drops the entries of the zeroed parameters would decrease the model file size even further.

In [None]:
# Evaluate the quantized, pruned and fine-tuned model on the test split (CPU only).
qp_test_accuracies, qp_test_losses, qp_batch_sizes = [], [], []
with torch.no_grad():
    test_time_start = time.time()
    for images, labels in test_loader:
        images, labels = images.cpu(), labels.cpu()
        outputs = q_pruned_model(images)
        batch_loss = loss_fn(outputs, labels)
        batch_accuracy = (outputs.argmax(dim=1) == labels).float().mean()
        qp_test_losses.append(batch_loss.item())
        qp_test_accuracies.append(batch_accuracy.item())
        qp_batch_sizes.append(len(labels))
quantized_pruned_model_test_time = time.time() - test_time_start
print(f"Quantized pruned model test time: {quantized_pruned_model_test_time:.2f} seconds")
# Weight each batch mean by its batch size before averaging over the whole split.
qp_test_accuracies_weighted = [acc * size for acc, size in zip(qp_test_accuracies, qp_batch_sizes)]
qp_test_losses_weighted = [loss * size for loss, size in zip(qp_test_losses, qp_batch_sizes)]
print(f'Test Loss: {sum(qp_test_losses_weighted)/sum(qp_batch_sizes):.4f}, Test Accuracy: {sum(qp_test_accuracies_weighted)/sum(qp_batch_sizes)*100:.2f}%')

Quantized pruned model test time: 57.11 seconds
Test Loss: 0.0938, Test Accuracy: 96.54%


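The quantized-pruned (& fine-tuned) model achieves a lower test loss than the base model (0.0938 vs. 0.1145) and the same level of accuracy (96.54% vs. 96.50%), despite the roughly 75% reduction in model size. Only the non-quantized pruned and fine-tuned model scored better (0.0666 loss, 97.69% accuracy).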

### Speed comparison

Since quantized models bring issues when using the gpu, the speed comparison will be done on the cpu for all models.

In [None]:
# averaging inference time per image over one batch of real test images
def test_inference_time(model, input_size=1000, dataset=None):
    """Return the average CPU inference time per image, in seconds.

    One batch of up to ``input_size`` images is fetched first. Only the forward pass
    over that batch is timed, so image loading and preprocessing do not inflate the
    measurement.

    Args:
        model: module to benchmark; it is moved to the CPU and put in eval mode (in place).
        input_size: requested number of images in the timed batch.
        dataset: dataset yielding ``(image, label)`` pairs; defaults to the notebook's
            global ``test_dataset``.

    Returns:
        Forward-pass time divided by the number of images actually in the batch.

    Raises:
        ValueError: if the dataset yields no batch.
    """
    if dataset is None:
        dataset = test_dataset
    loader = DataLoader(
        dataset,
        batch_size=input_size,
        shuffle=False
    )
    model.cpu()
    model.eval()

    # Fetch the batch before starting the clock: data loading is not inference.
    batch = next(iter(loader), None)
    if batch is None:
        raise ValueError("cannot time inference on an empty dataset")
    images = batch[0]

    start_time = time.perf_counter()
    with torch.no_grad():
        model(images)
    total_time = time.perf_counter() - start_time
    # Divide by the real batch length; it can be smaller than input_size.
    return total_time / len(images)

In [None]:
# Per-image inference time of each model variant, filled in by the cells below.
model_speeds = dict.fromkeys(("base", "pruned", "quantized", "quantized_pruned"), 0.0)

Running cells independently to measure speed

In [None]:
# Note: test_inference_time moves the model it receives to the CPU in place.
model_speeds["base"] = test_inference_time(model)  # for base model

In [None]:
# Time the pruned and fine-tuned float model (it is moved to the CPU by the call).
model_speeds["pruned"] = test_inference_time(pruned_model)  # for pruned model

In [None]:
# Time the quantized base model.
model_speeds["quantized"] = test_inference_time(q_model)  # for quantized model

In [None]:
# Time the quantized, pruned and fine-tuned model.
model_speeds["quantized_pruned"] = test_inference_time(q_pruned_model)  # for quantized pruned model

In [None]:
# Order the variants from fastest to slowest and report each time in milliseconds.
sorted_speeds = {
    variant: model_speeds[variant]
    for variant in sorted(model_speeds, key=model_speeds.get)
}
for model_type in sorted_speeds:
    speed = sorted_speeds[model_type]
    print(f"{model_type} model inference time per image: {speed*1000:.9f} ms")

quantized_pruned model inference time per image: 10.065129757 ms
quantized model inference time per image: 13.045636892 ms
pruned model inference time per image: 15.727429628 ms
base model inference time per image: 23.225077152 ms


The quantized models achieve the fastest inference; of the two, the quantized_pruned model also achieves the higher accuracy and lower loss, so it is preferable. The pruned model is also much faster than the base model, so the contribution of pruning is clear as well.