In [5]:
from ultralytics import YOLO
import os, shutil
import numpy as np

from datetime import datetime

In [6]:
from utilities.utils import create_directory
from utilities.utils import find_unstructured_prune_ratio
from utilities.utils import unstructured_prune_model
from utilities.utils import get_subdirs_sorted_by_creation_time
from utilities.utils import copy_file
from utilities.utils import create_directory_if_not_exists
from utilities.utils import is_train_dir
from utilities.utils import has_weights
from utilities.utils import compress_file
from utilities.utils import get_compressed_file_size
from utilities.utils import measure_inference_speeds

## Evaluate the performance of the baseline model

In [5]:
model = YOLO('./keypoint_estimation_model/last.pt')

In [6]:
metrics = model.val(data='config.yaml')

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
YOLOv8n-pose summary (fused): 187 layers, 3268931 parameters, 0 gradients, 9.1 GFLOPs


[34m[1mval: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/val.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 75/75 [03:23<00:00,  2.72s/it]


                   all       1200       1711      0.903      0.846      0.899      0.372      0.867      0.815      0.818       0.38
Speed: 0.4ms preprocess, 165.5ms inference, 0.0ms loss, 0.3ms postprocess per image
Results saved to [1mruns/pose/val[0m


In [7]:
metrics.pose.map, metrics.pose.map50

(0.3799101581327904, 0.8180010357036621)

In [8]:
import torch
from torch.nn.utils import prune

In [10]:
model

YOLO(
  (model): PoseModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (m): ModuleList(
          (0): Bottleneck(
            (cv1): Conv(
              (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              (act): SiLU(inplace=True)
            )
            (cv2): Conv(
              (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              (act): SiLU(inplace=True)
        

In [None]:
for name, module in model.named_modules():
    print(name)

## Unstructured pruning

<h4> We'll implement unstructured pruning and then fine-tune the model for 1 epoch to recover any losses in accuracy while maintaing almost same pruning ratio</h4>

In [22]:
# Load your model
model_2 = YOLO('./keypoint_estimation_model/last.pt')

# I have taken higher prune ratio in later layers because later convolutional layers extract more abstract features, so if an weights of an abstract
# feature map are close to 0, I prune them, also certain layers have higher number of input feature maps like 256, 128 compared to other layers
# so i prune them more(like the 1st, 2nd module)
pruning_ratios = [0.30, 0.25, 0.10, 0.15, 0.30, 0.20, 0.25, 0.35, 0.25, 0.40, 0.30]

In [23]:
## Let's first try fine-tuning after pruning
is_Pruned = False
if not is_Pruned:
    model_2 = unstructured_prune_model(model_2, pruning_ratios)
    is_Pruned = True
else:
    print("[+] Model already pruned!")

[+] Pruning completed


<h5> Examine the initial pruning percentage</h5>

In [24]:
total_params, pruned_params, prune_ratio = find_unstructured_prune_ratio(model_2)
total_params, pruned_params, prune_ratio * 100

(3263104, 727606, 22.297971501980935)

In [25]:
for module in model_2.modules(): # Remove the pruning information
    if isinstance(module, torch.nn.Conv2d) and hasattr(module, "weight_orig"):
        module.weight.data = module.weight_orig.data
        del module._forward_pre_hooks['weight']

In [18]:
val_metrics = model_2.val(data='config.yaml')
print("[+] mAP50: ", val_metrics.pose.map50, " mAP50-95: ", val_metrics.pose.map)

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
YOLOv8n-pose summary (fused): 187 layers, 3268931 parameters, 0 gradients, 9.1 GFLOPs


[34m[1mval: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/val.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 75/75 [03:20<00:00,  2.68s/it]


                   all       1200       1711      0.898      0.834      0.893      0.365      0.858      0.782      0.797      0.344
Speed: 0.4ms preprocess, 163.5ms inference, 0.0ms loss, 0.3ms postprocess per image
Results saved to [1mruns/pose/val2[0m
[+] mAP50:  0.7967361492523993  mAP50-95:  0.34383602776180106


<h4> We can see that we mAP50 has dropped from 81.8% to 79.7% and mAP50-95 has dropped from 38% to 34.4% </h4>
<h5> Let's recover it by fine-tuning the pruned model for 1 epoch, we make sure that we keep the learning rate low so that the pruned model can adjust the parameters smoothly, here we take SGD optimizer with initial lr0=1e-7 and lr_final=1e-7 * 1e-6</h5>

<h5> This is done to recover the lost accuracy lost due to pruning</h5>

In [26]:
model_2.train(data='config.yaml', epochs=1, batch=64, optimizer='SGD', lr0=1e-7, lrf=1e-6, momentum=0.005, imgsz=(640, 480), cache=True, dropout=0, cos_lr=True, close_mosaic=1,
                    freeze=21, mosaic=0.0, hsv_h=0.02, hsv_s=0.5, hsv_v=0.4, translate=0.0, scale=0.0, degrees=0)

New https://pypi.org/project/ultralytics/8.1.47 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=pose, mode=train, model=./keypoint_estimation_model/last.pt, data=config.yaml, epochs=1, time=None, patience=100, batch=64, imgsz=(640, 480), save=True, save_period=-1, cache=True, device=None, workers=8, project=None, name=train2, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=True, close_mosaic=1, resume=False, amp=True, fraction=1.0, profile=False, freeze=21, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, sa

[34m[1mtrain: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/train.cache... 2400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2400/2400 [00:00<?, ?it/s]
[34m[1mtrain: [0mCaching images (2.1GB True): 100%|██████████| 2400/2400 [00:00<00:00, 2996.37it/s]

[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01), CLAHE(p=0.01, clip_limit=(1, 4.0), tile_grid_size=(8, 8))



[34m[1mval: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/val.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
[34m[1mval: [0mCaching images (1.0GB True): 100%|██████████| 1200/1200 [00:00<00:00, 2389.13it/s]


Plotting labels to runs/pose/train2/labels.jpg... 
[34m[1moptimizer:[0m SGD(lr=1e-07, momentum=0.005) with parameter groups 63 weight(decay=0.0), 73 weight(decay=0.0005), 72 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/pose/train2[0m
Starting training for 1 epochs...
Closing dataloader mosaic
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01), CLAHE(p=0.01, clip_limit=(1, 4.0), tile_grid_size=(8, 8))

      Epoch    GPU_mem   box_loss  pose_loss  kobj_loss   cls_loss   dfl_loss  Instances       Size


        1/1         0G      2.139      3.109     0.1287     0.8214      2.866         47        640: 100%|██████████| 38/38 [10:26<00:00, 16.48s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [02:37<00:00, 15.70s/it]


                   all       1200       1711      0.891       0.86      0.901      0.386      0.863      0.802      0.807      0.363

1 epochs completed in 0.219 hours.
Optimizer stripped from runs/pose/train2/weights/last.pt, 6.8MB
Optimizer stripped from runs/pose/train2/weights/best.pt, 6.8MB

Validating runs/pose/train2/weights/best.pt...
Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
YOLOv8n-pose summary (fused): 187 layers, 3268931 parameters, 0 gradients, 9.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [02:26<00:00, 14.70s/it]


                   all       1200       1711       0.89       0.86      0.901      0.386      0.863      0.802      0.807      0.363
Speed: 0.7ms preprocess, 119.6ms inference, 0.0ms loss, 0.3ms postprocess per image
Results saved to [1mruns/pose/train2[0m


ultralytics.utils.metrics.PoseMetrics object with attributes:

ap_class_index: array([0])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x2b220f0d0>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)', 'Precision-Recall(P)', 'F1-Confidence(P)', 'Precision-Confidence(P)', 'Recall-Confidence(P)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.0

In [27]:
val_metrics = model_2.val(data='config.yaml')

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
YOLOv8n-pose summary (fused): 187 layers, 3268931 parameters, 0 gradients, 9.1 GFLOPs


[34m[1mval: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/val.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
[34m[1mval: [0mCaching images (1.0GB True): 100%|██████████| 1200/1200 [00:00<00:00, 3067.32it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 19/19 [02:44<00:00,  8.64s/it]


                   all       1200       1711       0.89       0.86      0.901      0.386      0.863      0.802      0.807      0.363
Speed: 0.9ms preprocess, 133.8ms inference, 0.0ms loss, 0.4ms postprocess per image
Results saved to [1mruns/pose/train22[0m


In [28]:
print("[+] mAP50: ", val_metrics.pose.map50, " mAP50-95: ", val_metrics.pose.map)

[+] mAP50:  0.8065850026094209  mAP50-95:  0.3634773682006332


In [32]:
total_params, pruned_params, prune_ratio = find_unstructured_prune_ratio(model_2)
total_params, pruned_params, prune_ratio * 100

(3263104, 722464, 22.14039148001412)

<h5> We can see we have recovered the mAP50 from 79.7% to 80.7% and mAP50-95 from 34.4% to 36.3% by fine tuning the pruned model and the
pruning percentage has barely dropped from 22.297% to 22.140%</h5>

In [18]:
## The model was automatically saved to train2/runs/pose
## Let's copy it to current location

In [41]:
dir_list = get_subdirs_sorted_by_creation_time('./runs/pose')
dir_list = list(filter(is_train_dir, dir_list))
dir_list = list(filter(has_weights, dir_list))
dir_list

['./runs/pose/train', './runs/pose/train2']

In [42]:
pruned_model_path = dir_list[len(dir_list) - 1] + '/weights/last.pt'
pruned_model_path

'./runs/pose/train2/weights/last.pt'

In [43]:
current_datetime = str(datetime.now())
current_datetime

'2024-04-16 17:00:52.428905'

In [63]:
source_path = pruned_model_path
model_name = 'pruned_model_' + current_datetime + '.pt'
destination_path = './pruned_models/' + model_name
source_path, destination_path

('./runs/pose/train2/weights/last.pt',
 './pruned_models/pruned_model_2024-04-16 17:00:52.428905.pt')

In [64]:
copy_file(source_path, destination_path)

File copied successfully from ./runs/pose/train2/weights/last.pt to ./pruned_models/pruned_model_2024-04-16 17:00:52.428905.pt


In [47]:
import gzip

<h5> Due to pruning the pruned model can be compressed more effectively by compression algorithms like gzip </h5>
<h5> Let's compare the sizes of the compressed file of original and pruned model</h5>

In [53]:
## create a compressed models dir if not exists
dir_path = './compressed_models'
create_directory_if_not_exists(dir_path)

Directory './compressed_models' created successfully.


In [61]:
## create a dir for pruned and quantized models
dir_path = './pruned_models'
create_directory_if_not_exists(dir_path)

Directory './pruned_models' created successfully.


In [62]:
dir_path = './pruned_and_quantized_models'
create_directory_if_not_exists(dir_path)

Directory './pruned_and_quantized_models' created successfully.


In [54]:
compress_file('./keypoint_estimation_model/last.pt', './compressed_models/last.gz')

File compressed successfully: ./compressed_models/last.gz


In [65]:
destination_path

'./pruned_models/pruned_model_2024-04-16 17:00:52.428905.pt'

In [60]:
compress_file(destination_path, f"./compressed_models/{model_name[:-3]}.gz")

File compressed successfully: ./compressed_models/pruned_model_2024-04-16 17:00:52.428905.gz


In [67]:
model_name, destination_path

('pruned_model_2024-04-16 17:00:52.428905.pt',
 './pruned_models/pruned_model_2024-04-16 17:00:52.428905.pt')

In [69]:
original_uncompressed_size, pruned_uncompressed_size = get_compressed_file_size('./keypoint_estimation_model/last.pt'), get_compressed_file_size(f'./pruned_models/{model_name}')
original_compressed_size, pruned_compressed_size = get_compressed_file_size('./compressed_models/last.gz'), get_compressed_file_size(f"./compressed_models/{model_name[:-3]}.gz")
original_uncompressed_size, original_compressed_size, pruned_uncompressed_size, pruned_compressed_size

Size of compressed file './keypoint_estimation_model/last.pt': 6783042 bytes
Size of compressed file './pruned_models/pruned_model_2024-04-16 17:00:52.428905.pt': 6778370 bytes
Size of compressed file './compressed_models/last.gz': 6161944 bytes
Size of compressed file './compressed_models/pruned_model_2024-04-16 17:00:52.428905.gz': 5390461 bytes


(6783042, 6161944, 6778370, 5390461)

In [70]:
compression_ratio_original = (original_compressed_size / original_uncompressed_size) * 100
compression_ratio_pruned = (pruned_compressed_size / pruned_uncompressed_size) * 100
compression_ratio_original, compression_ratio_pruned

(90.84337086516639, 79.52444319209485)

In [71]:
reduction_in_size = ((original_compressed_size - pruned_compressed_size) / original_compressed_size) * 100
reduction_in_size

12.520123519460741

<h5> We can see that the original file has compression ratio of only 90.8% while the pruned model has 79.5% compression ratio. 
Also the pruned_compressed file is 12.5% smaller in size than the original_compressed_file </h5>
<h5> All this helps to reduce the download size of model as well as reduce latency when model is transmitted over network</h5>

In [72]:
## Let's compare the inference speed of the original model and pruned model
## We will run inference on 100 images and take the average inference speed for comparision

In [73]:
model_name

'pruned_model_2024-04-16 17:00:52.428905.pt'

In [74]:
model = YOLO('./keypoint_estimation_model/last.pt')
pruned_model = YOLO(f"./pruned_models/{model_name}")

In [76]:
dir_path = '../Project2_Analysis_V_1.6/data/processed/images/val'
entries = os.listdir(dir_path)
random_choices = np.random.randint(low=0, high=len(entries) - 1, size=1000)

<h5>Let's check if we achieved any gains in speed by pruning the model<br>
We do this by comparing the avg indference time for 1000 images by both original and pruned model
</h5>

In [77]:
avg_original_speed = measure_inference_speeds(model, entries, random_choices, dir_path)


image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/051093058.jpg: 480x640 1 human, 70.8ms
Speed: 1.4ms preprocess, 70.8ms inference, 2.4ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/098359730.jpg: 480x640 1 human, 61.6ms
Speed: 0.7ms preprocess, 61.6ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/002634733.jpg: 480x640 1 human, 67.6ms
Speed: 0.6ms preprocess, 67.6ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/017313679.jpg: 480x640 2 humans, 60.7ms
Speed: 0.7ms preprocess, 60.7ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsin

In [78]:
avg_pruned_speed = measure_inference_speeds(pruned_model, entries, random_choices, dir_path)


image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/051093058.jpg: 480x640 1 human, 69.5ms
Speed: 1.3ms preprocess, 69.5ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/098359730.jpg: 480x640 1 human, 65.9ms
Speed: 0.7ms preprocess, 65.9ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/002634733.jpg: 480x640 1 human, 62.7ms
Speed: 0.8ms preprocess, 62.7ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation_0/../Project2_Analysis_V_1.6/data/processed/images/val/017313679.jpg: 480x640 2 humans, 62.9ms
Speed: 0.9ms preprocess, 62.9ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/adsin

In [79]:
print("[+] Original model avg inference time: ", avg_original_speed, " pruned_model inference time: ", avg_pruned_speed)

[+] Original model avg inference time:  [67.3525903224945]  pruned_model inference time:  [67.78613018989563]


It looks like even with pruning we did not achieve any computational efficieny as both original and pruned model have same avg_inference time(in ms). This happened maybe because we need hardware that can take advantage of the sprase matrix multiplication. This test was done on images of dimensions (480, 640)

## Quantisation

We will implement dynamic quantisation by converting the '.pt' model to '.onnx' first and then quantising the '.onnx' model

In [3]:
from torch.quantization import quantize_dynamic
import torch

In [4]:
original_model = YOLO('./keypoint_estimation_model/last.pt')

In [7]:
pruned_model = YOLO(f'./pruned_models/{model_name}')

In [84]:
original_model.export(format='onnx')

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
YOLOv8n-pose summary (fused): 187 layers, 3268931 parameters, 0 gradients, 9.1 GFLOPs

[34m[1mPyTorch:[0m starting from 'keypoint_estimation_model/last.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 53, 8400) (6.5 MB)

[34m[1mONNX:[0m starting export with onnx 1.13.0 opset 17...
[34m[1mONNX:[0m export success ✅ 0.9s, saved as 'keypoint_estimation_model/last.onnx' (12.8 MB)

Export complete (1.3s)
Results saved to [1m/Users/adsingh/Desktop/model_optimisation_0/keypoint_estimation_model[0m
Predict:         yolo predict task=pose model=keypoint_estimation_model/last.onnx imgsz=640  
Validate:        yolo val task=pose model=keypoint_estimation_model/last.onnx imgsz=640 data=config.yaml  
Visualize:       https://netron.app


'keypoint_estimation_model/last.onnx'

In [85]:
pruned_model.export(format='onnx')

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
YOLOv8n-pose summary (fused): 187 layers, 3268931 parameters, 0 gradients, 9.1 GFLOPs

[34m[1mPyTorch:[0m starting from 'pruned_models/pruned_model_2024-04-16 17:00:52.428905.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 53, 8400) (6.5 MB)

[34m[1mONNX:[0m starting export with onnx 1.13.0 opset 17...
[34m[1mONNX:[0m export success ✅ 1.2s, saved as 'pruned_models/pruned_model_2024-04-16 17:00:52.428905.onnx' (12.8 MB)

Export complete (1.5s)
Results saved to [1m/Users/adsingh/Desktop/model_optimisation_0/pruned_models[0m
Predict:         yolo predict task=pose model=pruned_models/pruned_model_2024-04-16 17:00:52.428905.onnx imgsz=640  
Validate:        yolo val task=pose model=pruned_models/pruned_model_2024-04-16 17:00:52.428905.onnx imgsz=640 data=config.yaml  
Visualize:       https://netron.app


'pruned_models/pruned_model_2024-04-16 17:00:52.428905.onnx'

In [8]:
from onnxruntime.quantization import quantize_dynamic, QuantType

In [16]:
model_name

'pruned_model_2024-04-16 17:00:52.428905.pt'

In [14]:
model_fp32 = f"./pruned_models/{model_name[:-3]}.onnx"
model_int8 = 'quantized.onnx'
model_fp32

'./pruned_models/pruned_model_2024-04-16 17:00:52.428905.onnx'

<h5>Quantization in ONNX Runtime refers to 8 bit linear quantization of an ONNX model.
Onnx uses a zero_value and scale factor for quantization</h5>

<h5> Here we're using dynamic quantisation. Dynamic quantization calculates the quantization parameters (scale and zero point) for activations dynamically. These calculations may increase the cost of inference</h5>

In [15]:
quantize_dynamic(model_fp32, model_int8, weight_type=QuantType.QUInt8)

In [29]:
source_path = 'quantized.onnx'
destination_path = './pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx'
source_path, destination_path

('quantized.onnx',
 './pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx')

In [30]:
copy_file(source_path, destination_path)

File copied successfully from quantized.onnx to ./pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx


In [2]:
def remove_file(file_path):
    if os.path.exists(file_path):
    # Delete the file
        os.remove(file_path)
        print(f"File '{file_path}' deleted successfully.")
    else:
        print(f"File '{file_path}' does not exist.")

In [4]:
# Now remove the pruned and quantized model as we have already copied it to "pruned_and_quantized_models" directory
remove_file('quantized.onnx')

File 'quantized.onnx' does not exist.


In [80]:
import onnxruntime as rt
import onnx

In [82]:
quantized_and_pruned_model_path = './pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx'
quantized_and_pruned_model = onnx.load(quantized_and_pruned_model_path)

In [104]:
# Here data_type = 2 means uint8

print("[+] Printing the layers that were quantised")
print("-------------------------------------------------")

total_count, quantized_count = 0, 0
for initializer in quantized_and_pruned_model.graph.initializer:
    if initializer.data_type == 2: # checking if the data_type of weights is uint8, i.e, if they're quantised
        print("Name:", initializer.name)
        print("Data type:", initializer.data_type)
        quantized_count += 1
    if 'conv' in initializer.name:
        total_count += 1

# Dividing by 2 because as we can see in output "model.0.conv.weight_zero_point" and "model.0.conv.weight_quantized", its just that the first one represents
# the zero points associated with the weights of a convolutional layer, while the second one represents the actual quantized weights of the
# convolutional layer, i.e, both of them represent same layer
total_count = total_count / 2
quantized_count = quantized_count / 2

[+] Printing the layers that were quantised
-------------------------------------------------
Name: model.0.conv.weight_zero_point
Data type: 2
Name: model.0.conv.weight_quantized
Data type: 2
Name: model.1.conv.weight_zero_point
Data type: 2
Name: model.1.conv.weight_quantized
Data type: 2
Name: model.2.cv1.conv.weight_zero_point
Data type: 2
Name: model.2.cv1.conv.weight_quantized
Data type: 2
Name: model.2.m.0.cv1.conv.weight_zero_point
Data type: 2
Name: model.2.m.0.cv1.conv.weight_quantized
Data type: 2
Name: model.2.m.0.cv2.conv.weight_zero_point
Data type: 2
Name: model.2.m.0.cv2.conv.weight_quantized
Data type: 2
Name: model.2.cv2.conv.weight_zero_point
Data type: 2
Name: model.2.cv2.conv.weight_quantized
Data type: 2
Name: model.3.conv.weight_zero_point
Data type: 2
Name: model.3.conv.weight_quantized
Data type: 2
Name: model.4.cv1.conv.weight_zero_point
Data type: 2
Name: model.4.cv1.conv.weight_quantized
Data type: 2
Name: model.4.m.0.cv1.conv.weight_zero_point
Data type: 2


In [105]:
print("[+] Total Conv layers: ", total_count, " quantized layers: ", quantized_count)

[+] Total Conv layers:  159.0  quantized layers:  73.0


In [32]:
model_name

'pruned_model_2024-04-16 17:00:52.428905.pt'

In [37]:
original_model = YOLO('./keypoint_estimation_model/last.onnx', task='pose')
pruned_model = YOLO(f'./pruned_models/{model_name[:-3]}.onnx', task='pose')
quantized_and_pruned_model = YOLO('./pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx', task='pose')

In [38]:
val_metrics = original_model.val(data='config.yaml')

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
Loading keypoint_estimation_model/last.onnx for ONNX Runtime inference...
Forcing batch=1 square inference (1,3,640,640) for non-PyTorch models


[34m[1mval: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/val.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 1200/1200 [01:01<00:00, 19.54it/s]


                   all       1200       1711      0.891       0.87      0.909      0.389      0.861      0.819      0.819      0.393
Speed: 0.7ms preprocess, 46.0ms inference, 0.0ms loss, 0.7ms postprocess per image
Results saved to [1mruns/pose/val5[0m


In [39]:
print("[+] Original model mAP50: ", val_metrics.pose.map50, " mAP50-95: ", val_metrics.pose.map)

[+] Original model mAP50:  0.8187716179495923  mAP50-95:  0.3931364949342392


In [40]:
val_metrics = pruned_model.val(data='config.yaml')

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
Loading pruned_models/pruned_model_2024-04-16 17:00:52.428905.onnx for ONNX Runtime inference...
Forcing batch=1 square inference (1,3,640,640) for non-PyTorch models


[34m[1mval: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/val.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 1200/1200 [01:03<00:00, 18.97it/s]


                   all       1200       1711      0.907      0.864      0.913      0.404      0.861      0.786      0.799      0.371
Speed: 0.7ms preprocess, 47.6ms inference, 0.0ms loss, 0.7ms postprocess per image
Results saved to [1mruns/pose/val6[0m


In [41]:
print("[+] Pruned but not quantized model mAP50: ", val_metrics.pose.map50, " mAP50-95: ", val_metrics.pose.map)

[+] Pruned but not quantized model mAP50:  0.7985162750263713  mAP50-95:  0.3706975042217301


In [42]:
val_metrics = quantized_and_pruned_model.val(data='config.yaml')

Ultralytics YOLOv8.1.27 🚀 Python-3.10.13 torch-2.2.1 CPU (Apple M1 Pro)
Loading pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx for ONNX Runtime inference...
Forcing batch=1 square inference (1,3,640,640) for non-PyTorch models


[34m[1mval: [0mScanning /Users/adsingh/Desktop/Project2_Analysis_V_1.6/data/processed/labels/val.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 1200/1200 [01:35<00:00, 12.54it/s]


                   all       1200       1711      0.904      0.853      0.901      0.401      0.859      0.788      0.794      0.359
Speed: 0.7ms preprocess, 74.6ms inference, 0.0ms loss, 0.6ms postprocess per image
Results saved to [1mruns/pose/val7[0m


In [44]:
print("[+] Pruned and Quantized model mAP50: ", val_metrics.pose.map50, " mAP50-95: ", val_metrics.pose.map)

[+] Pruned and Quantized model mAP50:  0.794315776905753  mAP50-95:  0.3585422048599568


Let's also compare the size of the compressed files of the onnx formats for all three cases: original, pruned, pruned and quantized

In [48]:
compress_file('./keypoint_estimation_model/last.onnx', './compressed_models/last_onnx.gz')

File compressed successfully: ./compressed_models/last_onnx.gz


In [49]:
model_name

'pruned_model_2024-04-16 17:00:52.428905.pt'

In [50]:
compress_file(f'./pruned_models/{model_name[:-3]}.onnx', './compressed_models/pruned_model_onnx.gz')

File compressed successfully: ./compressed_models/pruned_model_onnx.gz


In [52]:
compress_file(f'./pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx', './compressed_models/pruned_quantized_pose_model_onnx.gz')

File compressed successfully: ./compressed_models/pruned_quantized_pose_model_onnx.gz


In [53]:
model_name

'pruned_model_2024-04-16 17:00:52.428905.pt'

In [54]:
orginal_onnx_uncompressed, orginal_onnx_compressed = get_compressed_file_size('./keypoint_estimation_model/last.onnx'), get_compressed_file_size('./compressed_models/last_onnx.gz')
pruned_onnx_uncompressed, pruned_onnx_compressed = get_compressed_file_size(f'./pruned_models/{model_name[:-3]}.onnx'), get_compressed_file_size('./compressed_models/pruned_model_onnx.gz')
pruned_and_quantized_onnx_uncompressed, pruned_and_quantized_onnx_compressed = get_compressed_file_size('./pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx'), get_compressed_file_size('./compressed_models/pruned_quantized_pose_model_onnx.gz')

Size of compressed file './keypoint_estimation_model/last.onnx': 13399984 bytes
Size of compressed file './compressed_models/last_onnx.gz': 11441819 bytes
Size of compressed file './pruned_models/pruned_model_2024-04-16 17:00:52.428905.onnx': 13399984 bytes
Size of compressed file './compressed_models/pruned_model_onnx.gz': 9590449 bytes
Size of compressed file './pruned_and_quantized_models/pruned_and_quantized_pose_estimation_model.onnx': 3732262 bytes
Size of compressed file './compressed_models/pruned_quantized_pose_model_onnx.gz': 2288678 bytes


In [55]:
compression_ratio_original = (orginal_onnx_compressed / orginal_onnx_uncompressed) * 100
compression_ratio_pruned = (pruned_onnx_compressed / pruned_onnx_uncompressed) * 100
compression_ratio_pruned_and_quantized = (pruned_and_quantized_onnx_compressed / pruned_and_quantized_onnx_uncompressed) * 100
compression_ratio_original, compression_ratio_pruned, compression_ratio_pruned_and_quantized

(85.38681090962497, 71.57060038280643, 61.321472072432215)

Let's see the final reduction in size

In [56]:
final_size_reduction_percentage = ((orginal_onnx_compressed - pruned_and_quantized_onnx_compressed) / orginal_onnx_compressed) * 100
final_size_reduction_percentage

79.99725393313773

## Summary of results

<h5> Original onnx model: mAP50-> 81.97, mAP50-95-> 39.31, compression_ratio(onnx format)->85.38%, size(onnx): 11.4MB
<br>
Pruned but not quantized onnx model: mAP50-> 79.8%, mAP50-95-> 37.0%, compression_ratio(onnx format)->71.57%, size(onnx): 9.6MB
<br>
Pruned an also quantized onnx model: mAP50-> 79.4%, mAP50-95-> 35.9%, compression_ratio(onnx format)->61.57%, size(onnx): 2.3MB
<br>
Percentage Reduction in size at last step: 79.99% 
</h5>

In [168]:
dir_path = '../Project2_Analysis_V_1.6/data/processed/images/val'
entries = os.listdir(dir_path)
random_choices = np.random.randint(low=0, high=len(entries) - 1, size=500)

In [170]:
avg_original_speed = measure_inference_speeds(original_model, entries, random_choices, dir_path)

Loading last.onnx for ONNX Runtime inference...

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/005515659.jpg: 640x640 1 human, 132.1ms
Speed: 2.7ms preprocess, 132.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/056573196.jpg: 640x640 1 human, 52.7ms
Speed: 2.9ms preprocess, 52.7ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/034827462.jpg: 640x640 1 human, 51.5ms
Speed: 3.1ms preprocess, 51.5ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/050479560.jpg: 640x640 1 human, 50.6ms
Speed: 2.3ms preprocess, 50.6ms inference, 0.5ms postprocess per image at shape

In [171]:
avg_pruned_and_quantized_speed = measure_inference_speeds(quantized_and_pruned_model, entries, random_choices, dir_path)

Loading quantized.onnx for ONNX Runtime inference...

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/005515659.jpg: 640x640 1 human, 81.3ms
Speed: 1.8ms preprocess, 81.3ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/056573196.jpg: 640x640 1 human, 90.9ms
Speed: 1.5ms preprocess, 90.9ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/034827462.jpg: 640x640 1 human, 87.6ms
Speed: 1.4ms preprocess, 87.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /Users/adsingh/Desktop/model_optimisation/../Project2_Analysis_V_1.6/data/processed/images/val/050479560.jpg: 640x640 1 human, 78.9ms
Speed: 0.9ms preprocess, 78.9ms inference, 0.6ms postprocess per image at sh

In [172]:
avg_original_speed, avg_pruned_and_quantized_speed

([53.63247561454773], [78.730140209198])

This is weird because the inference time(in ms) has actually increased even though we quantized our model. This test was done on (480, 640) dimension images.