In [1]:
import os
import torch
import torch.nn as nn
from datetime import datetime
from obd1.models import yolo_base,ann_new_model_simple
from lava.lib.dl import slayer
from torch.utils.data import DataLoader
from obd1.dataset import evCIVIL,visualize_data
from obd1.boundingbox import metrics,utils
from torch.ao.quantization import QuantStub, DeQuantStub

         Only Prophesee DVS demo will not run properly.
         Please install it from https://github.com/prophesee-ai/prophesee-automotive-dataset-toolbox


In [2]:
def load_model_only_with_necessary_keys_from_state_dict(model,pretained_ckpt_path,is_full_checkpoint = True):

    # Load the state dictionary from a pre-trained checkpoint
    pretrained_full_checkpoint = torch.load(pretained_ckpt_path)
    if is_full_checkpoint:
        pretrained_checkpoint = pretrained_full_checkpoint["model_state_dict"]
    else:
        pretrained_checkpoint = pretrained_full_checkpoint

    # Get the current state dictionary of the model
    model_state_dict = model.state_dict()

    # Create a new state dictionary to load the weights
    updated_state_dict = {}

    # Iterate over the keys in the pre-trained state dictionary
    for key in pretrained_checkpoint.keys():
        if key in model_state_dict and pretrained_checkpoint[key].size() == model_state_dict[key].size():
     
            #if torch.any(pretrained_checkpoint[key] > 4):
            #    print(key) #"     ",pretrained_checkpoint[key])

            updated_state_dict[key] = pretrained_checkpoint[key]
        else:
            print(f"Skipping {key} due to mismatch in size or missing key.")

    all_tensors = torch.cat([v.view(-1) for v in updated_state_dict.values()])

    # Find the maximum value
    max_value = torch.max(all_tensors)

    # Find the minimum value
    min_value = torch.min(all_tensors)
    
    # Update the state dictionary of the model
    model_state_dict.update(updated_state_dict)

    print("max value of the state_dict ",max_value)
    print("min value of the state dict ",min_value)
    print("average value of state dict ",torch.mean(all_tensors))

    # Load the updated state dictionary into the model
    model.load_state_dict(model_state_dict)


    # Verify if the model weights have been loaded correctly
    for key in updated_state_dict.keys():
        assert torch.equal(model.state_dict()[key], updated_state_dict[key]), f"Weights not loaded correctly for {key}"

    print("Model loaded with available pre-trained weights.")

In [3]:
def validate_gradients(model) -> None:
        """Validate gradients of the network and circumvent if any.
        """
        valid_gradients = True
        for name, param in model.named_parameters():
            if param.grad is not None:
                valid_gradients = not (torch.isnan(param.grad).any()
                                       or torch.isinf(param.grad).any())
                if not valid_gradients:
                    break
        if not valid_gradients:
            model.zero_grad()

In [4]:
def _yolo(x: torch.tensor,
          anchors: torch.tensor,
          clamp_max: float = 5.0) -> torch.tensor:
    # converts raw predictions to bounding box predictions.
    _, _, H, W, _, _ = x.shape
    range_y, range_x = torch.meshgrid(
        torch.arange(H, dtype=x.dtype, device=x.device),
        torch.arange(W, dtype=x.dtype, device=x.device),
        indexing='ij',
    )
    anchor_x, anchor_y = anchors[:, 0], anchors[:, 1]

    x_center = (torch.sigmoid(x[:, :, :, :, 0:1, :])
                + range_x[None, None, :, :, None, None]) / W
    y_center = (torch.sigmoid(x[:, :, :, :, 1:2, :])
                + range_y[None, None, :, :, None, None]) / H
    width = (torch.exp(
        x[:, :, :, :, 2:3, :].clamp(
            max=clamp_max)) * anchor_x[None, :, None, None, None, None]) / W
    height = (torch.exp(
        x[:, :, :, :, 3:4, :].clamp(
            max=clamp_max)) * anchor_y[None, :, None, None, None, None]) / H
    confidence = torch.sigmoid(x[:, :, :, :, 4:5, :])
    classes = torch.softmax(x[:, :, :, :, 5:, :], dim=-2)

    x = torch.concat([x_center, y_center, width, height,
                      confidence, classes], dim=-2)

    if torch.isnan(x).any() or torch.isinf(x).any():
        print(f'{torch.isnan(x_center).any()=}')
        print(f'{torch.isinf(x_center).any()=}')
        print(f'{torch.isnan(y_center).any()=}')
        print(f'{torch.isinf(y_center).any()=}')
        print(f'{torch.isnan(width).any()=}')
        print(f'{torch.isinf(width).any()=}')
        print(f'{torch.isnan(height).any()=}')
        print(f'{torch.isinf(height).any()=}')
        raise RuntimeError('Ecountered NaN and Inf!')

    return x  # batch, anchor, height, width, predictions, time 



def yolo(x: torch.tensor, anchors: torch.tensor) -> torch.tensor:
        """Evaluates YOLO bounding box prediction from raw network output.

        Parameters
        ----------
        x : torch.tensor
            Raw prediciton tensor.
        anchors : torch.tensor
            Anchors associated with the prediction head.

        Returns
        -------
        torch.tensor
            Output bounding boxes.
        """
        clamp_max = 5.0
        N, _, _, _, P, T = x.shape
        return _yolo(x, anchors, clamp_max).reshape([N, -1, P, T])

def yolo_raw(x: torch.tensor) -> torch.tensor:
        """Transforms raw YOLO prediction to eventual output order i.e.
        NCHWT order to (batch, num_anchors, num_outputs, height, width, time).

        Parameters
        ----------
        x : torch.tensor
            Raw prediction output of the network.

        Returns
        -------
        torch.tensor
            Transformed raw prediction output for a head.
        """
        num_anchors = 3

        N, _, H, W = x.shape
        return x.reshape(N,
                         num_anchors,
                         -1, H, W).permute(0, 1, 3, 4, 2)
        #return x.reshape(N,num_anchors,-1, H, W, T).permute(0, 1, 3, 4, 2, 5)

        """T, N, _, H, W = x.shape
        return x.reshape(T,N,
                         num_anchors,
                         -1, H, W).permute(1, 2, 4, 5, 3, 0) # now (batch,anchors,h,w,-1,T)"""

In [5]:
dataset_path = '/home/udayanga/latest_dataset/'
train_csv_file = "night_outdoor_and_daytime_train_files_event_based.txt"
test_csv_file = "test_files_event_based.txt"
param_dict = {"TSteps" : 7, "tbins" : 1 ,"quantized_h" : 260 ,"quantized_w" : 346}

In [6]:
in_channels = 2
num_anchors = 3
num_classes = 2
num_output = num_anchors * (5 + num_classes)
tgt_iou_thr = 0.5
conf_thres = 0.1

In [11]:
pretrained_ckpt_path = "./checkpoints/epoch_447_0.3544013564001861.pt"

In [12]:
net_dummy = ann_new_model_simple.Network(in_channels=in_channels,num_classes=2,clamp_max=5.)
#net_dummy.to(torch.device("cuda:0"))
net_dummy.init_model((240,320),2*1)
#full_ckpt = torch.load(pretrained_ckpt_path,map_location=torch.device("cuda:0"))
#net_dummy.load_state_dict(full_ckpt["model_state_dict"])
#load_model_only_with_necessary_keys_from_state_dict(net_dummy,pretrained_ckpt_path,is_full_checkpoint = True)

In [7]:
#net = ann_new_model.Network(in_channels=in_channels,num_classes=2,clamp_max=5.)
#N,C,H,W = 8,2,320,320
#input = torch.rand(N,C, H, W)
#net(input)

In [13]:
yolo_target = yolo_base.YOLOtarget(anchors= net_dummy.anchors,
                                 scales= net_dummy.scale,
                                 num_classes=net_dummy.num_classes,
                                 ignore_iou_thres=tgt_iou_thr)

In [14]:
train_set = evCIVIL.evCIVIL(root = dataset_path ,csv_file_name = train_csv_file , param_dict = param_dict, train = True,augment = True)
test_set = evCIVIL.evCIVIL(root = dataset_path, csv_file_name= test_csv_file, param_dict = param_dict, train = False, augment = False)

train_loader = DataLoader(train_set,
                                batch_size=8,
                                shuffle=True,
                                collate_fn=yolo_target.collate_fn,
                                num_workers=1,
                                pin_memory=True)
    
test_loader = DataLoader(test_set,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=yolo_target.collate_fn,    
                                num_workers=1,
                                pin_memory=True)

In [15]:
visualize_save_path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/visualize_image_save"
conf_thres = 0.76
stats = slayer.utils.LearningStats(accuracy_str='AP@0.5')
ap_stats = metrics.APstats(iou_threshold=0.5)
stats.update()

epoch = 0
t_st = datetime.now()
net_dummy.eval()

with torch.no_grad():
        
        for i, (inputs, targets, bboxes) in enumerate(test_loader):

                #start
                inputs = inputs.permute(4,0,1,2,3)
          
                inputs = inputs.squeeze(0)

                inputs = inputs.to(torch.device('cuda:0'))
                predictions = net_dummy(inputs)

      
                predictions = [prediction.unsqueeze(-1) for prediction in predictions]
                
                #predictions = [torch.sum(prediction,dim=-1).unsqueeze(-1) for prediction in predictions]

                # MAP calculations
                T = 1 #inputs.shape[-1]
                try:
                    predictions = torch.concat([net_dummy.yolo(p, a) for (p, a)
                        in zip(predictions, net_dummy.anchors)],dim=1)
                except RuntimeError:
                    print('Runtime error on MAP predictions calculation.'
                            'continuing')
                    continue

                predictions = [utils.nms(predictions[..., t],conf_threshold = conf_thres)
                                for t in range(T)]
                
                print("predictions shape ",predictions[0][0].shape)
                print("bboxes shape ",bboxes[0][0].shape)
                print("inputs shape ",inputs.shape)

                visualize_data.dump_image_with_labels(inputs.squeeze(0).permute(1,2,0).cpu(),predictions[0][0].cpu().numpy(),(240,320),visualize_save_path,i,create_histo_frames=False)
                for t in range(T):
                    ap_stats.update(predictions[t], bboxes[t])
                    #end

                stats.testing.num_samples += inputs.shape[0]
                stats.testing.correct_samples = ap_stats[:] * stats.testing.num_samples

                processed = i * test_loader.batch_size
                total = len(test_loader.dataset)
                time_elapsed = (datetime.now() - t_st).total_seconds()
                samples_sec = time_elapsed / (i + 1) / test_loader.batch_size
                header_list = [f'Test: [{processed}/{total} '
                                    f'({100.0 * processed / total:.0f}%)]']
                stats.print(epoch, i, samples_sec, header=header_list)
                #functional.reset_net(net)
                break

        print("Loss/train", stats.training.loss,epoch)
        print('mAP@50/train',stats.training.accuracy,epoch)
        print('mAP@50/test', stats.testing.accuracy, epoch)

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [11]:
#### The following is the net that will be quantized.... ####

In [16]:
class QuantNetwork(torch.nn.Module):

    def __init__(self,
                    in_channels = 4,
                    num_output = num_output):
        
        super().__init__()
        
        self.num_output = num_output
        
        self.backend_blocks = torch.nn.ModuleList([
            
            nn.Conv2d(in_channels, 16, kernel_size=3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            
            nn.Conv2d(16, 32, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            #nn.Dropout(0.2),

            nn.Conv2d(32, 32, kernel_size=3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            
            nn.Conv2d(32, 32, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            #nn.Dropout(0.2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        
        ])

        self.head1_backend = torch.nn.ModuleList([

            nn.Conv2d(64,64, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            #nn.Dropout(0.2),

            nn.Conv2d(64,128, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            #nn.Dropout(0.2),
            
        ])

        self.head1_blocks = torch.nn.ModuleList([

            nn.Conv2d(128,128, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            #nn.Dropout(0.2),

            nn.Conv2d(128,self.num_output, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(self.num_output),
            nn.ReLU(),

        ])

        self.quant = QuantStub()
        self.dequant = DeQuantStub()

        """self.backend_blocks[1].synapse.weight.requires_grad = False #pool
        self.backend_blocks[6].synapse.weight.requires_grad = False #pool
        self.head2_backend[2].synapse.weight.requires_grad = False #unpool
        self.head2_backend[3].synapse.weight.requires_grad = False"""
    
    def forward(self,
        input
    ):

        count = []
        """for block in self.input_blocks:
            input = block(input)
            count.append(slayer.utils.event_rate(input))
            
        backend = input"""
        input = self.quant(input)
        backend = input
        for block in self.backend_blocks:
            backend = block(backend)
        
        h1_backend = backend
        for block in self.head1_backend:
            h1_backend = block(h1_backend)
        
        head1 = h1_backend
        for block in self.head1_blocks:
            head1 = block(head1)
            """if has_sparisty_loss and isinstance(block,
                                                slayer.block.sigma_delta.Conv):
                sparsity_monitor.append(head1)"""

            """if has_sparisty_loss and isinstance(block,
                                                slayer.block.sigma_delta.Conv):
                sparsity_monitor.append(head2)"""

        ## Get the summation or averaging over all the time steps, and preserve the last dimension of head1 tensor as time but Time = 1

        head1 = self.dequant(head1)

        return head1
    
    def fuse_model(self,quant_aware_train = False):
        # Specify which layers to fuse
        if quant_aware_train:
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU
            # Pooling layers do not need to be fused
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['7', '8', '9'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['10', '11', '12'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['14', '15', '16'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules_qat(self.head1_backend, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.head1_backend, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules_qat(self.head1_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.head1_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU
        
        else:
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU
            # Pooling layers do not need to be fused
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['7', '8', '9'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['10', '11', '12'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['14', '15', '16'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules(self.head1_backend, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.head1_backend, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules(self.head1_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.head1_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU

In [12]:
import os
os.getcwd()

'/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240'

In [20]:
net = QuantNetwork(in_channels=in_channels,num_output=num_output)
#net.to(torch.device("cuda:0"))
pretrained_ckpt_path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_model/qat_checkpoints/epoch_145_0.3345367681206545.pt" #"./checkpoints/epoch_447_0.3544013564001861.pt"
load_model_only_with_necessary_keys_from_state_dict(net,pretrained_ckpt_path,is_full_checkpoint = True)
net.qconfig = torch.ao.quantization.get_default_qconfig('qnnpack')
net.to(torch.device("cpu"))
net.eval()
net.fuse_model(quant_aware_train = False)
net = torch.ao.quantization.prepare(net)
#N,C,H,W = 1,2,320,320
#input = torch.rand(N,C, H, W)
#net(input)

Skipping backend_blocks.0.bn.weight due to mismatch in size or missing key.
Skipping backend_blocks.0.bn.bias due to mismatch in size or missing key.
Skipping backend_blocks.0.bn.running_mean due to mismatch in size or missing key.
Skipping backend_blocks.0.bn.running_var due to mismatch in size or missing key.
Skipping backend_blocks.0.bn.num_batches_tracked due to mismatch in size or missing key.
Skipping backend_blocks.0.weight_fake_quant.fake_quant_enabled due to mismatch in size or missing key.
Skipping backend_blocks.0.weight_fake_quant.observer_enabled due to mismatch in size or missing key.
Skipping backend_blocks.0.weight_fake_quant.scale due to mismatch in size or missing key.
Skipping backend_blocks.0.weight_fake_quant.zero_point due to mismatch in size or missing key.
Skipping backend_blocks.0.weight_fake_quant.activation_post_process.eps due to mismatch in size or missing key.
Skipping backend_blocks.0.weight_fake_quant.activation_post_process.min_val due to mismatch in si

  pretrained_full_checkpoint = torch.load(pretained_ckpt_path)


In [21]:
#to identify the limits for activation quantization.
with torch.no_grad():
    for i, (inputs, targets, bboxes) in enumerate(train_loader):
        # Pass a batch of images through the model
        inputs = inputs.permute(4,0,1,2,3)
        inputs = inputs.squeeze(0)
        inputs = inputs.to('cpu')
        predictions = net(inputs)

In [22]:
for key in net.state_dict().keys():
    print(key)

backend_blocks.0.0.weight
backend_blocks.0.0.bias
backend_blocks.0.activation_post_process.eps
backend_blocks.0.activation_post_process.histogram
backend_blocks.0.activation_post_process.min_val
backend_blocks.0.activation_post_process.max_val
backend_blocks.3.0.weight
backend_blocks.3.0.bias
backend_blocks.3.activation_post_process.eps
backend_blocks.3.activation_post_process.histogram
backend_blocks.3.activation_post_process.min_val
backend_blocks.3.activation_post_process.max_val
backend_blocks.7.0.weight
backend_blocks.7.0.bias
backend_blocks.7.activation_post_process.eps
backend_blocks.7.activation_post_process.histogram
backend_blocks.7.activation_post_process.min_val
backend_blocks.7.activation_post_process.max_val
backend_blocks.10.0.weight
backend_blocks.10.0.bias
backend_blocks.10.activation_post_process.eps
backend_blocks.10.activation_post_process.histogram
backend_blocks.10.activation_post_process.min_val
backend_blocks.10.activation_post_process.max_val
backend_blocks.14.

In [23]:
model_int8 = torch.ao.quantization.convert(net)
model_int8.eval()

QuantNetwork(
  (backend_blocks): ModuleList(
    (0): QuantizedConvReLU2d(2, 16, kernel_size=(3, 3), stride=(2, 2), scale=1.8660508394241333, zero_point=0, padding=(1, 1))
    (1-2): 2 x Identity()
    (3): QuantizedConvReLU2d(16, 32, kernel_size=(3, 3), stride=(1, 1), scale=17.294429779052734, zero_point=0, padding=(1, 1))
    (4-5): 2 x Identity()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): QuantizedConvReLU2d(32, 32, kernel_size=(3, 3), stride=(2, 2), scale=329.8655090332031, zero_point=0, padding=(1, 1))
    (8-9): 2 x Identity()
    (10): QuantizedConvReLU2d(32, 32, kernel_size=(3, 3), stride=(1, 1), scale=85.06742095947266, zero_point=0, padding=(1, 1))
    (11-12): 2 x Identity()
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): QuantizedConvReLU2d(32, 64, kernel_size=(3, 3), stride=(2, 2), scale=43.28719711303711, zero_point=0, padding=(1, 1))
    (15-16): 2 x Identity()
  )
  (head1_

In [122]:
"""with torch.no_grad():
    for i, (inputs, targets, bboxes) in enumerate(train_loader):
        # Pass a batch of images through the model
        inputs = inputs.permute(4,0,1,2,3)
        inputs = inputs.squeeze(0)
        inputs = inputs.to('cpu')
        predictions = net(inputs)"""

In [13]:
#pretrained_model_path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_model/epoch_386_0.31518240416279664.pt"
#pretrained_model_info = torch.load(pretrained_model_path,map_location='cpu')
#model_state_dict = pretrained_model_info["model_state_dict"]
#net.load_state_dict(model_state_dict)
#net.to(torch.device("cuda:0"))
#load_model_only_with_necessary_keys_from_state_dict(net,pretrained_model_path,is_full_checkpoint = True)

In [57]:
"""net.qconfig = torch.ao.quantization.get_default_qconfig('qnnpack')
net.fuse_model()
net_prepared = torch.ao.quantization.prepare(net)
with torch.no_grad():
    for i, (inputs, targets, bboxes) in enumerate(train_loader):
        # Pass a batch of images through the model
        inputs = inputs.permute(4,0,1,2,3)
        inputs = inputs.squeeze(0)
        inputs = inputs.to('cpu')
        predictions = net_prepared(inputs)
temp = torch.load("/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_model/qat_checkpoints/epoch_145_0.3345367681206545.pt")
net_prepared.load_state_dict(temp["model_state_dict"])
model_int8 = torch.ao.quantization.convert(net_prepared)
model_int8.eval()"""

'net.qconfig = torch.ao.quantization.get_default_qconfig(\'qnnpack\')\nnet.fuse_model()\nnet_prepared = torch.ao.quantization.prepare(net)\nwith torch.no_grad():\n    for i, (inputs, targets, bboxes) in enumerate(train_loader):\n        # Pass a batch of images through the model\n        inputs = inputs.permute(4,0,1,2,3)\n        inputs = inputs.squeeze(0)\n        inputs = inputs.to(\'cpu\')\n        predictions = net_prepared(inputs)\ntemp = torch.load("/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_model/qat_checkpoints/epoch_145_0.3345367681206545.pt")\nnet_prepared.load_state_dict(temp["model_state_dict"])\nmodel_int8 = torch.ao.quantization.convert(net_prepared)\nmodel_int8.eval()'

In [24]:
model_int8.backend_blocks[0].weight

<bound method Conv2d.weight of QuantizedConvReLU2d(2, 16, kernel_size=(3, 3), stride=(2, 2), scale=1.8660508394241333, zero_point=0, padding=(1, 1))>

In [25]:
print(model_int8.backend_blocks[0].weight()) 

tensor([[[[ 0.2445,  0.2795,  0.0838],
          [ 0.3773, -0.0279, -0.1258],
          [ 0.0908, -0.3144, -0.5100]],

         [[ 0.3983, -0.0070, -0.0489],
          [ 0.2445,  0.0140, -0.1677],
          [ 0.0210, -0.2166, -0.4611]]],


        [[[-0.1467,  0.3074,  0.1677],
          [ 0.0279,  0.0838, -0.0140],
          [-0.0769,  0.7057,  0.2515]],

         [[-0.2655,  0.2865,  0.1677],
          [-0.3843,  0.1467,  0.1397],
          [-0.4262,  0.3214,  0.3074]]],


        [[[-0.1607, -0.2166, -0.1467],
          [-0.1886, -0.2865, -0.2515],
          [-0.1397, -0.2585, -0.1537]],

         [[ 0.1886,  0.2096,  0.1956],
          [ 0.1956,  0.3214,  0.2376],
          [ 0.1258,  0.2236,  0.1607]]],


        [[[ 0.2236,  0.1048,  0.1258],
          [-0.2166,  0.3214,  0.4402],
          [-0.1607, -0.2096,  0.2934]],

         [[-0.0070, -0.3703,  0.1258],
          [-0.4821, -0.3214,  0.0629],
          [-0.2236, -0.4402, -0.0489]]],


        [[[ 0.5100,  0.5310,  0.2445],
 

In [26]:
stats = slayer.utils.LearningStats(accuracy_str='AP@0.5')
ap_stats = metrics.APstats(iou_threshold=0.5)
stats.update()
epoch = 0
t_st = datetime.now()
with torch.no_grad():
        
        for i, (inputs, targets, bboxes) in enumerate(test_loader):
                
                """"inputs = inputs.permute(4,0,1,2,3)
                inputs = inputs.to(device)
                predictions, counts = net(inputs)

                T = 1
                predictions = [utils.nms(predictions[..., t],conf_threshold = args.conf_thres)
                    for t in range(T)]
                for t in range(T):
                    ap_stats.update(predictions[t], bboxes[t])"""

                #start
                inputs = inputs.permute(4,0,1,2,3)
          
                inputs = inputs.squeeze(0)

                inputs = inputs.to('cpu')

                predictions = model_int8(inputs)

                print("shapeeeeeeeeeeeeeee ",predictions.shape)
                

                predictions = yolo_raw(predictions)
                predictions = [predictions]

                print("predictionss .... ",predictions[0].shape)
      
                predictions = [prediction.unsqueeze(-1) for prediction in predictions]
                
                #predictions = [torch.sum(prediction,dim=-1).unsqueeze(-1) for prediction in predictions]
                
                #loss, loss_distr = yolo_loss(predictions, targets)

                # MAP calculations
                T = 1 #inputs.shape[-1]
                try:
                    predictions = torch.concat([yolo(p, a) for (p, a)
                        in zip(predictions, net_dummy.anchors)],dim=1)
                except RuntimeError:
                    print('Runtime error on MAP predictions calculation.'
                            'continuing')
                    continue

                predictions = [utils.nms(predictions[..., t],conf_threshold = conf_thres)
                                for t in range(T)]
                for t in range(T):
                    ap_stats.update(predictions[t], bboxes[t])
                    #end

                stats.testing.num_samples += inputs.shape[0]
                stats.testing.correct_samples = ap_stats[:] * stats.testing.num_samples

                processed = i * test_loader.batch_size
                total = len(test_loader.dataset)
                time_elapsed = (datetime.now() - t_st).total_seconds()
                samples_sec = time_elapsed / (i + 1) / test_loader.batch_size
                header_list = [f'Test: [{processed}/{total} '
                                    f'({100.0 * processed / total:.0f}%)]']
  
                stats.print(epoch, i, samples_sec, header=header_list)
                #functional.reset_net(net)

        print('mAP@50/train',stats.training.accuracy,epoch)
        print('mAP@50/test', stats.testing.accuracy, epoch)

shapeeeeeeeeeeeeeee  torch.Size([1, 21, 8, 10])
predictionss ....  torch.Size([1, 3, 8, 10, 7])
[0A
Epoch    0: i =     0 ,      53.7860 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.00000 
shapeeeeeeeeeeeeeee  torch.Size([1, 21, 8, 10])
predictionss ....  torch.Size([1, 3, 8, 10, 7])
[5A
Epoch    0: i =     1 ,      29.2055 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.00750 
shapeeeeeeeeeeeeeee  torch.Size([1, 21, 8, 10])
predictionss ....  torch.Size([1, 3, 8, 10, 7])
[5A
Epoch    0: i =     2 ,      20.6877 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.01000 
shapeeeeeeeeeeeeeee  torch.Size([1, 21, 8, 10])
predictionss ....  torch.Size([1, 3, 8, 10, 7])
[5A
Epoch    0: i =     3 ,      16.5920 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.01125 
shapeeeeeeeeeeeeeee  torch.Size([1, 21, 8, 10])
predictionss ....  t

In [30]:
os.getcwd()

'/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240'

In [28]:
model_filepath = "./torch_qat_quantized_jit_240_320.pth"
torch.jit.save(torch.jit.script(model_int8), model_filepath)

In [29]:
loaded_jit_model = torch.load(model_filepath)
loaded_jit_model.to(torch.device('cpu'))

  loaded_jit_model = torch.load(model_filepath)


RecursiveScriptModule(
  original_name=QuantNetwork
  (backend_blocks): RecursiveScriptModule(
    original_name=ModuleList
    (0): RecursiveScriptModule(original_name=ConvReLU2d)
    (1): RecursiveScriptModule(original_name=Identity)
    (2): RecursiveScriptModule(original_name=Identity)
    (3): RecursiveScriptModule(original_name=ConvReLU2d)
    (4): RecursiveScriptModule(original_name=Identity)
    (5): RecursiveScriptModule(original_name=Identity)
    (6): RecursiveScriptModule(original_name=MaxPool2d)
    (7): RecursiveScriptModule(original_name=ConvReLU2d)
    (8): RecursiveScriptModule(original_name=Identity)
    (9): RecursiveScriptModule(original_name=Identity)
    (10): RecursiveScriptModule(original_name=ConvReLU2d)
    (11): RecursiveScriptModule(original_name=Identity)
    (12): RecursiveScriptModule(original_name=Identity)
    (13): RecursiveScriptModule(original_name=MaxPool2d)
    (14): RecursiveScriptModule(original_name=ConvReLU2d)
    (15): RecursiveScriptModule(or

In [22]:
stats.update()

In [23]:
epoch = 0
t_st = datetime.now()
stats = slayer.utils.LearningStats(accuracy_str='AP@0.5')
ap_stats = metrics.APstats(iou_threshold=0.5)
stats.update()
with torch.no_grad():
        
        for i, (inputs, targets, bboxes) in enumerate(test_loader):
                
                """"inputs = inputs.permute(4,0,1,2,3)
                inputs = inputs.to(device)
                predictions, counts = net(inputs)

                T = 1
                predictions = [utils.nms(predictions[..., t],conf_threshold = args.conf_thres)
                    for t in range(T)]
                for t in range(T):
                    ap_stats.update(predictions[t], bboxes[t])"""

                #start
                inputs = inputs.permute(4,0,1,2,3)
          
                inputs = inputs.squeeze(0)

                inputs = inputs.to('cpu')

                print("input shape ",inputs.shape)

                predictions = loaded_jit_model(inputs)

                print("shapeeeeeeeeeeeeeee ",predictions.shape)
                

                predictions = yolo_raw(predictions)
                predictions = [predictions]

                print("predictionss .... ",predictions[0].shape)
      
                predictions = [prediction.unsqueeze(-1) for prediction in predictions]
                
                #predictions = [torch.sum(prediction,dim=-1).unsqueeze(-1) for prediction in predictions]
                
                #loss, loss_distr = yolo_loss(predictions, targets)

                # MAP calculations
                T = 1 #inputs.shape[-1]
                try:
                    predictions = torch.concat([yolo(p, a) for (p, a)
                        in zip(predictions, net_dummy.anchors)],dim=1)
                except RuntimeError:
                    print('Runtime error on MAP predictions calculation.'
                            'continuing')
                    continue

                predictions = [utils.nms(predictions[..., t],conf_threshold = conf_thres)
                                for t in range(T)]
                for t in range(T):
                    ap_stats.update(predictions[t], bboxes[t])
                    #end

                stats.testing.num_samples += inputs.shape[0]
                stats.testing.correct_samples = ap_stats[:] * stats.testing.num_samples

                processed = i * test_loader.batch_size
                total = len(test_loader.dataset)
                time_elapsed = (datetime.now() - t_st).total_seconds()
                samples_sec = time_elapsed / (i + 1) / test_loader.batch_size
                header_list = [f'Test: [{processed}/{total} '
                                    f'({100.0 * processed / total:.0f}%)]']
  
                stats.print(epoch, i, samples_sec, header=header_list)
                #functional.reset_net(net)

        print('mAP@50/train',stats.training.accuracy,epoch)
        print('mAP@50/test', stats.testing.accuracy, epoch)

input shape  torch.Size([8, 2, 240, 320])
shapeeeeeeeeeeeeeee  torch.Size([8, 21, 8, 10])
predictionss ....  torch.Size([8, 3, 8, 10, 7])
[0A
Epoch    0: i =     0 ,      18.5471 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.13751 
input shape  torch.Size([8, 2, 240, 320])
shapeeeeeeeeeeeeeee  torch.Size([8, 21, 8, 10])
predictionss ....  torch.Size([8, 3, 8, 10, 7])
[5A
Epoch    0: i =     1 ,      11.2866 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.13751 
input shape  torch.Size([8, 2, 240, 320])
shapeeeeeeeeeeeeeee  torch.Size([8, 21, 8, 10])
predictionss ....  torch.Size([8, 3, 8, 10, 7])
[5A
Epoch    0: i =     2 ,       8.4491 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.11042 
input shape  torch.Size([8, 2, 240, 320])
shapeeeeeeeeeeeeeee  torch.Size([8, 21, 8, 10])
predictionss ....  torch.Size([8, 3, 8, 10, 7])
[5A
Epoch    0: i =     3 ,       7.

In [23]:
import onnx
from onnx_tf.backend import prepare
import tensorflow as tf

In [25]:
print(net_dummy)

Network(
  (backend_blocks): ModuleList(
    (0): Conv2d(2, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (8): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (11): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(32,

In [11]:
dummy_input = torch.randn(2,240,320)

In [27]:
input_namess = [ "input_1" ]
#here the net should be the 'net' after the prepare
torch.onnx.export(net,dummy_input.unsqueeze(0),'/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model_240_320.onnx',input_names=input_namess,export_params=True)


In [28]:
qqq = onnx.load('/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model_240_320.onnx')

In [29]:
tf_rep = prepare(qqq)

2024-10-05 21:33:32.289831: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-10-05 21:33:32.290630: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [30]:
print(tf_rep)

<onnx_tf.backend_rep.TensorflowRep object at 0x7f4e35dcc6a0>


In [31]:
tf_rep.export_graph("/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb")

INFO:absl:Function `__call__` contains input name(s) x with unsupported characters which will be renamed to transpose_28_x in the SavedModel.
INFO:absl:Found untraced functions such as gen_tensor_dict while saving (showing 1 of 1). These functions will not be directly callable after loading.


INFO:tensorflow:Assets written to: /home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb/assets


INFO:tensorflow:Assets written to: /home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb/assets
INFO:absl:Writing fingerprint to /home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb/fingerprint.pb


In [32]:
loaded_tf_model = tf.saved_model.load("/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb")

In [33]:
print(loaded_tf_model.signatures.keys())

KeysView(_SignatureMap({'serving_default': <ConcreteFunction (*, input_1: TensorSpec(shape=(1, 2, 240, 320), dtype=tf.float32, name='input_1')) -> Dict[['78', TensorSpec(shape=(1, 21, 8, 10), dtype=tf.float32, name='tensor_78')]] at 0x7F4E34508640>}))


In [34]:
infer = loaded_tf_model.signatures["serving_default"]

In [35]:
print(infer.structured_outputs)

{'78': TensorSpec(shape=(1, 21, 8, 10), dtype=tf.float32, name='tensor_78')}


In [24]:
train_set = evCIVIL.evCIVIL(root = dataset_path ,csv_file_name = train_csv_file , param_dict = param_dict, train = True,augment = True)
test_set = evCIVIL.evCIVIL(root = dataset_path, csv_file_name= test_csv_file, param_dict = param_dict, train = False, augment = False)

prepare_loader = DataLoader(train_set,
                                batch_size=1,
                                shuffle=True,
                                collate_fn=yolo_target.collate_fn,
                                num_workers=1,
                                pin_memory=True)
prepare_test_loader = DataLoader(test_set,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=yolo_target.collate_fn,    
                                num_workers=1,
                                pin_memory=True)
    

In [25]:
from tqdm import tqdm
import numpy as np
#base_path = "/home/udayanga/Udaya_Research_stuff/GAP8_previous_laptop/GAP_stuff/GAP_Tutorial/"
#CALIBRATION_IMGS = [os.path.join(base_path,"img_8.jpg"),os.path.join(base_path,"img_25.jpg"),os.path.join(base_path,"img_32.jpg")]

def representative_dataset():
    for i, (inputs, targets, bboxes) in enumerate(prepare_loader):
        # Pass a batch of images through the model
        inputs = inputs.permute(4,0,1,2,3)
        inputs = inputs.squeeze(0)
        inputs = inputs.to('cpu')
        yield [inputs.numpy().astype(np.float32)]
        #predictions = net_prepared(inputs)
    """for i
    for image in tqdm(CALIBRATION_IMGS):
        temp = np.array(Image.open(image))
        print("temp shape ",temp.shape)
        yield [np.array(temp[np.newaxis,np.newaxis,:,:],dtype=np.float32)]"""

In [26]:
path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb"
converter = tf.lite.TFLiteConverter.from_saved_model(path)
converter.optimizations = [tf.lite.Optimize.DEFAULT]  #[tf.lite.Optimize.OPTIMIZE_FOR_SIZE] #If we directly convert to tflite without quantizations we can ignore this line.
converter.representative_dataset = representative_dataset
tflite_model = converter.convert()

2024-10-06 15:10:03.378083: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:378] Ignored output_format.
2024-10-06 15:10:03.378107: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:381] Ignored drop_control_dependency.
2024-10-06 15:10:03.378261: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb
2024-10-06 15:10:03.378899: I tensorflow/cc/saved_model/reader.cc:51] Reading meta graph with tags { serve }
2024-10-06 15:10:03.378910: I tensorflow/cc/saved_model/reader.cc:146] Reading SavedModel debug info (if present) from: /home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model.pb
2024-10-06 15:10:03.380243: I tensorflow/cc/saved_model/loader.cc:233] Restoring SavedModel bundle.
2024-10-06 15:10:03.388940: I tensorflow/cc/saved_model/loader.cc:217] Running initialization op on Sa

In [27]:
with open('/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model_240_320.tflite','wb') as f:
    f.write(tflite_model)

In [28]:
def get_file_size(file_path):
    size = os.path.getsize(file_path)
    return size

In [29]:
def convert_bytes(size, unit=None):
    if unit == "KB":
        return print('File size: ' + str(round(size / 1024, 3)) + ' Kilobytes')
    elif unit == "MB":
        return print('File size: ' + str(round(size / (1024 * 1024), 3)) + ' Megabytes')
    else:
        return print('File size: ' + str(size) + ' bytes')

In [39]:
#tf_model_path = os.path.join(base_path,"model.pb/saved_model.pb")
tflite_model_path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model_240_320.tflite"
convert_bytes(get_file_size(tflite_model_path),"MB")

File size: 0.312 Megabytes


In [31]:
interpreter = tf.lite.Interpreter(model_path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_320_240/checkpoints/simple_PTQ_model_240_320.tflite")
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
interpreter.allocate_tensors()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [32]:
input_details

[{'name': 'serving_default_input_1:0',
  'index': 0,
  'shape': array([  1,   2, 240, 320], dtype=int32),
  'shape_signature': array([  1,   2, 240, 320], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

In [33]:
output_details

[{'name': 'PartitionedCall:0',
  'index': 58,
  'shape': array([ 1, 21,  8, 10], dtype=int32),
  'shape_signature': array([ 1, 21,  8, 10], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

In [34]:
print("Input Shape:", input_details[0]['shape'])
print("Input Type:", input_details[0]['dtype'])
print("Output Shape:", output_details[0]['shape'])
print("Output Type:", output_details[0]['dtype'])

Input Shape: [  1   2 240 320]
Input Type: <class 'numpy.float32'>
Output Shape: [ 1 21  8 10]
Output Type: <class 'numpy.float32'>


In [36]:
conf_thres = 0.35

In [40]:
t_st = datetime.now()
epoch = 0
stats = slayer.utils.LearningStats(accuracy_str='AP@0.5')
ap_stats = metrics.APstats(iou_threshold=0.5)
stats.update()

with torch.no_grad():
    for i,(inputs,targets,bboxes) in enumerate(prepare_test_loader):
        
        inputs = inputs.permute(4,0,1,2,3)      
        inputs = inputs.squeeze(0)
        inputs = inputs.to('cpu')
        print("input shape ",inputs.shape)
        interpreter.set_tensor(input_details[0]['index'],inputs.numpy())
        interpreter.invoke()

        tflite_model_predictions = interpreter.get_tensor(output_details[0]['index'])
        print(tflite_model_predictions.shape)

        predictions = yolo_raw(torch.from_numpy(tflite_model_predictions))
        predictions = [predictions]

        print("predictionss .... ",predictions[0].shape)
        
        predictions = [prediction.unsqueeze(-1) for prediction in predictions]

        T = 1 #inputs.shape[-1]
        try:
            predictions = torch.concat([yolo(p, a) for (p, a)
                            in zip(predictions, net_dummy.anchors)],dim=1)
        except RuntimeError:
            print('Runtime error on MAP predictions calculation.'
                                'continuing')
            continue

        predictions = [utils.nms(predictions[..., t],conf_threshold = conf_thres)
                                    for t in range(T)]
        
        print("predictions shape ",predictions[0][0].shape)
        print("bboxes shape ",bboxes[0][0].shape)
        print("inputs shape ",inputs.shape)
        

        for t in range(T):
            ap_stats.update(predictions[t], bboxes[t])
            #end

        stats.testing.num_samples += inputs.shape[0]
        stats.testing.correct_samples = ap_stats[:] * stats.testing.num_samples

        processed = i * test_loader.batch_size
        total = len(test_loader.dataset)
        time_elapsed = (datetime.now() - t_st).total_seconds()
        samples_sec = time_elapsed / (i + 1) / test_loader.batch_size
        header_list = [f'Test: [{processed}/{total} '
                                        f'({100.0 * processed / total:.0f}%)]']
    
        stats.print(epoch, i, samples_sec, header=header_list)
        #functional.reset_net(net)
        break

    print('mAP@50/test', stats.testing.accuracy, epoch)

input shape  torch.Size([1, 2, 240, 320])
(1, 21, 8, 10)
predictionss ....  torch.Size([1, 3, 8, 10, 7])
predictions shape  torch.Size([100, 6])
bboxes shape  torch.Size([1, 6])
inputs shape  torch.Size([1, 2, 240, 320])
[0A
Epoch    0: i =     0 ,       6.3805 ms elapsed        
Train  
Test  loss =     0.00000                          AP@0.5 = 0.01500 
mAP@50/test 0.015 0


In [None]:
print("get predictions shape ",predictions[0].shape)

#### QAT - Quantization aware training

In [113]:
from torch.utils.tensorboard import SummaryWriter

In [114]:
class QuantNetwork(torch.nn.Module):

    def __init__(self,
                    in_channels = 4,
                    num_output = num_output):
        
        super().__init__()
        
        self.num_output = num_output
        
        self.backend_blocks = torch.nn.ModuleList([
            
            nn.Conv2d(in_channels, 16, kernel_size=3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            
            nn.Conv2d(16, 32, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            #nn.Dropout(0.2),

            nn.Conv2d(32, 32, kernel_size=3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            
            nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2,stride = 2),
            #nn.Dropout(0.2),

            nn.Conv2d(64, 64, kernel_size=3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        
        ])

        self.head1_backend = torch.nn.ModuleList([

            nn.Conv2d(64,64, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            #nn.Dropout(0.2),

            nn.Conv2d(64,128, kernel_size=3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            #nn.Dropout(0.2),
            
        ])

        self.head1_blocks = torch.nn.ModuleList([

            nn.Conv2d(128,128, kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            #nn.Dropout(0.2),

            nn.Conv2d(128,self.num_output, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(self.num_output),
            nn.ReLU(),

        ])

        self.quant = QuantStub()
        self.dequant = DeQuantStub()

        """self.backend_blocks[1].synapse.weight.requires_grad = False #pool
        self.backend_blocks[6].synapse.weight.requires_grad = False #pool
        self.head2_backend[2].synapse.weight.requires_grad = False #unpool
        self.head2_backend[3].synapse.weight.requires_grad = False"""
    
    def forward(self,
        input
    ):

        count = []
        """for block in self.input_blocks:
            input = block(input)
            count.append(slayer.utils.event_rate(input))
            
        backend = input"""
        input = self.quant(input)
        backend = input
        for block in self.backend_blocks:
            backend = block(backend)
        
        h1_backend = backend
        for block in self.head1_backend:
            h1_backend = block(h1_backend)
        
        head1 = h1_backend
        for block in self.head1_blocks:
            head1 = block(head1)
            """if has_sparisty_loss and isinstance(block,
                                                slayer.block.sigma_delta.Conv):
                sparsity_monitor.append(head1)"""

            """if has_sparisty_loss and isinstance(block,
                                                slayer.block.sigma_delta.Conv):
                sparsity_monitor.append(head2)"""

        ## Get the summation or averaging over all the time steps, and preserve the last dimension of head1 tensor as time but Time = 1

        head1 = self.dequant(head1)

        return head1
    
    def fuse_model(self,quant_aware_train = False):
        # Specify which layers to fuse
        if quant_aware_train:
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU
            # Pooling layers do not need to be fused
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['7', '8', '9'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['10', '11', '12'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.backend_blocks, ['14', '15', '16'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules_qat(self.head1_backend, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.head1_backend, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules_qat(self.head1_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules_qat(self.head1_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU
        
        else:
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU
            # Pooling layers do not need to be fused
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['7', '8', '9'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['10', '11', '12'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.backend_blocks, ['14', '15', '16'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules(self.head1_backend, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.head1_backend, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU

            torch.ao.quantization.fuse_modules(self.head1_blocks, ['0', '1', '2'], inplace=True)  # Conv + BN + ReLU
            torch.ao.quantization.fuse_modules(self.head1_blocks, ['3', '4', '5'], inplace=True)  # Conv + BN + ReLU

In [21]:
train_set = evCIVIL.evCIVIL(root = dataset_path ,csv_file_name = train_csv_file , param_dict = param_dict, train = True,augment = True)
test_set = evCIVIL.evCIVIL(root = dataset_path, csv_file_name= test_csv_file, param_dict = param_dict, train = False, augment = False)

prepare_loader = DataLoader(train_set,
                                batch_size=8,
                                shuffle=True,
                                collate_fn=yolo_target.collate_fn,
                                num_workers=1,
                                pin_memory=True)
prepare_test_loader = DataLoader(test_set,
                                batch_size=8,
                                shuffle=False,
                                collate_fn=yolo_target.collate_fn,    
                                num_workers=1,
                                pin_memory=True)

In [116]:
yolo_loss = yolo_base.YOLOLoss(anchors=net_dummy.anchors,
                             lambda_coord=1.0,
                             lambda_noobj=2.0,
                             lambda_obj=2.0,
                             lambda_cls=4.0,
                             lambda_iou=2.0,
                             alpha_iou=0.8,
                             label_smoothing=0.1).to(torch.device("cuda:0"))

In [117]:
loss_tracker = dict(coord=[], obj=[], noobj=[], cls=[], iou=[])
loss_order = ['coord', 'obj', 'noobj', 'cls', 'iou']
track_iter = 1000
ckpt_folder = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_model/qat_checkpoints/"
epochs = 300

In [118]:
in_channels = 2
num_output = 21
qat_model = QuantNetwork(in_channels=in_channels,num_output=num_output)
#qat_model.load_state_dict(torch.load("nmnist_bnorm_model.pth"))
#qat_model.eval()
qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('qnnpack') #x86 or 'qnnpack'
#qat_model.fuse_model(is_qat=True) #is_qat=True only for quantization aware training.
qat_model.fuse_model(quant_aware_train=True)
#qat_model = torch.ao.quantization.fuse_modules_qat(qat_model.nnn, [['0','1','2'],['4','5','6'],['8','9']])
#optimizer = optim.SGD(qat_model.parameters(),lr=0.01,momentum=0.5)
qat_model = torch.ao.quantization.prepare_qat(qat_model)

In [112]:
writer = SummaryWriter("." + '/runs/' + "qat")
optimizer = torch.optim.AdamW(qat_model.parameters(), lr=0.001,weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            epochs,        # * len(train_loader),
    )
current_epoch = 0

In [19]:
print([key for key in qat_model.state_dict().keys()])

['backend_blocks.0.weight', 'backend_blocks.0.bn.weight', 'backend_blocks.0.bn.bias', 'backend_blocks.0.bn.running_mean', 'backend_blocks.0.bn.running_var', 'backend_blocks.0.bn.num_batches_tracked', 'backend_blocks.0.weight_fake_quant.fake_quant_enabled', 'backend_blocks.0.weight_fake_quant.observer_enabled', 'backend_blocks.0.weight_fake_quant.scale', 'backend_blocks.0.weight_fake_quant.zero_point', 'backend_blocks.0.weight_fake_quant.activation_post_process.eps', 'backend_blocks.0.weight_fake_quant.activation_post_process.min_val', 'backend_blocks.0.weight_fake_quant.activation_post_process.max_val', 'backend_blocks.0.activation_post_process.fake_quant_enabled', 'backend_blocks.0.activation_post_process.observer_enabled', 'backend_blocks.0.activation_post_process.scale', 'backend_blocks.0.activation_post_process.zero_point', 'backend_blocks.0.activation_post_process.activation_post_process.eps', 'backend_blocks.0.activation_post_process.activation_post_process.min_val', 'backend_blo

In [22]:
#Load only if it really requires.

pretrained_model_path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_model/qat_checkpoints/epoch_145_0.3345367681206545.pt"
pretrained_model_info = torch.load(pretrained_model_path,map_location="cuda:0")
model_state_dict = pretrained_model_info["model_state_dict"]
qat_model.load_state_dict(model_state_dict)
"""optimizer_dict = pretrained_model_info["optimizer"]
optimizer.load_state_dict(optimizer_dict)
scheduler_dict = pretrained_model_info["scheduler"]
scheduler.load_state_dict(scheduler_dict)
current_epoch = pretrained_model_info["epoch"]
print("loading state dictionary ")"""

  pretrained_model_info = torch.load(pretrained_model_path,map_location="cuda:0")


NameError: name 'qat_model' is not defined

In [108]:
current_epoch

98

In [34]:
conf_thres = 0.02
stats = slayer.utils.LearningStats(accuracy_str='AP@0.5')

In [23]:
anchors = torch.tensor([[0.2800, 0.2200],
        [0.3800, 0.4800],
        [0.9000, 0.7800]])
    
anchors = anchors.unsqueeze(0)

In [36]:
t_st = datetime.now()

for epoch in range(current_epoch,epochs):


    qat_model.to(torch.device('cuda:0'))
    qat_model.train()
    ap_stats = metrics.APstats(iou_threshold=0.5)

    for i, (inputs, targets, bboxes) in enumerate(train_loader):            

        #start
        inputs = inputs.permute(4,0,1,2,3)
            
        inputs = inputs.squeeze(0)

        inputs = inputs.to(torch.device('cuda:0'))

        #print("input shape ",inputs.shape)

        predictions = qat_model(inputs)

        predictions = yolo_raw(predictions)
        predictions = [predictions]
        #print("predictionss .... ",predictions[0].shape)
        predictions = [prediction.unsqueeze(-1) for prediction in predictions]

        #print("predictions .... ",predictions)

        loss, loss_distr = yolo_loss(predictions, targets)

        if torch.isnan(loss):
            print("loss is nan, continuing")
            continue
        
        clip_val = 1.0
        optimizer.zero_grad()
        loss.backward()
        validate_gradients(qat_model) #net.validate_gradients()
        torch.nn.utils.clip_grad_norm_(qat_model.parameters(), clip_val)
        optimizer.step()

        # MAP calculations
        T = 1 #inputs.shape[-1]
        try:
            predictions = torch.concat([yolo(p, a) for (p, a)
                                                    in zip(predictions, net_dummy.anchors.to(torch.device('cuda:0')))],dim=1)
        except RuntimeError:
            print('Runtime error on MAP predictions calculation.'
                            'continuing')
            continue

        predictions = [utils.nms(predictions[..., t],conf_threshold = conf_thres)
                                for t in range(T)]
        
                    
        for t in range(T):
            ap_stats.update(predictions[t], bboxes[t])

        if not torch.isnan(loss):
            stats.training.loss_sum += loss.item() * inputs.shape[0]

        stats.training.num_samples += inputs.shape[0]
        stats.training.correct_samples = ap_stats[:] * \
                        stats.training.num_samples

        processed = i * train_loader.batch_size
        total = len(train_loader.dataset)
        time_elapsed = (datetime.now() - t_st).total_seconds()
        samples_sec = time_elapsed / (i + 1) / train_loader.batch_size
        header_list = [f'Train: [{processed}/{total} '
                                f'({100.0 * processed / total:.0f}%)]']

        header_list += [f'Coord loss: {loss_distr[0].item()}']
        header_list += [f'Obj   loss: {loss_distr[1].item()}']
        header_list += [f'NoObj loss: {loss_distr[2].item()}']
        header_list += [f'Class loss: {loss_distr[3].item()}']
        header_list += [f'IOU   loss: {loss_distr[4].item()}']

        if i % track_iter == 0:
            #plt.figure()
            for loss_idx, loss_key in enumerate(loss_order):
                loss_tracker[loss_key].append(loss_distr[loss_idx].item())
                            #plt.semilogy(loss_tracker[loss_key], label=loss_key)

        writer.add_scalar(f'Loss Tracker/{loss_key}',
                                                loss_distr[loss_idx].item(),
                                                len(loss_tracker[loss_key]) - 1)
                            
        stats.print(epoch, i, samples_sec, header=header_list)

    current_val_acc = 0
    qat_model.to(torch.device('cpu'))
    quantized_model = torch.quantization.convert(qat_model)
    quantized_model.eval()

    with torch.no_grad():
            ap_stats = metrics.APstats(iou_threshold=0.5)
            
            for i, (inputs, targets, bboxes) in enumerate(test_loader):

                    #start
                    inputs = inputs.permute(4,0,1,2,3)
            
                    inputs = inputs.squeeze(0)

                    inputs = inputs.to('cpu')

                    #print("input shape ",inputs.shape)

                    predictions = quantized_model(inputs)

                    #print("shapeeeeeeeeeeeeeee ",predictions.shape)
                    
                
                    predictions = yolo_raw(predictions)
                    predictions = [predictions]

                    #print("predictionss .... ",predictions[0].shape)
        
                    predictions = [prediction.unsqueeze(-1) for prediction in predictions]
                    
                    #predictions = [torch.sum(prediction,dim=-1).unsqueeze(-1) for prediction in predictions]
                    
                    #loss, loss_distr = yolo_loss(predictions, targets)

                    # MAP calculations
                    T = 1 #inputs.shape[-1]
                    try:
                        predictions = torch.concat([yolo(p, a) for (p, a)
                            in zip(predictions, net_dummy.anchors)],dim=1)
                    except RuntimeError:
                        print('Runtime error on MAP predictions calculation.'
                                'continuing')
                        continue

                    predictions = [utils.nms(predictions[..., t],conf_threshold = conf_thres)
                                    for t in range(T)]
                    for t in range(T):
                        ap_stats.update(predictions[t], bboxes[t])
                        #end

                    stats.testing.num_samples += inputs.shape[0]
                    stats.testing.correct_samples = ap_stats[:] * stats.testing.num_samples

                    processed = i * test_loader.batch_size
                    total = len(test_loader.dataset)
                    time_elapsed = (datetime.now() - t_st).total_seconds()
                    samples_sec = time_elapsed / (i + 1) / test_loader.batch_size
                    header_list = [f'Test: [{processed}/{total} '
                                        f'({100.0 * processed / total:.0f}%)]']
    
                    stats.print(epoch, i, samples_sec, header=header_list)
                    #functional.reset_net(net)"""
                    

    writer.add_scalar('Loss/train', stats.training.loss, epoch)
    writer.add_scalar('mAP@50/train', stats.training.accuracy, epoch)
    writer.add_scalar('mAP@50/test', stats.testing.accuracy, epoch)

    if stats.testing.accuracy > current_val_acc:
        checkpoint = {"epoch": epoch,
                                "model_state_dict": qat_model.state_dict(),   #module.state_dict(),
                                "optimizer": optimizer.state_dict(),
                                "scheduler": scheduler.state_dict()}
        full_ckpt_path = ckpt_folder + "/epoch_" + str(epoch) + "_" + str(stats.testing.accuracy) + ".pt"
        print("saving net_prepared model ... ")
        torch.save(checkpoint,full_ckpt_path)
        current_val_acc = stats.testing.accuracy
        jit_quant_model_path = ckpt_folder + "/jit_epoch_" + str(epoch) + "_" + str(stats.testing.accuracy) + ".pt"
        print("saving jit quant model ..... ")
        torch.jit.save(torch.jit.script(quantized_model), jit_quant_model_path)
    else:
        if epoch % 2 == 0:
            checkpoint = {"epoch": epoch,
                                    "model_state_dict": qat_model.state_dict(),   #module.state_dict(),
                                    "optimizer": optimizer.state_dict(),
                                    "scheduler": scheduler.state_dict()}
        full_ckpt_path = ckpt_folder + "/epoch_" + str(epoch) + "_" + str(stats.testing.accuracy) + ".pt"
        print("saving ")
        torch.save(checkpoint,full_ckpt_path)
            
    stats.update()
    stats.save(ckpt_folder)
    scheduler.step()


[9A
[2KCoord loss: 0.132863849401474
[2KObj   loss: 1.3561460971832275
[2KNoObj loss: 1.3964393138885498
[2KClass loss: 1.627793312072754
[2KIOU   loss: 0.7723892331123352
Epoch    0: i =     0 ,      26.5916 ms elapsed        
Train loss =     4.91580 (min =     4.81175)     AP@0.5 = 0.26003 (max = 0.34722) 
[9A
[2KCoord loss: 0.2360049933195114
[2KObj   loss: 1.3248523473739624
[2KNoObj loss: 1.4047110080718994
[2KClass loss: 1.1733357906341553
[2KIOU   loss: 0.7880774736404419
Epoch    0: i =     1 ,      16.2332 ms elapsed        
Train loss =     4.91587 (min =     4.81175)     AP@0.5 = 0.19879 (max = 0.34722) 
[9A
[2KCoord loss: 0.17342327535152435
[2KObj   loss: 1.3226360082626343
[2KNoObj loss: 1.4039744138717651
[2KClass loss: 1.1190481185913086
[2KIOU   loss: 0.8769333362579346
Epoch    0: i =     2 ,      12.5592 ms elapsed        
Train loss =     4.91575 (min =     4.81175)     AP@0.5 = 0.19881 (max = 0.34722) 
[9A
[2KCoord loss: 0.06240746006369591
[2

KeyboardInterrupt: 

In [1]:
import torch

In [5]:
path = "/home/udayanga/Udaya_Research_stuff/2024_GAP8_work/yolov3_ann_head1_model/qat_checkpoints/jit_epoch_145_0.3345367681206545.pt"
temp = torch.jit.load(path)

In [7]:
temp

RecursiveScriptModule(
  original_name=QuantNetwork
  (backend_blocks): RecursiveScriptModule(
    original_name=ModuleList
    (0): RecursiveScriptModule(original_name=ConvReLU2d)
    (1): RecursiveScriptModule(original_name=Identity)
    (2): RecursiveScriptModule(original_name=Identity)
    (3): RecursiveScriptModule(original_name=ConvReLU2d)
    (4): RecursiveScriptModule(original_name=Identity)
    (5): RecursiveScriptModule(original_name=Identity)
    (6): RecursiveScriptModule(original_name=MaxPool2d)
    (7): RecursiveScriptModule(original_name=ConvReLU2d)
    (8): RecursiveScriptModule(original_name=Identity)
    (9): RecursiveScriptModule(original_name=Identity)
    (10): RecursiveScriptModule(original_name=ConvReLU2d)
    (11): RecursiveScriptModule(original_name=Identity)
    (12): RecursiveScriptModule(original_name=Identity)
    (13): RecursiveScriptModule(original_name=MaxPool2d)
    (14): RecursiveScriptModule(original_name=ConvReLU2d)
    (15): RecursiveScriptModule(or