In [1]:
import tensorrt as trt
import numpy as np
from cuda import cuda, cudart
from tensorrt import BuilderFlag
from typing import Optional, List
import ctypes

class HostDeviceMem:
    """Pair of host and device memory, where the host memory is wrapped in a numpy array"""
    def __init__(self, size: int, dtype: np.dtype):
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))

        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[:arr.size], arr.flat, casting='safe')

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
        
        
def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    if isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))

def cuda_call(call):
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# If engine uses dynamic shapes, specify a profile to find the maximum input & output size.
def allocate_buffers(engine: trt.ICudaEngine, profile_idx: Optional[int] = None):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    for binding in tensor_names:
        # get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape)
        # Pick out the max shape to allocate enough memory for the binding.
        shape = engine.get_tensor_shape(binding) if profile_idx is None else engine.get_tensor_profile_shape(binding, profile_idx)[-1]
        shape_valid = np.all([s >= 0 for s in shape])
        if not shape_valid and profile_idx is None:
            raise ValueError(f"Binding {binding} has dynamic shape, " +\
                "but no profile was specified.")
        size = trt.volume(shape)
        if engine.has_implicit_batch_dimension:
            size *= engine.max_batch_size
        dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))

        # Allocate host and device buffers
        bindingMemory = HostDeviceMem(size, dtype)

        # Append the device buffer to device bindings.
        bindings.append(int(bindingMemory.device))

        # Append to the appropriate list.
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(bindingMemory)
        else:
            outputs.append(bindingMemory)
    return inputs, outputs, bindings, stream

def free_buffers(inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
    for mem in inputs + outputs:
        mem.free()
    cuda_call(cudart.cudaStreamDestroy(stream))

In [81]:
logger = trt.Logger(trt.Logger.WARNING)

with open('model.engine', 'rb') as f:
    serialized_engine = f.read()

runtime = trt.Runtime(logger)

engine = runtime.deserialize_cuda_engine(serialized_engine)


In [82]:
context = engine.create_execution_context()


In [83]:
inputs, outputs, bindings, stream = allocate_buffers(engine)

In [5]:
import cv2
img_path = 'moto.jpeg'

image_raw = cv2.imread(img_path)
image = cv2.resize(np.array(image_raw), (640,640), interpolation = cv2.INTER_LINEAR)
image = np.transpose(image, (2,0,1))

image = np.expand_dims(image, axis=0)

shape_orig = image_raw.size

inputs[0].host = image

In [7]:
def _do_inference_base(inputs, outputs, stream, execute_async):
    # Transfer input data to the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
    # Run inference.
    execute_async()
    # Transfer predictions back from the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]
    # Synchronize the stream
    cuda_call(cudart.cudaStreamSynchronize(stream))
    # Return only the host outputs.
    return [out.host for out in outputs]

def do_inference_v2(context, bindings, inputs, outputs, stream):
    def execute_async():
        context.execute_async_v2(bindings=bindings, stream_handle=stream)
    return _do_inference_base(inputs, outputs, stream, execute_async)

In [8]:
def nms(bounding_boxes, confidence_score, threshold):
    # If no bounding boxes, return empty list
    if len(bounding_boxes) == 0:
        return [], []

    # Bounding boxes
    boxes = np.array(bounding_boxes)

    # coordinates of bounding boxes
    start_x = boxes[:, 0]
    start_y = boxes[:, 1]
    end_x = boxes[:, 2]
    end_y = boxes[:, 3]

    # Confidence scores of bounding boxes
    score = np.array(confidence_score)

    # Picked bounding boxes
    picked_boxes = []
    picked_score = []

    # Compute areas of bounding boxes
    areas = (end_x - start_x + 1) * (end_y - start_y + 1)

    # Sort by confidence score of bounding boxes
    order = np.argsort(score)

    # Iterate bounding boxes
    while order.size > 0:
        # The index of largest confidence score
        index = order[-1]

        # Pick the bounding box with largest confidence score
        picked_boxes.append(bounding_boxes[index])
        picked_score.append(confidence_score[index])

        # Compute ordinates of intersection-over-union(IOU)
        x1 = np.maximum(start_x[index], start_x[order[:-1]])
        x2 = np.minimum(end_x[index], end_x[order[:-1]])
        y1 = np.maximum(start_y[index], start_y[order[:-1]])
        y2 = np.minimum(end_y[index], end_y[order[:-1]])

        # Compute areas of intersection-over-union
        w = np.maximum(0.0, x2 - x1 + 1)
        h = np.maximum(0.0, y2 - y1 + 1)
        intersection = w * h

        # Compute the ratio between intersection and union
        ratio = intersection / (areas[index] + areas[order[:-1]] - intersection)

        left = np.where(ratio < threshold)
        order = order[left]

    return picked_boxes, picked_score

In [32]:
print(engine.get_binding_shape(0))
print(engine.get_binding_shape(1))
print(engine.get_binding_shape(2))


(1, 3, 640, 640)
(1, 1000, 4)
(1, 1000, 5)


  print(engine.get_binding_shape(0))
  print(engine.get_binding_shape(1))
  print(engine.get_binding_shape(2))


In [84]:
import time
output_shapes = [(1,1000,4),(1,1000,5)]
times=[]
for i in range(1,50):
    t0=time.time()
    trt_outputs = do_inference_v2(context , bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    t1= time.time()
    times.append(1/(t1-t0))
    
print(times)
trt_outputs = [output.reshape(shape) for output, shape in zip(trt_outputs, output_shapes)]

[18.734022073635362, 22.715133659720117, 19.078112704629085, 21.016495299941877, 20.783532944516846, 20.980121849958483, 20.724996170551293, 19.929221704837023, 19.16931669127023, 21.5251467750544, 22.411935066739336, 20.071609392870645, 20.21926234447385, 19.817450755267025, 19.001621861607184, 20.846855802302233, 20.31228479691609, 20.665464471181796, 19.453378353307855, 19.285575030806864, 19.815484626868493, 20.18306843171505, 20.14806844274501, 20.3860331285481, 20.25607541629641, 19.712020453146224, 20.2967543999729, 19.658526982817612, 20.011660694775113, 20.37444683548608, 19.854975455272736, 20.461416877250155, 19.84727131628882, 20.193854657153036, 19.73994484134828, 19.77932140246634, 19.71072356703463, 20.34203570510551, 19.851028685828414, 17.18927735679714, 19.79995656976689, 19.766644202628765, 19.439583613349956, 20.010992366412214, 20.24385464479292, 19.71581945867687, 19.797993910929645, 19.941539485570296, 19.693787093381413]


In [85]:
vec = []
boxes = []
scores = []
import torch
sf = torch.nn.Softmax(dim=1)
for i in range(0,1000):
    xc,yc,w,h = trt_outputs[0][0,i,:]
    
    out = list(trt_outputs[1][0,i,:] )
    
    boxes.append( (xc,yc,w,h) )
    scores.append(max(out))
    


In [86]:
boxes_nms, scores_nms = nms(boxes, scores,0.05)
len(boxes_nms)

23

Provo con yolo nas non quantizzato, converto e vedo le differenze nelle dimensioni.... vorrei capire dove sono le classi... ora ho soltanto 4 sembra!!!! dovrebbero essere 5

-- yolo_nas_s(standard) : (1,1000,4)(1,1000,80)
-- (retrain quantized)  : (1,8400,4)(1,8400,5) (invertiti non so perchè)

In [87]:
from PIL import ImageDraw
from PIL import Image
image = Image.open(img_path)
image = image.resize((640,640))
draw = ImageDraw.Draw(image)
for box in boxes_nms:
        x1,y1,x2,y2 = box[0],box[1],box[2],box[3]
        #print(f'{x1}_{y1}_{x2}_{y2}')
        bbox = [(x1,y1),(x2,y2)]
        draw.rectangle(bbox, outline="black")
        
image.show()

In [None]:
free_buffers(inputs, outputs, stream)