In [18]:
import math
import time
import os
import json

import torch as T
import torch.nn as nn
import torchvision.models as models
import matplotlib.pyplot as plt

from deformable_attention import DeformableAttention

from model.endtoendmodels import SpatialEncoder
from model.benchmarking import model_speedtest
from core.torchhelpers import positional_embedding


In [2]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, input_dim: int, output_dim: int):
        super(type(self), self).__init__()

        self.LN1 = nn.LayerNorm(normalized_shape=input_dim, eps=0.00001, elementwise_affine=True)
        self.LN2 = nn.LayerNorm(normalized_shape=input_dim, eps=0.00001, elementwise_affine=True)
        
        self.MSA = nn.MultiheadAttention(
            embed_dim=input_dim,
            num_heads=1,
            dropout=0,
            bias=True,
            add_bias_kv=False,
            add_zero_attn=False,
            batch_first=True)
        
        self.MLP = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=(input_dim + output_dim) // 2, bias=True),
            nn.ELU(),
            nn.Linear(in_features=(input_dim + output_dim) // 2, out_features=output_dim, bias=True),
            nn.ELU(),
        )

        self.residual_connection = nn.Sequential(nn.Linear(input_dim, output_dim), nn.ELU())

    def forward(self, x: T.Tensor):

        residual = self.residual_connection(x)

        x = self.LN1(x)
        x, attention_matrix = self.MSA(x, x, x)
        
        x = self.MLP(self.LN2(x))

        x = T.add(residual, x)

        return x

# TransformerEncoderBlock(256, 128)(T.rand(1, 3, 256)).shape

In [142]:
class ConvBlock(nn.Module):
    def __init__(self,
                 in_channels: int, 
                 out_channels: int,
                 kernel_size: int,
                 stride: int = 1,
                 bias: bool = False,
                 dilation: int = 1,
                 padding: int = 0,
                 groups: int = 1
                ):
        super(type(self), self).__init__()
        
        self.conv = nn.Conv2d(
            in_channels=in_channels, out_channels=out_channels,
            kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, 
            groups=groups, bias=bias, padding_mode = "zeros")
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU()

    def forward(self, x: T.Tensor):
        return self.act(self.bn(self.conv(x)))

class BottleneckBlock(nn.Module):
    def __init__(self, 
                 in_channels: int, 
                 out_channels: int,
                 kernel_size: int,
                 shortcut: bool = True
                ):
        super(type(self), self).__init__()
        
        self.conv1 = ConvBlock(in_channels, (in_channels + out_channels) // 2, kernel_size, padding=1)
        self.conv2 = ConvBlock((in_channels + out_channels) // 2, out_channels, kernel_size, padding=1)

        self.add = shortcut and in_channels == out_channels
        
    def forward(self, x):
        return x + self.conv2(self.conv1(x)) if self.add else self.conv2(self.conv1(x))
        
class C2f(nn.Module):
    # CSP Bottleneck with 2 convolutions
    def __init__(self, 
                 in_channels: int, 
                 out_channels: int,
                 kernel_size: int,
                 n: int = 1,
                 shortcut: bool = True
                ):
        super(type(self), self).__init__()

        self.hidden_channels = (in_channels + out_channels) // 2

        self.conv1 = ConvBlock(in_channels, self.hidden_channels * 2, kernel_size=kernel_size, stride=1)
        
        self.module_list = nn.ModuleList(BottleneckBlock(
            in_channels=self.hidden_channels, 
            out_channels=self.hidden_channels, 
            kernel_size=3, 
            shortcut=shortcut) for _ in range(n))

        self.conv2 = ConvBlock((2 + n) * self.hidden_channels, out_channels, kernel_size=kernel_size, stride=1)

    def forward(self, x):
        y = list(self.conv1(x).split((self.hidden_channels, self.hidden_channels), 1))
        y.extend(module(y[-1]) for module in self.module_list)
        return self.conv2(T.cat(y, 1))


class SPPF(nn.Module):
    # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
    def __init__(self, 
                 in_channels: int, 
                 out_channels: int,
                 maxpool_kernel_size=5 # equivalent to SPP(k=(5, 9, 13))
                ):
        super().__init__()
        
        self.conv1 = ConvBlock(in_channels, in_channels, kernel_size=1, stride=1)
        self.conv2 = ConvBlock(in_channels * 4, out_channels, kernel_size=1, stride=1)
        self.max_pool = nn.MaxPool2d(kernel_size=maxpool_kernel_size, stride=1, padding=maxpool_kernel_size//2)

    def forward(self, x):
        x = self.conv1(x)
        x1 = self.max_pool(x)
        x2 = self.max_pool(x1)
        x3 = self.max_pool(x2)
        return self.conv2(T.cat([x, x1, x2, x3], 1))


class Backbone(nn.Module):
    def __init__(self, 
                 depth_multiple = 0.33,
                 width_multiple = 0.25
                ):
        super(type(self), self).__init__()

        scaled_64 = int(64 * width_multiple)
        scaled_128 = int(128 * width_multiple)
        scaled_256 = int(256 * width_multiple)
        scaled_512 = int(512 * width_multiple)
        scaled_1024 = int(1024 * width_multiple)
        
        self.conv1 = nn.Sequential(
            ConvBlock(3, scaled_64, kernel_size=3, stride=2, padding=1),
            ConvBlock(scaled_64, scaled_128, kernel_size=3, stride=2, padding=1),
            C2f(scaled_128, scaled_128, kernel_size=1, n=int(3*depth_multiple), shortcut=True),
            ConvBlock(scaled_128, scaled_256, kernel_size=3, stride=2, padding=1),
            C2f(scaled_256, scaled_256, kernel_size=1, n=int(6*depth_multiple), shortcut=True)
        )
        
        self.conv2 = nn.Sequential(
            ConvBlock(scaled_256, scaled_512, kernel_size=3, stride=2, padding=1),
            C2f(scaled_512, scaled_512, kernel_size=1, n=int(6*depth_multiple), shortcut=True),
        )
        
        self.conv3 = nn.Sequential(
            ConvBlock(scaled_512, scaled_1024, kernel_size=3, stride=2, padding=1),
            C2f(scaled_1024, scaled_1024, kernel_size=1, n=int(3*depth_multiple), shortcut=True),
            SPPF(scaled_1024, scaled_1024)
        )

    def forward(self, x: T.Tensor) -> tuple[T.Tensor, T.Tensor, T.Tensor]:

        x1 = self.conv1(x)
        x2 = self.conv2(x1)
        x3 = self.conv3(x2)

        return x1, x2, x3



In [156]:

class UpsampleTransformerNeck(nn.Module):
    def __init__(self, 
                 output_size,
                 depth_multiple = 0.33,
                 width_multiple = 0.25,
                 original_positional_embedding_size: int = 256
                ):
        super(type(self), self).__init__()

        scaled_256 = int(256 * width_multiple)
        scaled_512 = int(512 * width_multiple)
        scaled_1024 = int(1024 * width_multiple)
        scaled_positional_embedding_size = int(original_positional_embedding_size * width_multiple)
        
        self.upsample_x2 = nn.UpsamplingBilinear2d(scale_factor=2)
        
        self.reverse1 = nn.Sequential(
            C2f(scaled_1024 + scaled_512, scaled_512, kernel_size=1, n=int(3*depth_multiple), shortcut=False)
        )
        self.reverse2 = nn.Sequential(
            C2f(scaled_512 + scaled_256, scaled_512, kernel_size=1, n=int(3*depth_multiple), shortcut=False),
            ConvBlock(scaled_512, scaled_512, kernel_size=3, stride=2, padding=1)
        )
        self.reverse3 = nn.Sequential(
            C2f(scaled_512 * 2, scaled_512, kernel_size=1, n=int(3*depth_multiple), shortcut=False),
            ConvBlock(scaled_512, scaled_1024, kernel_size=3, stride=2, padding=1)
        )
        
        self.Q_encoder = ConvBlock(scaled_1024, scaled_1024, kernel_size=1, stride=1, padding=0)
        self.K_encoder = ConvBlock(scaled_1024, scaled_1024, kernel_size=1, stride=1, padding=0)
        self.V_encoder = nn.Sequential(
            # ConvBlock(scaled_1024, scaled_1024, kernel_size=1, stride=1, padding=0),
            # C2f(scaled_1024, scaled_1024, kernel_size=1, n=int(3*depth_multiple), shortcut=True),
            # ConvBlock(scaled_1024, scaled_1024, kernel_size=1, stride=1, padding=0),
            # C2f(scaled_1024, scaled_1024, kernel_size=1, n=int(3*depth_multiple), shortcut=True),
            ConvBlock(scaled_1024, scaled_1024 + scaled_positional_embedding_size, kernel_size=1, stride=1, padding=0)
        )

        self.positional_embedding = nn.Parameter(positional_embedding(20*20, scaled_positional_embedding_size), requires_grad=False)

        self.output_1d_conv = nn.Conv1d(scaled_1024 + scaled_positional_embedding_size, output_size, kernel_size=1, stride=1, padding=0, groups=1)

    
    def forward(self, x1: T.Tensor, x2: T.Tensor, x3: T.Tensor) -> T.Tensor: 
        # x1, x2, x3 in order: middle of the backbone to final layer output

        y1 = T.cat([self.upsample_x2(x3), x2], 1)
        y1 = self.reverse1(y1)

        y2 = T.cat([self.upsample_x2(y1), x1], 1)
        y2 = self.reverse2(y2)

        y3 = T.cat([y1, y2], 1)
        y3 = self.reverse3(y3)

        Q = self.Q_encoder(y3).flatten(2).permute(0, 2, 1)
        Q = T.cat([Q, self.positional_embedding[:Q.shape[1]].expand(Q.shape[0], -1, -1)], 2)

        K = self.K_encoder(y3).flatten(2).permute(0, 2, 1)
        K = T.cat([K, self.positional_embedding[:K.shape[1]].expand(K.shape[0], -1, -1)], 2)

        V = self.V_encoder(x3).flatten(2).permute(0, 2, 1)

        out = nn.functional.scaled_dot_product_attention(Q, K, V)
        out = self.output_1d_conv(out.permute(0, 2, 1))
        
        return out
        

class SimpleTransformerHead(nn.Module):
    def __init__(self, 
                 input_size,
                 output_size,
                 output_length,
                 width_multiple = 0.25,
                ):
        super(type(self), self).__init__()

        scaled_256 = int(256 * width_multiple)
        scaled_512 = int(512 * width_multiple)
        scaled_1024 = int(1024 * width_multiple)

        self.output_length = output_length

        self.input_1d_conv = nn.Sequential(
            nn.Conv1d(input_size, scaled_1024, kernel_size=1, stride=1, padding=0, groups=1),
            nn.SiLU(),
            nn.BatchNorm1d(scaled_1024)
        )
        
        self.positional_embedding = nn.Parameter(positional_embedding(20*20, scaled_1024), requires_grad=False)

        self.self_attention = TransformerEncoderBlock(scaled_1024, output_size)
        
        # self.forward_encoder = nn.Sequential(
        #     nn.Linear(scaled_1024, scaled_1024),
        #     nn.SiLU(),
        #     nn.BatchNorm1d(scaled_1024),
        #     nn.Linear(scaled_1024, scaled_1024),
        #     nn.SiLU(),
        #     nn.BatchNorm1d(scaled_1024)
        # )
        self.internal_state = nn.Parameter(T.rand(output_length, scaled_1024, dtype=T.float32), requires_grad=True)

    
    def forward(self, x: T.Tensor, corrections: int = 3) -> T.Tensor:

        x = self.input_1d_conv(x).permute(0, 2, 1)

        x = T.cat([x, self.internal_state.expand(x.shape[0], -1, -1)], 1)

        x = self.self_attention(x)

        return x[:, -self.output_length:, :]


class CodebookTransformerHead(nn.Module):
    def __init__(self, 
                 input_size,
                 output_size,
                 width_multiple = 0.25,
                ):
        super(type(self), self).__init__()
        raise NotImplementedError()

        
class CascadedTransformerHead(nn.Module):
    def __init__(self, 
                 input_size,
                 output_size,
                 width_multiple = 0.25,
                ):
        super(type(self), self).__init__()

        scaled_256 = int(256 * width_multiple)
        scaled_512 = int(512 * width_multiple)
        scaled_1024 = int(1024 * width_multiple)

        self.input_1d_conv = nn.Conv1d(input_size, scaled_1024, kernel_size=1, stride=1, padding=0, groups=1)
        self.input_1d_conv_memory = nn.Conv1d(input_size, scaled_1024, kernel_size=1, stride=1, padding=0, groups=1)
        
        self.positional_embedding = nn.Parameter(positional_embedding(20*20, scaled_1024), requires_grad=False)

        self.forward_encoder = nn.Sequential(
            nn.Linear(scaled_1024, scaled_1024),
            nn.SiLU(),
            nn.BatchNorm1d(scaled_1024),
            nn.Linear(scaled_1024, scaled_1024),
            nn.SiLU(),
            nn.BatchNorm1d(scaled_1024)
        )

        self.R = nn.Parameter(T.rand(output_size, dtype=T.float32), requires_grad=True)
    
    def forward(self, x: T.Tensor, corrections: int = 3) -> T.Tensor:

        raise NotImplementedError()

        M = self.input_1d_conv_memory(x).permute(0, 2, 1)
        R = self.R
        X = self.input_1d_conv(x).permute(0, 2, 1)

        for _ in range(corrections):

            q = X + self.positional_embedding.expand(X.shape[0], -1, -1)
            k = X + self.positional_embedding.expand(X.shape[0], -1, -1)
            v = X

            Q = X + nn.functional.scaled_dot_product_attention(q, k, v)

            print(Q.shape, M.shape, R.shape)
            
            y = nn.functional.scaled_dot_product_attention(Q, M, R)
            y = y + Q
            y = self.forward_encoder(y)

            print(y.shape)

        
        return out
        
class DeformableAttentionHead(nn.Module):
    def __init__(self, 
                 input_size,
                 output_size,
                 width_multiple = 0.25,
                ):
        super(type(self), self).__init__()

        scaled_256 = int(256 * width_multiple)
        scaled_512 = int(512 * width_multiple)
        scaled_1024 = int(1024 * width_multiple)
        
        self.positional_embedding = nn.Parameter(positional_embedding(20*20, input_size), requires_grad=False)

        self.deformable_attn = DeformableAttention(
            dim = input_size,
            dim_head = input_size // 8,               # dimension per head
            heads = 8,
            dropout = 0.,
            downsample_factor = 4,       # downsample factor (r in paper)
            offset_scale = 4,            # scale of offset, maximum offset
            offset_groups = None,        # number of offset groups, should be multiple of heads
            offset_kernel_size = 6,
        )
    
    def forward(self, x: T.Tensor) -> T.Tensor:
        raise NotImplementedError()
        x = x.reshape(x.shape[0], x.shape[1], int(math.sqrt(x.shape[2])), int(math.sqrt(x.shape[2])))
        out = self.deformable_attn(x)
        return out

class FullModel(nn.Module):
    def __init__(self, 
                 neck_output_size,
                 head_output_size,
                 head_output_length,
                 depth_multiple = 0.33,
                 width_multiple = 0.25
                ):
        super(type(self), self).__init__()

        self.backbone = Backbone(depth_multiple, width_multiple)
        self.neck = UpsampleTransformerNeck(neck_output_size, depth_multiple, width_multiple)
        
        self.head = SimpleTransformerHead(neck_output_size, head_output_size, head_output_length, depth_multiple)
        
        print(f"running with neck: {self.neck.__class__}")

    def forward(self, x: T.Tensor):
        
        neck_out = self.neck(*self.backbone(x))
        
        return self.head(neck_out)

In [173]:


X = T.rand(1, 3, 640, 640).to("cuda")
net = FullModel(512, 3, 98, 0.25).to("cuda").eval()


running with neck: <class '__main__.UpsampleTransformerNeck'>
torch.Size([1, 98, 3])
model size: 3.383514
fps: 218.67487831943072


## Interesting Models

In [2]:
most_interesting_models = {
    "regnet_x_400mf": models.get_model("regnet_x_400mf").to("cuda"),
    "regnet_x_800mf": models.get_model("regnet_x_800mf").to("cuda"),
    "regnet_y_400mf": models.get_model("regnet_y_400mf").to("cuda"),
    "regnet_y_800mf": models.get_model("regnet_y_800mf").to("cuda"),
    "vit_b_32": models.get_model("vit_b_32").to("cuda"),
}

for model_name in most_interesting_models:
    print(model_name)
    model_speedtest(most_interesting_models[model_name], (10, 3, 224, 224))
    print()

regnet_x_400mf
speedtest engaged ...
number of parameters: 5.495975971221924 M
test completed
fps: 121.81525505151455

regnet_x_800mf
speedtest engaged ...
number of parameters: 7.259655952453613 M
test completed
fps: 95.65943772882376

regnet_y_400mf
speedtest engaged ...
number of parameters: 4.344143867492676 M
test completed
fps: 97.7023625974944

regnet_y_800mf
speedtest engaged ...
number of parameters: 6.432511806488037 M
test completed
fps: 92.50598632228807

vit_b_32
speedtest engaged ...
number of parameters: 88.22423553466797 M
test completed
fps: 63.93281645726603



## PyTorch Pretrained Models Speedtest

### All Models

In [36]:
classiciation_models = models.list_models()

for model_name in classiciation_models:
    print(model_name)
    model_speedtest(models.get_model(model_name).to("cuda"), (1, 3, 224, 224))
    print()

vit_b_16
speedtest engaged ...
number of parameters: 86.56765747070312 M
test completed
fps: 112.87943783821194

vit_b_32
speedtest engaged ...
number of parameters: 88.22423553466797 M
test completed
fps: 226.80375376224916

vit_h_14
speedtest engaged ...
number of parameters: 632.0458374023438 M
test completed
fps: 14.448708363441009

vit_l_16
speedtest engaged ...
number of parameters: 304.3266296386719 M
test completed
fps: 35.05876075562481

vit_l_32
speedtest engaged ...
number of parameters: 306.535400390625 M
test completed
fps: 105.79344269428543

wide_resnet101_2
speedtest engaged ...
number of parameters: 126.8866958618164 M
test completed
fps: 50.2993246548358

wide_resnet50_2
speedtest engaged ...
number of parameters: 68.88323974609375 M
test completed
fps: 101.39064082574481



### Semantic Segmentation

In [5]:
segmentation_models = {
    "fcn_resnet50": models.segmentation.fcn_resnet50(weights=models.segmentation.FCN_ResNet50_Weights.COCO_WITH_VOC_LABELS_V1),
    "fcn_resnet101": models.segmentation.fcn_resnet101(weights=models.segmentation.FCN_ResNet101_Weights.COCO_WITH_VOC_LABELS_V1),
    "deeplabv3_resnet50": models.segmentation.deeplabv3_resnet50(weights=models.segmentation.DeepLabV3_ResNet50_Weights.COCO_WITH_VOC_LABELS_V1),
    "deeplabv3_resnet101": models.segmentation.deeplabv3_resnet101(weights=models.segmentation.DeepLabV3_ResNet101_Weights.COCO_WITH_VOC_LABELS_V1),
    "deeplabv3_mobilenet_v3_large": models.segmentation.deeplabv3_mobilenet_v3_large(weights=models.segmentation.DeepLabV3_MobileNet_V3_Large_Weights.COCO_WITH_VOC_LABELS_V1),
    "lraspp_mobilenet_v3_large": models.segmentation.lraspp_mobilenet_v3_large(weights=models.segmentation.LRASPP_MobileNet_V3_Large_Weights.COCO_WITH_VOC_LABELS_V1),
}

for model_name in segmentation_models:
    print(model_name)
    model_speedtest(segmentation_models[model_name].to("cuda"), (1, 3, 448, 448))
    print("")

fcn_resnet50
speedtest engaged ...
number of parameters: 35.32221603393555 M
test completed
fps: 30.42503202261378

fcn_resnet101
speedtest engaged ...
number of parameters: 54.3143424987793 M
test completed
fps: 18.950708659613525

deeplabv3_resnet50
speedtest engaged ...
number of parameters: 42.00407028198242 M
test completed
fps: 21.58508996294815

deeplabv3_resnet101
speedtest engaged ...
number of parameters: 60.99620056152344 M
test completed
fps: 13.971454170288638

deeplabv3_mobilenet_v3_large
speedtest engaged ...
number of parameters: 11.029328346252441 M
test completed
fps: 106.99429738664644

lraspp_mobilenet_v3_large
speedtest engaged ...
number of parameters: 3.2215380668640137 M
test completed
fps: 173.46716906285866



### Object detection

In [8]:
object_detection_models = {
    "fcos_resnet50_fpn": models.detection.fcos_resnet50_fpn(weights=models.detection.FCOS_ResNet50_FPN_Weights.COCO_V1),
    "fasterrcnn_mobilenet_v3_large_320_fpn": models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(weights=models.detection.FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.COCO_V1),
    "fasterrcnn_mobilenet_v3_large_fpn": models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights=models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights.COCO_V1),
    "fasterrcnn_resnet50_fpn_v2": models.detection.fasterrcnn_resnet50_fpn_v2(weights=models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1),
    "fasterrcnn_resnet50_fpn": models.detection.fasterrcnn_resnet50_fpn(weights=models.detection.FasterRCNN_ResNet50_FPN_Weights.COCO_V1),
    "retinanet_resnet50_fpn_v2": models.detection.retinanet_resnet50_fpn_v2(weights=models.detection.RetinaNet_ResNet50_FPN_V2_Weights.COCO_V1),
    "retinanet_resnet50_fpn": models.detection.retinanet_resnet50_fpn(weights=models.detection.RetinaNet_ResNet50_FPN_Weights.COCO_V1),
    "ssd300_vgg16": models.detection.ssd300_vgg16(weights=models.detection.SSD300_VGG16_Weights.COCO_V1),
    "ssdlite320_mobilenet_v3_large": models.detection.ssdlite320_mobilenet_v3_large(weights=models.detection.SSDLite320_MobileNet_V3_Large_Weights.COCO_V1),
}
for model_name in object_detection_models:
    print(model_name)
    model_speedtest(object_detection_models[model_name].to("cuda"), (1, 3, 448, 448))
    print("")

### Instance Segmentation

In [2]:
instance_segmentation_models = {
    "maskrcnn_resnet50_fpn": models.detection.maskrcnn_resnet50_fpn(weights=models.detection.MaskRCNN_ResNet50_FPN_Weights.COCO_V1),
    "maskrcnn_resnet50_fpn_v2": models.detection.maskrcnn_resnet50_fpn_v2(weights=models.detection.MaskRCNN_ResNet50_FPN_V2_Weights.COCO_V1),
}
for model_name in instance_segmentation_models:
    print(model_name)
    model_speedtest(instance_segmentation_models[model_name].to("cuda"), (1, 3, 448, 448))
    print("")

maskrcnn_resnet50_fpn
speedtest engaged ...
number of parameters: 44.40139389038086 M
test completed
fps: 17.763518343754907

maskrcnn_resnet50_fpn_v2
speedtest engaged ...
number of parameters: 46.35940933227539 M
test completed
fps: 9.48244126087252



### Keypoint detection

In [None]:
keypoint_detection_models = {
    "keypointrcnn_resnet50_fpn": models.detection.keypointrcnn_resnet50_fpn(weights=models.detection.KeypointRCNN_ResNet50_FPN_Weights.COCO_V1),
}
for model_name in keypoint_detection_models:
    print(model_name)
    model_speedtest(keypoint_detection_models[model_name].to("cuda"), (1, 3, 224, 224))
    print("")

## Ultralytics YOLOv8

In [6]:
from ultralytics import YOLO

In [17]:
yolov8_model = YOLO("yolov8n-pose.pt").to("cuda")

In [14]:
time_ = 0

for _ in range(500):
    frame = T.rand(1, 3, 640, 640).to("cuda")

    dt = time.time()
    
    yolov8_model.track(frame, persist=True)
    T.cuda.synchronize()

    time_ += (time.time() - dt)


0: 640x640 (no detections), 68.0ms
Speed: 0.0ms preprocess, 68.0ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 19.0ms
Speed: 0.0ms preprocess, 19.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 16.0ms
Speed: 0.0ms preprocess, 16.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 15.0ms
Speed: 0.0ms preprocess, 15.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 15.0ms
Speed: 0.0ms preprocess, 15.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 14.0ms
Speed: 0.0ms preprocess, 14.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 14.0ms
Speed: 0.0ms preprocess, 14.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 14.0ms
Speed: 0.0ms preprocess, 14.0ms i

error: OpenCV(4.8.0) :-1: error: (-5:Bad argument) in function 'cvtColor'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'


In [15]:
1 / (time_ / 500)

233.60752658527196

## Facer

In [19]:
import facer

In [35]:
facer_models = {
    # "retinaface/mobilenet": facer.face_detector("retinaface/mobilenet", device="cuda"),
    # "retinaface/resnet": facer.face_detector("retinaface/resnet50", device="cuda"),
    "farl/lapa/448": facer.face_parser("farl/lapa/448", device="cuda"),
    "farl/celebm/448": facer.face_parser("farl/celebm/448", device="cuda"),
    "farl/ibug300w/448": facer.face_aligner("farl/ibug300w/448", device="cuda"),
    "farl/aflw19/448": facer.face_aligner("farl/aflw19/448", device="cuda"),
    "farl/wflw/448": facer.face_aligner("farl/wflw/448", device="cuda"),
    "farl/celeba/224": facer.face_attr("farl/celeba/224", device="cuda")
}