# 0. imports

In [1]:
import yaml
import pathlib
import time

import torch
import torch.nn

import mnn.vision.image_size
import mnn.vision.models.vision_transformer.encoder.config as mnn_encoder_config
import mnn.vision.config as mnn_config
from mnn.vision.models.vision_transformer.e2e import MyVisionTransformer
from mnn.vision.models.vision_transformer.tasks.object_detection import (
    ObjectDetectionOrdinalHead,
)

# 1. UTILITIES


In [2]:
def inference_test(image: torch.Tensor, model: torch.nn.Module):
    """
    Time a single forward pass of `model` and return a traced version of it.

    Args:
        image: Example input. It is fed to `model` for the timed run and is
            reused as the example input for tracing.
        model: Module to time and trace.

    Returns:
        The object returned by `torch.jit.trace` for `model.forward`.
    """
    on_cuda = image.is_cuda

    # CUDA kernels are launched asynchronously: without a barrier before and
    # after the call, the timer would not cover the actual GPU work.
    if on_cuda:
        torch.cuda.synchronize(image.device)
    # perf_counter is monotonic and high resolution, unlike the wall clock.
    t0 = time.perf_counter()
    output = model(image)
    if on_cuda:
        torch.cuda.synchronize(image.device)
    t1 = time.perf_counter()

    print("Time taken:", t1 - t0, "seconds")
    print("Model's output shape:", output.shape)

    # check_trace re-runs the traced graph on the same input to verify it;
    # strict=True rejects mutable container outputs (lists/dicts).
    traced_model = torch.jit.trace(model.forward, image, check_trace=True, strict=True)
    return traced_model


def count_parameters(model):
    """Return how many scalar values `model` holds in parameters that require gradients."""
    trainable_total = 0
    for parameter in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if parameter.requires_grad:
            trainable_total += parameter.numel()
    return trainable_total


def read_yaml_file(file_path: pathlib.Path) -> dict:
    """
    Parse the YAML document stored at `file_path` and return the result.

    Args:
        file_path: Location of the YAML file; it is opened in text read mode.

    Returns:
        Whatever `yaml.load` produces for the document (annotated as a dict).
    """
    with file_path.open(mode="r") as yaml_stream:
        # yaml.load is called with an explicit Loader (the original note says
        # this is needed on Python 3.11).
        # NOTE(review): if these files can come from an untrusted source,
        # consider yaml.safe_load instead of FullLoader.
        parsed_document = yaml.load(yaml_stream, Loader=yaml.FullLoader)
    return parsed_document

# 3. MODEL DEFINITION


## 3.1 Configuration

In [3]:
""" CONFIGURATION """
def load_model_config(yaml_path: pathlib.Path):
    """
    Build the backbone and head configurations from a model YAML file.

    The file is expected to contain a top-level "network" mapping with a
    "backbone" entry and a "head" -> "VisionTransformerHead" entry.

    Returns:
        A tuple `(model_config, encoder_config, head_config)`, where
        `encoder_config` is read from `model_config.encoder_config`.
    """
    network_section = read_yaml_file(yaml_path)["network"]

    backbone_config = mnn_encoder_config.MyBackboneVitConfiguration.from_dict(
        network_section["backbone"]
    )
    backbone_encoder_config = backbone_config.encoder_config

    # The head reuses the encoder configuration class.
    vit_head_config = mnn_encoder_config.VisionTransformerEncoderConfiguration.from_dict(
        network_section["head"]["VisionTransformerHead"]
    )
    return backbone_config, backbone_encoder_config, vit_head_config

def load_hyperparameters_config(yaml_path: pathlib.Path):
    """
    Read a hyperparameters YAML file into a `HyperparametersConfiguration`.

    The whole parsed document (not a sub-section) is handed to `from_dict`.
    """
    return mnn_config.HyperparametersConfiguration.from_dict(
        read_yaml_file(yaml_path)
    )


# Both YAML files are resolved relative to the notebook's working directory.
model_config, encoder_config, head_config = load_model_config(
    pathlib.Path("model.yaml")
)
hyperparameters_config = load_hyperparameters_config(
    pathlib.Path("hyperparameters.yaml")
)

batch_size = hyperparameters_config.batch_size

# The input image geometry is derived from the RGB combinator settings:
# width <- d_model, height <- feed_forward_dimensions.
rgb_combinator_config = model_config.rgb_combinator_config
embedding_size = rgb_combinator_config.d_model
sequence_length = rgb_combinator_config.feed_forward_dimensions
image_size = mnn.vision.image_size.ImageSize(
    width=embedding_size, height=sequence_length
)

hidden_dim = embedding_size

# Random stand-in image batch with values in [0, 255), laid out as
# (batch, channels, height, width).
image_RGB = torch.rand(batch_size, 3, image_size.height, image_size.width) * 255


## 3.2 NETWORK DEFINITION

In [32]:
class VitObjectDetectionNetwork(torch.nn.Module):
    """
    Object-detection network: a `MyVisionTransformer` encoder followed by an
    `ObjectDetectionOrdinalHead`.

    Attributes set in `__init__`:
        expected_image_size: width is the encoder's `d_model`; height is
            stored as -1.
        encoder: the vision-transformer backbone.
        head: the detection head applied to the encoder output.
    """

    def __init__(
        self,
        model_config: mnn_encoder_config.MyBackboneVitConfiguration,
        head_config: mnn_encoder_config.VisionTransformerEncoderConfiguration,
        image_size: mnn.vision.image_size.ImageSize,
    ):
        super().__init__()
        # NOTE(review): height=-1 presumably means "not fixed" — confirm;
        # nothing in this class reads the stored height.
        self.expected_image_size = mnn.vision.image_size.ImageSize(
            width=model_config.encoder_config.d_model,
            height=-1,
        )
        self.encoder = MyVisionTransformer(model_config, image_size)
        self.head = ObjectDetectionOrdinalHead(config=head_config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass `x` through the encoder, then through the head."""
        encoded = self.encoder(x)
        return self.head(encoded)


import mnn.vision.dataset.utilities
import mnn.vision.models.heads.object_detection


def preprocess_image(x: torch.Tensor, expected_image_width: int) -> torch.Tensor:
    """
    Scale an image by 1/255 and resize it to the expected width.

    Expecting tensors of shape (3, H, W).

    When the incoming width differs from `expected_image_width`, the image is
    bilinearly resized to that width; the new height is chosen so that the
    width/height ratio is kept (truncated to an integer). Images that already
    have the expected width are only rescaled in value.

    The incoming and outgoing shapes are printed.
    """
    print("Incoming image shape:", x.shape)

    # Step 1: normalization of the pixel values.
    scaled = x / 255.0

    # Step 2: resize, only when the width does not already match.
    current_height, current_width = scaled.shape[1], scaled.shape[2]
    if current_width != expected_image_width:
        aspect_ratio = current_width / current_height
        target_size = (int(expected_image_width / aspect_ratio), expected_image_width)
        # interpolate works on batched input, so add a leading batch
        # dimension for the call and drop it again afterwards.
        scaled = torch.nn.functional.interpolate(
            scaled[None],
            size=target_size,
            mode="bilinear",
            align_corners=False,
        )[0]

    print("Outgoing image shape:", scaled.shape)
    return scaled


def bbox_annotation_to_mask(y: torch.Tensor, mask_shape: torch.Size) -> torch.Tensor:
    """
    Turn each entry of `y` (indexed along dimension 0) into a ground-truth
    mask of shape `mask_shape` and stack the masks along a new leading axis.

    NOTE(review): `y` is presumably (batch, n_boxes, 4) normalized rectangles
    — confirm against the dataset loader.
    """
    rectangles_to_mask = (
        mnn.vision.models.heads.object_detection.ObjectDetectionOrdinalTransformation.transform_ground_truth_from_normalized_rectangles
    )
    per_sample_masks = [
        rectangles_to_mask(mask_shape, y[sample_index, :])
        for sample_index in range(y.shape[0])
    ]
    return torch.stack(per_sample_masks)

## 3.3 MODEL INSTANTIATION

In [14]:
object_detection_model = VitObjectDetectionNetwork(
    model_config=model_config, head_config=head_config, image_size=image_size
)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move and cast the model's parameters; the call is left as the cell's last
# expression so the module structure is echoed as the cell output.
object_detection_model.to(
    device=device, dtype=hyperparameters_config.floating_point_precision
)

VitObjectDetectionNetwork(
  (encoder): MyVisionTransformer(
    (rgb_combinator): RGBCombinator(
      (encoder): RawVisionTransformerRGBEncoder(
        (multi_channels_encoder): ModuleList(
          (0-2): 3 x RawVisionTransformerEncoder(
            (positional_encoder): MyVisionPositionalEncoding()
            (encoder_block): TransformerEncoder(
              (layers): ModuleList(
                (0): TransformerEncoderLayer(
                  (self_attn): MultiheadAttention(
                    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
                  )
                  (linear1): Linear(in_features=512, out_features=1024, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                  (linear2): Linear(in_features=1024, out_features=512, bias=True)
                  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=T

## 3.4 INFERENCE PROFILING


In [16]:
import mnn.visualize
import time

# Ten timed forward passes on fresh random inputs, with autograd disabled.
with torch.no_grad():
    for _ in range(10):
        random_pixels = torch.rand(
            hyperparameters_config.batch_size, 3, image_size.height, image_size.width
        )
        image_RGB = (random_pixels * 255).to(
            device=device, dtype=hyperparameters_config.floating_point_precision
        )

        t0 = time.time()
        output = object_detection_model(image_RGB)
        # Copying the result back to host memory is part of the timed region.
        out = output.detach().cpu().numpy()
        t1 = time.time()

        print("Time taken:", t1 - t0, "seconds | image_shape:", image_RGB.shape, "output_shape:", output.shape)

Time taken: 0.17861294746398926 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1, 1024, 512])
Time taken: 0.019279003143310547 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1, 1024, 512])
Time taken: 0.019176244735717773 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1, 1024, 512])
Time taken: 0.02065300941467285 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1, 1024, 512])
Time taken: 0.021900415420532227 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1, 1024, 512])
Time taken: 0.02014613151550293 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1, 1024, 512])
Time taken: 0.020149946212768555 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1, 1024, 512])
Time taken: 0.020151376724243164 seconds | image_shape: torch.Size([1, 3, 1024, 512]) output_shape: torch.Size([1,

In [17]:
from torch.profiler import profile, record_function, ProfilerActivity

# Only request CUDA activity when a GPU is present: the notebook falls back
# to the CPU device otherwise, and there would be no CUDA work to record.
cuda_available = torch.cuda.is_available()
profiler_activities = [ProfilerActivity.CPU]
if cuda_available:
    profiler_activities.append(ProfilerActivity.CUDA)

# This section profiles inference, so autograd is disabled; otherwise the
# trace would also contain gradient-tracking overhead.
with torch.no_grad(), profile(activities=profiler_activities, record_shapes=True) as prof:
    with record_function("model_inference"):
        object_detection_model(image_RGB)

# Rank by GPU time when it was recorded, by CPU time otherwise.
sort_key = "cuda_time_total" if cuda_available else "cpu_time_total"
print(prof.key_averages().table(sort_by=sort_key, row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us      22.043ms        55.24%      22.043ms      22.043ms             1  
                                        model_inference        34.17%       9.041ms        84.34%      22.314ms      22.314ms       0.000us         0.00%      18.221ms      18.221ms             1  
         

STAGE:2024-09-28 23:16:26 226568:226568 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-09-28 23:16:26 226568:226568 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-09-28 23:16:26 226568:226568 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


## 3.5 MODEL VISUALIZATION

In [None]:
# The `output` left over from the timing cell was computed under
# torch.no_grad(), so it carries no autograd history. Run one forward pass
# with gradients enabled so there is a graph connecting the parameters to
# the output.
# NOTE(review): assumes make_dot walks the autograd graph of its first
# argument (torchviz-style) — confirm against mnn.visualize.
graph_output = object_detection_model(image_RGB)
mnn.visualize.make_dot(
    graph_output, params=dict(object_detection_model.named_parameters())
).render("my_transformer", format="png")

# 4. TRAIN

## 4.1 Load Data

In [6]:
import os
import pathlib

import torch
import mnn.vision.dataset.coco.loader

# The dataset root can be overridden with the COCO_DATASET_DIR environment
# variable so the notebook is not tied to one machine; the original location
# is kept as the default.
dataset_dir = pathlib.Path(
    os.environ.get(
        "COCO_DATASET_DIR",
        "/home/emvasilopoulos/projects/ml_interview_ready_code/data/coco/",
    )
)

train_dataset = mnn.vision.dataset.coco.loader.COCODatasetInstances2017(dataset_dir, "train")
val_dataset = mnn.vision.dataset.coco.loader.COCODatasetInstances2017(dataset_dir, "val")

In [7]:
# Training batches are drawn in a new random order every epoch.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=hyperparameters_config.batch_size, shuffle=True)
# Validation must read the held-out split (it previously wrapped
# train_dataset); shuffling is not needed for evaluation.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=hyperparameters_config.batch_size, shuffle=False)

In [31]:
def train_one_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: VitObjectDetectionNetwork,
    optimizer: torch.optim.Optimizer,
    loss_fn: torch.nn.Module,
    hyperparameters_config: mnn_config.HyperparametersConfiguration,
) -> None:
    """
    Run one optimization pass over `train_loader`.

    For every batch: each image is preprocessed individually and re-stacked,
    the first target is converted to masks matching the preprocessed image
    size, and one optimizer step is taken on `loss_fn(output, masks)`.
    The second target yielded by the loader is not used. Tensors are moved to
    the notebook-level `device`. The loss is printed every 10 steps.

    NOTE(review): the recorded run of this cell ended with a size mismatch
    (340 vs 1024 at dimension 1). Presumably the aspect-preserving resize
    yields a height different from the one the network was built for —
    confirm and reconcile the preprocessing with the model's input size.
    """
    model.train()  # training behaviour for dropout / normalization layers
    target_width = model.expected_image_size.width
    compute_dtype = hyperparameters_config.floating_point_precision

    for step, (image_batch, target0, target1) in enumerate(train_loader):
        # preprocess_image handles one (3, H, W) image at a time.
        processed_images = []
        for sample_index in range(image_batch.shape[0]):
            processed_images.append(
                preprocess_image(image_batch[sample_index], expected_image_width=target_width)
            )
        image_batch = torch.stack(processed_images).to(device=device, dtype=compute_dtype)

        print(target0.shape)
        # The height is read after preprocessing because resizing may change it.
        resized_height = image_batch.shape[2]
        mask_size = torch.Size((resized_height, model.expected_image_size.width))
        target0 = bbox_annotation_to_mask(target0, mask_size).to(
            device=device, dtype=compute_dtype
        )

        optimizer.zero_grad()
        loss = loss_fn(model(image_batch), target0)
        loss.backward()
        optimizer.step()

        if step % 10 == 0:
            print(f"Step {step}, loss: {loss.item()}")

def val_once(val_loader, model, loss_fn, hyperparameters_config):
    """
    Evaluate `model` on every batch of `val_loader` without tracking gradients.

    Mirrors the data handling of `train_one_epoch`: each image of the batch
    is preprocessed individually and re-stacked, the first target is turned
    into masks of the preprocessed image size, and the loss is computed as
    `loss_fn(output, masks)`. The second target yielded by the loader is not
    used. Tensors are moved to the notebook-level `device`. The loss is
    printed every 10 steps.

    Args:
        val_loader: Iterable yielding `(image_batch, target0, target1)`.
        model: Network exposing `expected_image_size.width`.
        loss_fn: Callable taking `(prediction, target)`.
        hyperparameters_config: Provides `floating_point_precision`.
    """
    model.eval()  # inference behaviour for dropout / normalization layers
    expected_width = model.expected_image_size.width
    with torch.no_grad():
        for i, (image_batch, target0, target1) in enumerate(val_loader):
            # preprocess_image expects a single (3, H, W) image; feeding it
            # the whole 4-D batch made it read the channel axis as height.
            image_batch = torch.stack(
                [
                    preprocess_image(single_image, expected_image_width=expected_width)
                    for single_image in image_batch
                ]
            )
            image_batch = image_batch.to(
                device=device, dtype=hyperparameters_config.floating_point_precision
            )
            image_height = image_batch.shape[2]  # it could be resized
            print(target0.shape)
            target0 = bbox_annotation_to_mask(
                target0, torch.Size((image_height, expected_width))
            )
            target0 = target0.to(
                device=device, dtype=hyperparameters_config.floating_point_precision
            )

            output = model(image_batch)
            # Same two-argument call as in training; the loss passed in by
            # the notebook (BCELoss) does not accept a third positional tensor.
            loss = loss_fn(output, target0)
            if i % 10 == 0:
                print(f"Validation step {i}, loss: {loss.item()}")

# Adam over all parameters of the detection network, with the learning rate
# taken from the hyperparameters file.
optimizer = torch.optim.Adam(
    params=object_detection_model.parameters(),
    lr=hyperparameters_config.learning_rate,
)
loss_fn = torch.nn.BCELoss()

for epoch in range(hyperparameters_config.epochs):
    # Per epoch: train, overwrite the checkpoint file, then validate.
    train_one_epoch(
        train_loader=train_loader,
        model=object_detection_model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        hyperparameters_config=hyperparameters_config,
    )
    torch.save(object_detection_model.state_dict(), "exp1_object_detection.pth")
    val_once(
        val_loader=val_loader,
        model=object_detection_model,
        loss_fn=loss_fn,
        hyperparameters_config=hyperparameters_config,
    )

torch.Size([1, 35, 4])


RuntimeError: The size of tensor a (340) must match the size of tensor b (1024) at non-singleton dimension 1