# 0. imports

In [1]:
import yaml
import pathlib
import time

import torch
import torch.nn

import mnn.vision.image_size
import mnn.vision.models.vision_transformer.encoder.config as mnn_encoder_config
import mnn.vision.config as mnn_config
from mnn.vision.models.vision_transformer.e2e import MyVisionTransformer
from mnn.vision.models.vision_transformer.tasks.object_detection import (
    ObjectDetectionOrdinalHead,
)

# 1. UTILITIES


In [2]:
def inference_test(image: torch.Tensor, model: torch.nn.Module):
    t0 = time.time()
    output = model(image)
    t1 = time.time()
    print("Time taken:", t1 - t0, "seconds")
    print("Model's output shape:", output.shape)
    traced_model = torch.jit.trace(model.forward, image, check_trace=True, strict=True)
    return traced_model


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def read_yaml_file(file_path: pathlib.Path) -> dict:
    with file_path.open(mode="r") as f:
        # Python 3.11 need Loader
        return yaml.load(f, Loader=yaml.FullLoader)

# 3. MODEL DEFINITION


## 3.1 Configuration

In [8]:
""" CONFIGURATION """
def load_model_config(yaml_path: pathlib.Path):
    model_config_as_dict = read_yaml_file(yaml_path)
    model_config = mnn_encoder_config.MyBackboneVitConfiguration.from_dict(
        model_config_as_dict["network"]["backbone"]
    )
    encoder_config = model_config.encoder_config
    head_config = mnn_encoder_config.VisionTransformerEncoderConfiguration.from_dict(
        model_config_as_dict["network"]["head"]["VisionTransformerHead"]
    )
    return model_config, encoder_config, head_config

def load_hyperparameters_config(yaml_path: pathlib.Path):
    hyperparameters_config_as_dict = read_yaml_file(yaml_path)
    hyperparameters_config = mnn_config.HyperparametersConfiguration.from_dict(hyperparameters_config_as_dict)
    return hyperparameters_config


model_config, encoder_config, head_config = load_model_config(pathlib.Path("model.yaml"))
hyperparameters_config = load_hyperparameters_config(pathlib.Path("hyperparameters.yaml"))

batch_size = hyperparameters_config.batch_size
embedding_size = model_config.rgb_combinator_config.d_model
sequence_length = model_config.rgb_combinator_config.feed_forward_dimensions
image_size = mnn.vision.image_size.ImageSize(width=embedding_size, height=sequence_length)

hidden_dim = embedding_size
image_RGB = torch.rand(batch_size, 3, image_size.height, image_size.width) * 255


## 3.2 NETWORK DEFINITION

In [4]:
class VitObjectDetectionNetwork(torch.nn.Module):

    def __init__(
        self,
        model_config: mnn_encoder_config.MyBackboneVitConfiguration,
        head_config: mnn_encoder_config.VisionTransformerEncoderConfiguration,
    ):
        super().__init__()
        expected_image_width = model_config.encoder_config.d_model
        expected_image_height = model_config.encoder_config.feed_forward_dimensions
        self.expected_image_size = mnn.vision.image_size.ImageSize(
            width=expected_image_width, height=expected_image_height
        )
        self.encoder = MyVisionTransformer(model_config, image_size)
        self.head = ObjectDetectionOrdinalHead(config=head_config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.encoder(x)
        x = self.head(x)
        return x


import mnn.vision.dataset.utilities
import mnn.vision.models.heads.object_detection


## 3.3 MODEL UTILITIES

In [5]:
object_detection_model = VitObjectDetectionNetwork(
    model_config=model_config,
    head_config=head_config
)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

object_detection_model.to(device=device, dtype=hyperparameters_config.floating_point_precision)

VitObjectDetectionNetwork(
  (encoder): MyVisionTransformer(
    (rgb_combinator): RGBCombinator(
      (encoder): RawVisionTransformerRGBEncoder(
        (multi_channels_encoder): ModuleList(
          (0-2): 3 x RawVisionTransformerEncoder(
            (positional_encoder): MyVisionPositionalEncoding()
            (encoder_block): TransformerEncoder(
              (layers): ModuleList(
                (0): TransformerEncoderLayer(
                  (self_attn): MultiheadAttention(
                    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
                  )
                  (linear1): Linear(in_features=512, out_features=720, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                  (linear2): Linear(in_features=720, out_features=512, bias=True)
                  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=Tru

# 4. TRAIN

## 4.1 Load Data

In [6]:
import mnn.vision.dataset.coco.loader
import pathlib

dataset_dir = pathlib.Path("/home/emvasilopoulos/projects/ml_interview_ready_code/data/coco/")

train_dataset = mnn.vision.dataset.coco.loader.COCODatasetInstances2017(dataset_dir, "train", object_detection_model.expected_image_size)
val_dataset = mnn.vision.dataset.coco.loader.COCODatasetInstances2017(dataset_dir, "val", object_detection_model.expected_image_size)

In [9]:
train_loader = torch.utils.data.DataLoader(val_dataset, batch_size=hyperparameters_config.batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=hyperparameters_config.batch_size, shuffle=True)

In [10]:
def train_one_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: VitObjectDetectionNetwork,
    optimizer: torch.optim.Optimizer,
    loss_fn: torch.nn.Module,
    hyperparameters_config: mnn_config.HyperparametersConfiguration,
) -> None:
    model.train()  # important for batch normalization and dropout layers
    for i, (image_batch, target0) in enumerate(train_loader):
        image_batch = image_batch.to(
            device=device, dtype=hyperparameters_config.floating_point_precision
        )
        target0 = target0.to(
            device=device, dtype=hyperparameters_config.floating_point_precision
        )
        
        # Zero your gradients for every batch!
        optimizer.zero_grad()
        
        # Forward pass
        output = model(image_batch)

        # Compute the loss and its gradients
        loss = loss_fn(output, target0)
        loss.backward()
        
        # Adjust learning weights
        optimizer.step()
        if i % 10 == 0:
            print(f"Step {i}, loss: {loss.item()}")


def val_once(val_loader, model, loss_fn, hyperparameters_config):
    model.eval()
    with torch.no_grad():
        for i, (image_batch, target0) in enumerate(val_loader):
            image_batch = image_batch.to(
                device=device, dtype=hyperparameters_config.floating_point_precision
            )
            target0 = target0.to(
                device=device, dtype=hyperparameters_config.floating_point_precision
            )
            output = model(image_batch)
            loss = loss_fn(output, target0)
            if i % 10 == 0:
                print(f"Validation step {i}, loss: {loss.item()}")


optimizer = torch.optim.Adam(
    object_detection_model.parameters(), lr=hyperparameters_config.learning_rate
)
loss_fn = torch.nn.BCELoss()
for epoch in range(hyperparameters_config.epochs):

    train_one_epoch(
        train_loader, object_detection_model, optimizer, loss_fn, hyperparameters_config
    )
    torch.save(object_detection_model.state_dict(), "exp1_object_detection.pth")
    val_once(val_loader, object_detection_model, loss_fn, hyperparameters_config)

Step 0, loss: 0.8032336831092834
Step 10, loss: 0.7492992877960205
Step 20, loss: 0.7297059297561646
Step 30, loss: 0.7152820229530334
Step 40, loss: 0.7098309993743896
Step 50, loss: 0.7016305923461914
Step 60, loss: 0.6968216300010681
Step 70, loss: 0.6901659965515137
Step 80, loss: 0.6797191500663757
Step 90, loss: 0.6756447553634644


IndexError: index 425 is out of bounds for dimension 0 with size 425

## 5.1 INFERENCE PROFILING


In [None]:
import mnn.visualize
import time

with torch.no_grad():
    for _ in range(10):
        image_size = object_detection_model.expected_image_size
        image_RGB = torch.rand(hyperparameters_config.batch_size, 3, image_size.height, image_size.width) * 255
        image_RGB = image_RGB.to(device=device, dtype=hyperparameters_config.floating_point_precision)
        t0 = time.time()
        output = object_detection_model(image_RGB)
        out = output.detach().cpu().numpy()
        t1 = time.time()
        print("Time taken:", t1 - t0, "seconds | image_shape:", image_RGB.shape, "output_shape:", output.shape)

Time taken: 0.1274416446685791 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time taken: 0.04546070098876953 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time taken: 0.04407024383544922 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time taken: 0.04363131523132324 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time taken: 0.038485050201416016 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time taken: 0.035089731216430664 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time taken: 0.034650564193725586 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time taken: 0.03783130645751953 seconds | image_shape: torch.Size([4, 3, 720, 512]) output_shape: torch.Size([4, 720, 512])
Time t

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

with profile(activities=[
        ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        object_detection_model(image_RGB)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us      39.239ms        55.23%      39.239ms      39.239ms             1  
                                        model_inference        24.98%      10.571ms        88.90%      37.622ms      37.622ms       0.000us         0.00%      33.072ms      33.072ms             1  
         

STAGE:2024-09-29 18:44:05 6925:6925 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-09-29 18:44:05 6925:6925 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-09-29 18:44:05 6925:6925 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


## 5.2 MODEL VISUALIZATION

In [None]:
mnn.visualize.make_dot(
    output, params=dict(object_detection_model.named_parameters())
).render("my_transformer", format="png")

'my_transformer.png'