<a href="https://colab.research.google.com/github/ericbanzuzi/Thesis-DL-LC/blob/main/notebooks/model_architectures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
from pathlib import Path

# Mount Google Drive into the Colab VM; force_remount=True requests a fresh mount
# even if Drive is already mounted (e.g. when this cell is re-run).
drive.mount('/content/drive', force_remount=True)
base = Path('/content/drive/MyDrive/thesis')  # working directory on the mounted Drive

Mounted at /content/drive


Copy the Git repository into the Colab runtime from a zip file on Drive for faster performance.

In [2]:
zip_path = base/'Thesis-DL-LC-main.zip'
!cp '{zip_path}' .
!unzip -q Thesis-DL-LC-main.zip
!rm Thesis-DL-LC-main.zip

In [3]:
# Make the unpacked repository root the working directory; the project-local
# `models` package imported later is presumably resolved relative to it.
%cd '/content/Thesis-DL-LC-main'

/content/Thesis-DL-LC-main


In [4]:
# imports
!pip install torchinfo
from torchinfo import summary
from models.models import R2Plus1D, MC4, ViViT

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.2


In [5]:
# Dimensions of the video batches used for the model summaries in this project
BATCH, FRAMES = 16, 32   # clips per batch, frames per clip
HEIGHT = WIDTH = 112     # frame height and width

## **R2Plus1D:**

In [6]:
# Build R2Plus1D with three output classes and print its layer-by-layer summary.
r2plus1d = R2Plus1D(num_classes=3)
summary(
    r2plus1d,
    # input shape laid out as (batch_size, color_channels, frames, height, width)
    input_size=(BATCH, 3, FRAMES, HEIGHT, WIDTH),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type (var_name))                       Input Shape          Output Shape         Param #              Trainable
R2Plus1D (R2Plus1D)                           [16, 3, 32, 112, 112] [16, 3]              --                   True
├─Conv2Plus1DFirst (conv1)                    [16, 3, 32, 112, 112] [16, 64, 32, 56, 56] --                   True
│    └─Conv3d (0)                             [16, 3, 32, 112, 112] [16, 45, 32, 56, 56] 6,660                True
│    └─BatchNorm3d (1)                        [16, 45, 32, 56, 56] [16, 45, 32, 56, 56] 90                   True
│    └─ReLU (2)                               [16, 45, 32, 56, 56] [16, 45, 32, 56, 56] --                   --
│    └─Conv3d (3)                             [16, 45, 32, 56, 56] [16, 64, 32, 56, 56] 8,704                True
│    └─BatchNorm3d (4)                        [16, 64, 32, 56, 56] [16, 64, 32, 56, 56] 128                  True
├─ReLU (relu1)                                [16, 64, 32, 56, 56] [16, 64, 32, 56

## **MC4:**

In [7]:
# Build MC4 with three output classes and print its layer-by-layer summary.
mc4 = MC4(num_classes=3)
summary(
    mc4,
    # input shape laid out as (batch_size, color_channels, frames, height, width)
    input_size=(BATCH, 3, FRAMES, HEIGHT, WIDTH),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
MC4 (MC4)                                [16, 3, 32, 112, 112] [16, 3]              --                   True
├─Sequential (conv1)                     [16, 3, 32, 112, 112] [16, 64, 32, 56, 56] --                   True
│    └─Conv3d (0)                        [16, 3, 32, 112, 112] [16, 64, 32, 56, 56] 28,288               True
│    └─BatchNorm3d (1)                   [16, 64, 32, 56, 56] [16, 64, 32, 56, 56] 128                  True
├─ReLU (relu1)                           [16, 64, 32, 56, 56] [16, 64, 32, 56, 56] --                   --
├─Conv3DResidualBlock (conv2_1)          [16, 64, 32, 56, 56] [16, 64, 32, 56, 56] --                   True
│    └─Sequential (seq)                  [16, 64, 32, 56, 56] [16, 64, 32, 56, 56] --                   True
│    │    └─Conv3d (0)                   [16, 64, 32, 56, 56] [16, 64, 32, 56, 56] 110,656              True
│    │    └─B

## **ViViT:**

In [8]:
# Build ViViT (8 transformer layers, 8 attention heads, three output classes)
# and print its layer-by-layer summary.
vivit = ViViT(num_transformer_layers=8, num_heads=8, num_classes=3)
summary(
    vivit,
    # input shape laid out as (batch_size, color_channels, frames, height, width)
    input_size=(BATCH, 3, FRAMES, HEIGHT, WIDTH),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
ViViT (ViViT)                                                [16, 3, 32, 112, 112] [16, 3]              --                   True
├─TubeletEmbedding (patch_embedding)                         [16, 3, 32, 112, 112] [16, 784, 768]       --                   True
│    └─Conv3d (patcher)                                      [16, 3, 32, 112, 112] [16, 768, 16, 7, 7]  1,180,416            True
│    └─Flatten (flatten)                                     [16, 768, 16, 7, 7]  [16, 768, 784]       --                   --
├─PositionalEncoder (position_embedding)                     [16, 3, 32, 112, 112] [16, 784, 768]       --                   --
├─Dropout (embedding_dropout)                                [16, 784, 768]       [16, 784, 768]       --                   --
├─Sequential (transformer_encoder)                           [16, 784, 768]       [16, 784, 76