# Downloads

In [2]:
!mkdir /content/mae_temp
!git clone https://github.com/OpenGVLab/VideoMAEv2 /content/mae_temp
!cp -rn /content/mae_temp/* /content/
!rm -rf /content/mae_temp

from google.colab import drive
drive.mount('/content/drive')

Cloning into '/content/mae_temp'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 133 (delta 26), reused 18 (delta 16), pack-reused 86[K
Receiving objects: 100% (133/133), 990.61 KiB | 3.09 MiB/s, done.
Resolving deltas: 100% (63/63), done.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install av
!pip install decord
!pip install deepspeed
!pip install einops
!pip install matplotlib
!pip install mpi4py
!pip install numpy
!pip install opencv-python
!pip install pandas
!pip install Pillow
!pip install scipy
!pip install tensorboard==2.9.0
!pip install tensorboardX==1.8
!pip install timm==0.4.12
!pip install torch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1
!pip install triton==1.0.0
!pip install utils

Collecting av
  Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-11.0.0
Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: decord
Successfully installed decord-0.6.0
Collecting deepspeed
  Downloading deepspeed-0.13.2.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hjson (from deepspeed)
  Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31

# Finetuning

In [None]:
!unzip -u "/content/drive/My Drive/k710_list.zip" -d "/content/data"
DATA_PATH = "/content/data/k710"

MODEL_PATH = '/content/drive/MyDrive/vit_g_hybrid_pt_1200e_k710_ft.pth'

!mkdir "/content/output"
OUTPUT_DIR = "content/output"

Archive:  /content/drive/My Drive/k710_list.zip
mkdir: cannot create directory ‘/content/output’: File exists


In [None]:
from argparse import Namespace

!python run_class_finetuning.py \
    --model vit_giant_patch14_224 \
    --data_set Kinetics-710 \
    --nb_classes 710 \
    --data_path $DATA_PATH \
    --finetune $MODEL_PATH \
    --log_dir $OUTPUT_DIR \
    --output_dir $OUTPUT_DIR \
    --batch_size 3 \
    --input_size 224 \
    --short_side_size 224 \
    --save_ckpt_freq 10 \
    --num_frames 16 \
    --sampling_rate 4 \
    --num_sample 2 \
    --num_workers 10 \
    --opt adamw \
    --lr 1e-3 \
    --drop_path 0.3 \
    --clip_grad 5.0 \
    --layer_decay 0.9 \
    --opt_betas 0.9 0.999 \
    --weight_decay 0.1 \
    --warmup_epochs 5 \
    --epochs 35 \
    --test_num_segment 5 \
    --test_num_crop 3 \
    --dist_eval --enable_deepspeed

[2024-01-17 01:02:42,376] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
usage: VideoMAE fine-tuning and evaluation script for action classification
       [--batch_size BATCH_SIZE] [--epochs EPOCHS] [--update_freq UPDATE_FREQ]
       [--save_ckpt_freq SAVE_CKPT_FREQ] [--model MODEL] [--tubelet_size TUBELET_SIZE]
       [--input_size INPUT_SIZE] [--with_checkpoint] [--drop PCT] [--attn_drop_rate PCT]
       [--drop_path PCT] [--head_drop_rate PCT] [--disable_eval_during_finetuning] [--model_ema]
       [--model_ema_decay MODEL_EMA_DECAY] [--model_ema_force_cpu] [--opt OPTIMIZER]
       [--opt_eps EPSILON] [--opt_betas BETA [BETA ...]] [--clip_grad NORM] [--momentum M]
       [--weight_decay WEIGHT_DECAY] [--weight_decay_end WEIGHT_DECAY_END] [--lr LR]
       [--layer_decay LAYER_DECAY] [--warmup_lr LR] [--min_lr LR] [--warmup_epochs N]
       [--warmup_steps N] [--color_jitter PCT] [--num_sample NUM_SAMPLE] [--aa NAME]
       [--smoothing 


# Model Inference

In [None]:
import argparse
from argparse import Namespace
import datetime
import json
import os
import random
import time
from collections import OrderedDict
from functools import partial
from pathlib import Path

import deepspeed
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from timm.data.mixup import Mixup
from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
from timm.models import create_model
from timm.utils import ModelEma

# NOTE: Do not comment `import models`, it is used to register models
import models  # noqa: F401
import utils
from dataset import build_dataset
from engine_for_finetuning import (
    final_test,
    merge,
    train_one_epoch,
    validation_one_epoch,
)
from optim_factory import (
    LayerDecayValueAssigner,
    create_optimizer,
    get_parameter_groups,
)
from utils import NativeScalerWithGradNormCount as NativeScaler
from utils import multiple_samples_collate

# Hand-built stand-in for the argparse result of run_class_finetuning.py, so
# the repository helpers that expect an `args` object can be called from the
# notebook. The values match the flags used in the fine-tuning cell; the
# remaining fields are presumably the script's defaults - verify against the
# script if it is updated.
# Filesystem locations are absolute so they do not depend on the kernel's
# current working directory (they previously pointed at a relative
# "content/output", which is not the /content/output directory created earlier).
args = Namespace(
    # --- training loop / checkpointing ---
    batch_size=3,
    epochs=35,
    update_freq=1,
    save_ckpt_freq=10,
    # --- model ---
    model='vit_giant_patch14_224',
    tubelet_size=2,
    input_size=224,
    with_checkpoint=False,
    drop=0.0,
    attn_drop_rate=0.0,
    drop_path=0.3,
    head_drop_rate=0.0,
    disable_eval_during_finetuning=False,
    model_ema=False,
    model_ema_decay=0.9999,
    model_ema_force_cpu=False,
    # --- optimizer / schedule ---
    opt='adamw',
    opt_eps=1e-08,
    opt_betas=[0.9, 0.999],
    clip_grad=5.0,
    momentum=0.9,
    weight_decay=0.1,
    weight_decay_end=None,
    lr=0.001,
    layer_decay=0.9,
    warmup_lr=1e-08,
    min_lr=1e-06,
    warmup_epochs=5,
    warmup_steps=-1,
    # --- augmentation / evaluation sampling ---
    color_jitter=0.4,
    num_sample=2,
    aa='rand-m7-n4-mstd0.5-inc1',
    smoothing=0.1,
    train_interpolation='bicubic',
    crop_pct=None,
    short_side_size=224,
    test_num_segment=5,
    test_num_crop=3,
    reprob=0.25,
    remode='pixel',
    recount=1,
    resplit=False,
    mixup=0.8,
    cutmix=1.0,
    cutmix_minmax=None,
    mixup_prob=1.0,
    mixup_switch_prob=0.5,
    mixup_mode='batch',
    # --- checkpoint to load ---
    finetune='/content/drive/MyDrive/vit_g_hybrid_pt_1200e_k710_ft.pth',
    model_key='model|module',
    model_prefix='',
    init_scale=0.001,
    use_mean_pooling=True,
    # --- dataset ---
    data_path='/content/data/k710',
    data_root='',
    eval_data_path=None,
    nb_classes=710,
    imagenet_default_mean_and_std=True,
    num_segments=1,
    num_frames=16,
    sampling_rate=4,
    sparse_sample=False,
    data_set='Kinetics-710',
    fname_tmpl='img_{:05}.jpg',
    start_idx=1,
    # --- output locations (absolute) ---
    output_dir='/content/output',
    log_dir='/content/output',
    # --- runtime ---
    device='cuda',
    seed=0,
    resume='',
    auto_resume=True,
    save_ckpt=True,
    start_epoch=0,
    eval=False,
    validation=False,
    dist_eval=True,
    num_workers=10,
    pin_mem=True,
    world_size=1,
    local_rank=-1,
    dist_on_itp=False,
    dist_url='env://',
    # --- deepspeed ---
    enable_deepspeed=True,
    deepspeed=False,
    deepspeed_config='/content/output/deepspeed_config.json',
    deepscale=False,
    deepscale_config=None,
    deepspeed_mpi=False,
    distributed=False
  )

[2024-02-19 23:53:40,124] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# Keyword arguments for the timm model factory, all taken from `args`.
# all_frames is num_frames * test_num_segment (16 * 5 = 80 with the args above).
# NOTE(review): the model is sized by test_num_segment here rather than
# num_segments - confirm that an 80-frame model is what is intended.
model_kwargs = dict(
    img_size=args.input_size,
    pretrained=False,
    num_classes=args.nb_classes,
    all_frames=args.num_frames * args.test_num_segment,
    tubelet_size=args.tubelet_size,
    drop_rate=args.drop,
    drop_path_rate=args.drop_path,
    attn_drop_rate=args.attn_drop_rate,
    head_drop_rate=args.head_drop_rate,
    drop_block_rate=None,
    use_mean_pooling=args.use_mean_pooling,
    init_scale=args.init_scale,
    with_cp=args.with_checkpoint,
)

# Instantiate the architecture named by args.model (no pretrained weights yet).
trained_model = create_model(args.model, **model_kwargs)

Stats

In [None]:
# Count the elements of every parameter that takes part in gradient updates.
n_parameters = 0
for param in trained_model.parameters():
  if param.requires_grad:
    n_parameters += param.numel()

# Layer count as reported by the model itself.
n_layers = trained_model.get_num_layers()

print(f'num parameters: {n_parameters}')
print(f'num layers: {n_layers}')

num parameters: 1012611142
num layers: 40


In [None]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints from a trusted source.
checkpoint = torch.load(args.finetune, map_location='cpu')

# Pick the weights out of the checkpoint using the candidate keys listed in
# args.model_key ('model|module'), instead of assuming a "module" entry.
# If none of the candidates is present, the file is treated as a bare
# state dict.
checkpoint_model = None
for model_key in args.model_key.split('|'):
  if model_key in checkpoint:
    checkpoint_model = checkpoint[model_key]
    break
if checkpoint_model is None:
  checkpoint_model = checkpoint

state_dict = trained_model.state_dict()

# Normalise parameter names in a single pass:
#   1. drop a leading '_orig_mod.' wrapper prefix, then
#   2. drop a leading 'backbone.' or 'encoder.' prefix,
# so the names can line up with the ones `trained_model` expects.
new_dict = OrderedDict()
for key, value in checkpoint_model.items():
  if key.startswith('_orig_mod.'):
    key = key[len('_orig_mod.'):]
  if key.startswith('backbone.'):
    key = key[len('backbone.'):]
  elif key.startswith('encoder.'):
    key = key[len('encoder.'):]
  new_dict[key] = value
checkpoint_model = new_dict

In [None]:
# Copy the remapped checkpoint weights into the freshly built model.
utils.load_state_dict(trained_model, checkpoint_model, prefix=args.model_prefix)

# Parameters the model asks to keep out of weight decay.
no_decay_names = trained_model.no_weight_decay()

# Optimizer built from `args`; no per-layer id or scale callbacks are supplied.
optimizer = create_optimizer(
    args,
    trained_model,
    skip_list=no_decay_names,
    get_num_layer=None,
    get_layer_scale=None,
)

loss_scaler = NativeScaler()

# Let the repository helper restore state according to `args`
# (args.auto_resume is True, args.resume is empty). The bare model is passed
# both as `model` and as `model_without_ddp`, and no EMA model is used.
utils.auto_load_model(
    args=args,
    model=trained_model,
    model_without_ddp=trained_model,
    optimizer=optimizer,
    loss_scaler=loss_scaler,
    model_ema=None,
)

# Last expression of the cell: display the module tree.
trained_model

Param groups = {
  "decay": {
    "weight_decay": 0.1,
    "params": [
      "patch_embed.proj.weight",
      "blocks.0.attn.qkv.weight",
      "blocks.0.attn.proj.weight",
      "blocks.0.mlp.fc1.weight",
      "blocks.0.mlp.fc2.weight",
      "blocks.1.attn.qkv.weight",
      "blocks.1.attn.proj.weight",
      "blocks.1.mlp.fc1.weight",
      "blocks.1.mlp.fc2.weight",
      "blocks.2.attn.qkv.weight",
      "blocks.2.attn.proj.weight",
      "blocks.2.mlp.fc1.weight",
      "blocks.2.mlp.fc2.weight",
      "blocks.3.attn.qkv.weight",
      "blocks.3.attn.proj.weight",
      "blocks.3.mlp.fc1.weight",
      "blocks.3.mlp.fc2.weight",
      "blocks.4.attn.qkv.weight",
      "blocks.4.attn.proj.weight",
      "blocks.4.mlp.fc1.weight",
      "blocks.4.mlp.fc2.weight",
      "blocks.5.attn.qkv.weight",
      "blocks.5.attn.proj.weight",
      "blocks.5.mlp.fc1.weight",
      "blocks.5.mlp.fc2.weight",
      "blocks.6.attn.qkv.weight",
      "blocks.6.attn.proj.weight",
      "blocks.6.m

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 1408, kernel_size=(2, 14, 14), stride=(2, 14, 14))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1408, out_features=4224, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1408, out_features=1408, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1408, out_features=6144, bias=True)
        (act): GELU(approximate=none)
        (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
  

Inference with Embedding Output

In [None]:
def embedding_output(trained_model, images):
  """Run the transformer trunk and return the token embeddings as a flat vector.

  The model is switched to eval mode, then `images` is passed through
  `patch_embed`, the positional embedding (when the model has one),
  `pos_drop` and every entry of `blocks`. No normalisation, pooling or
  classification head is applied.

  Args:
    trained_model: model exposing `patch_embed`, `pos_drop`, `blocks` and,
      optionally, `pos_embed`.
    images: input tensor accepted by `trained_model.patch_embed`.

  Returns:
    1-D tensor holding the flattened output of the last block. It is detached
    from autograd because the whole pass runs under `torch.no_grad()`.
  """
  trained_model.eval()

  # Inference only: without no_grad every block keeps its activations alive
  # for a backward pass that never happens.
  with torch.no_grad():
    output = trained_model.patch_embed(images)

    # NOTE(review): assumes the model's own forward pass adds `pos_embed`
    # between patch embedding and dropout, as the upstream VideoMAEv2
    # VisionTransformer does - confirm. Skipped when the attribute is
    # missing or None.
    pos_embed = getattr(trained_model, 'pos_embed', None)
    if pos_embed is not None:
      output = output + pos_embed.to(device=output.device, dtype=output.dtype)

    output = trained_model.pos_drop(output)

    for block in trained_model.blocks:
      output = block(output)

  return output.flatten()

# All-zero dummy clip with dimensions 1 x 3 x 80 x 224 x 224, used as a
# smoke-test input.
clip_shape = (1, 3, 80, 224, 224)
images = torch.zeros(clip_shape)

# Last expression of the cell: display the flattened embedding.
embedding_output(trained_model, images)

KeyboardInterrupt: 

Inference with Score Output

In [None]:
trained_model.eval()

# Build the dummy clip on whatever device the weights currently live on, so
# the forward pass does not fail with a device mismatch if the model has been
# moved.
model_device = next(trained_model.parameters()).device
images = torch.zeros([1, 3, 80, 224, 224], device=model_device)

# no_grad: this is pure inference, so skip autograd bookkeeping for the
# forward pass. autocast is kept from the original cell.
with torch.no_grad(), torch.cuda.amp.autocast():
  output = trained_model(images)

NameError: name 'trained_model' is not defined