In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
%cd drive/'My Drive'/action_recognition/computervision-recipes/contrib/action_recognition/r2p1d

/content/drive/My Drive/action_recognition/computervision-recipes/contrib/action_recognition/r2p1d


In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
!pip install decord



In [0]:
!pip install einops



In [0]:
import os
import time
import sys

import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.cuda as cuda
import torch.nn as nn
import torchvision

from vu.data import show_batch, VideoDataset
from vu.models.r2plus1d import R2Plus1D 
from vu.utils import system_info

system_info()

3.6.9 (default, Apr 18 2020, 01:56:04) 
[GCC 8.4.0] 

PyTorch 1.5.0+cu101 

Torch-vision 0.6.0+cu101 

Available devices:
0: Tesla P100-PCIE-16GB


#**Model configurations**

In [0]:
DATA_ROOT = os.path.join("data", "afl")
VIDEO_DIR = os.path.join(DATA_ROOT, "videos")
TRAIN_SPLIT = os.path.join(DATA_ROOT, "afl_vid_train_split.txt")
TEST_SPLIT = os.path.join(DATA_ROOT, "afl_vid_val_split.txt")

In [0]:
MODEL_INPUT_SIZE = 32
BATCH_SIZE = 4

r2plus1d_cfgs = dict(
    num_classes=4,
    video_dir=VIDEO_DIR,
    train_split=TRAIN_SPLIT,
    valid_split=TEST_SPLIT,
    base_model='ig65m',
    sample_length=MODEL_INPUT_SIZE,     
    sample_step=1,       
    im_scale=128,         
    mean=(0.43216, 0.394666, 0.37645),
    std=(0.22803, 0.22145, 0.216989),
    random_shift=True,
    temporal_jitter_step=2,   
    flip_ratio=0.5,
    random_crop=True,
    video_ext='mp4',
)

train_cfgs = dict(
    mixed_prec=False,
    batch_size=BATCH_SIZE,
    grad_steps=2,
    lr=0.001,        
    momentum=0.95,
    warmup_pct=0.3,  
    lr_decay_factor=0.001,
    weight_decay=0.0001,
    epochs=48, 
    model_name='afl',
    model_dir=os.path.join("checkpoints", "ig65m_afl"),
)

In [0]:
learn = R2Plus1D(r2plus1d_cfgs)

Loading r2plus1d_34_32_ig65m model


Using cache found in /root/.cache/torch/hub/moabitcoin_ig65m-pytorch_master


In [0]:
learn.model

VideoResNet(
  (stem): R2Plus1dStem(
    (0): Conv3d(3, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
    (1): BatchNorm3d(45, eps=0.001, momentum=0.9, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv3d(45, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
    (4): BatchNorm3d(64, eps=0.001, momentum=0.9, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv2Plus1D(
          (0): Conv3d(64, 144, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
          (1): BatchNorm3d(144, eps=0.001, momentum=0.9, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
          (3): Conv3d(144, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
        )
        (1): BatchNorm3d(64, eps=0.001, momentum=0.9, affine=True, track_running_stats=Tru

In [0]:
learn.show_batch(num_samples=3)

Output hidden; open in https://colab.research.google.com to view.

#**Train model**

In [0]:
learn.fit(train_cfgs)

Params to learn:
	full network
lr=[3.9999999999999996e-05]


  "See the documentation of nn.Upsample for details.".format(mode))


train took 72.63 sec: loss = 1.4304, top1_acc = 30.9524, top5_acc = 73.8095
valid took 20.65 sec: loss = 1.4532, top1_acc = 22.2222, top5_acc = 72.2222
lr=[5.313140544408986e-05]
train took 43.82 sec: loss = 1.3948, top1_acc = 33.3333, top5_acc = 80.9524
valid took 21.02 sec: loss = 1.4294, top1_acc = 22.2222, top5_acc = 72.2222
lr=[9.180714757245454e-05]
train took 43.76 sec: loss = 1.3399, top1_acc = 40.4762, top5_acc = 78.5714
valid took 20.69 sec: loss = 1.3586, top1_acc = 33.3333, top5_acc = 77.7778
lr=[0.00015391111450701667]
train took 42.94 sec: loss = 1.2324, top1_acc = 50.0000, top5_acc = 80.9524
valid took 20.41 sec: loss = 1.2450, top1_acc = 50.0000, top5_acc = 94.4444
lr=[0.00023604533804150933]
train took 48.05 sec: loss = 1.0775, top1_acc = 73.8095, top5_acc = 92.8571
valid took 20.40 sec: loss = 1.0166, top1_acc = 55.5556, top5_acc = 88.8889
lr=[0.0003337159107173202]
train took 45.60 sec: loss = 0.9669, top1_acc = 78.5714, top5_acc = 100.0000
valid took 20.35 sec: loss

In [0]:
learn.load("afl" + "_046", "checkpoints/ig65m_afl")

# **Testing**

In [0]:
device = torch.device("cuda")

In [0]:
num_segments = 10
test_ds = VideoDataset(
    split_file=r2plus1d_cfgs['valid_split'],
    video_dir=r2plus1d_cfgs['video_dir'],
    num_segments=num_segments,
    sample_length=r2plus1d_cfgs['sample_length'],
    sample_step=1,
    input_size=112,
    im_scale=r2plus1d_cfgs['im_scale'],
    resize_keep_ratio=True,
    mean=r2plus1d_cfgs['mean'],
    std=r2plus1d_cfgs['std'],
    random_shift=False,
    temporal_jitter=False,
    flip_ratio=0.0,
    random_crop=False,
    random_crop_scales=None,
    video_ext=r2plus1d_cfgs['video_ext'],
)

In [0]:
show_batch(
    test_ds[0][0],
    r2plus1d_cfgs['sample_length'],
    r2plus1d_cfgs['mean'],
    r2plus1d_cfgs['std']
)

Output hidden; open in https://colab.research.google.com to view.

In [0]:
model = learn.model
model.to(device)

model.eval()
infer_times = []
video_preds = []
video_trues = []
clip_preds = []
clip_trues = []

report_every = 100
with torch.no_grad():
    for i, (inputs, label) in enumerate(test_ds, start=1):
        if i % report_every == 0:
            print("{} samples have processed".format(i))
        
        inputs = inputs.to(device, non_blocking=True)
        
        start_time = time.time()
        outputs = model(inputs)
        infer_time = time.time() - start_time
        
        outputs = outputs.cpu().numpy()
        
        infer_times.append(infer_time)
        video_preds.append(outputs.sum(axis=0).argmax())
        video_trues.append(label)
        clip_preds.extend(outputs.argmax(axis=1))
        clip_trues.extend([label] * num_segments)
        
print("Done! {} samples have processed".format(len(test_ds)))

print("Avg. inference time per video (10 clips) =", np.array(infer_times).mean() * 1000, "ms")
print("Video prediction accuracy =", accuracy_score(video_trues, video_preds))
print("Clip prediction accuracy =", accuracy_score(clip_trues, clip_preds))



Done! 18 samples have processed
Avg. inference time per video (10 clips) = 8.222050136990019 ms
Video prediction accuracy = 0.8888888888888888
Clip prediction accuracy = 0.8388888888888889
