In [6]:
!pip install PyAV

In [10]:
from google.colab import drive
import sys

drive.mount('/content/drive')

sys.path.append('/content/drive/MyDrive/dads-7202-final-project')

ModuleNotFoundError: No module named 'google.colab'

In [25]:
import os
import torch
from torch import nn
from torchvision.io import read_video
from torchvision.datasets import VisionDataset
from torch.utils.data import DataLoader
from torchvision.models.video import mvit_v2_s,MViT_V2_S_Weights
from torchvision.models import swin_v2_t,Swin_V2_T_Weights
from torchvision.models import resnet152,ResNet152_Weights
# from custom_dataset import CustomVidDataset
import pandas as pd
import numpy as np
from sklearn.metrics import  accuracy_score,precision_score,recall_score,f1_score,average_precision_score
from sklearn.preprocessing import MultiLabelBinarizer
import gc
import random

In [8]:
# fix seed
torch.manual_seed(12)
random.seed(12)
np.random.seed(12)

In [9]:
class CustomVidDataset(VisionDataset):
    def __init__(self, annotations_file, vid_dir, transform= None, target_transform=None):
        self.vid_labels = pd.read_csv(annotations_file)
        self.vid_dir = vid_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.vid_labels)

    def __getitem__(self, idx):

        vid_path = os.path.join(self.vid_dir, self.vid_labels.iloc[idx, 0])
        vid,_,_ = read_video(vid_path,pts_unit='sec',end_pts=10.0,output_format='TCHW')
        label = self.vid_labels.iloc[idx, 5:17]
        array_label = label.to_numpy()
        array_label = array_label.astype(int)
        trans_vid = vid[:16] # slice only first 16 frames
        if self.transform:
            trans_vid = self.transform(trans_vid)
        if self.target_transform:
            label = self.target_transform(array_label)
        return trans_vid, array_label

In [10]:
#Create Transform for preprocessing

#MViT
transforms_mvit = MViT_V2_S_Weights.KINETICS400_V1.transforms()
#Swin
transforms_swin = Swin_V2_T_Weights.IMAGENET1K_V1.transforms()
#Resnet152
transforms_resnet = ResNet152_Weights.IMAGENET1K_V2.transforms()

In [11]:
#Custom Dataset by Transform (Preprocess)
PATH = '/content/drive/MyDrive/dads-7202-final-project/'

def custom_data(model_type, transform):
    train_data = CustomVidDataset(PATH + 'data/train_label.csv', PATH + 'data/train', transform=transform)
    valid_data = CustomVidDataset(PATH + 'data/validate_label.csv', PATH + 'data/validate', transform=transform)
    test_data = CustomVidDataset(PATH + 'data/test_label.csv', PATH + 'data/test', transform=transform)
    return train_data, valid_data, test_data

#MViT
train_data_mvit, valid_data_mvit, test_data_mvit = custom_data('mvit', transforms_mvit)
#Swin
train_data_swin, valid_data_swin, test_data_swin = custom_data('swin', transforms_swin)
#Resnet152
train_data_resnet, valid_data_resnet, test_data_resnet = custom_data('resnet', transforms_resnet)


In [12]:
#Create Data Loader
def create_data_loader(dataset, batch_size=16, shuffle=True, num_workers=4):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

#MViT
train_dataset_mvit = create_data_loader(train_data_mvit)
valid_dataset_mvit = create_data_loader(valid_data_mvit)
test_dataset_mvit = create_data_loader(test_data_mvit)

#Swin
train_dataset_swin = create_data_loader(train_data_swin)
valid_dataset_swin = create_data_loader(valid_data_swin)
test_dataset_swin = create_data_loader(test_data_swin)

#Resnet152
train_dataset_resnet = create_data_loader(train_data_resnet)
valid_dataset_resnet = create_data_loader(valid_data_resnet)
test_dataset_resnet = create_data_loader(test_data_resnet)

In [13]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [24]:
#Set up Model (MViT)
model_mvit = mvit_v2_s(weights="KINETICS400_V1")
for param in model_mvit.parameters():
    param.requires_grad = False

for param in model_mvit.blocks[15].parameters():
    param.requires_grad = True

model_mvit.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(768,12),
            nn.Sigmoid()
        )
print(model_mvit)

MViT(
  (conv_proj): Conv3d(3, 96, kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))
  (pos_encoding): PositionalEncoding()
  (blocks): ModuleList(
    (0): MultiscaleBlock(
      (norm1): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (norm2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (attn): MultiscaleAttention(
        (qkv): Linear(in_features=96, out_features=288, bias=True)
        (project): Sequential(
          (0): Linear(in_features=96, out_features=96, bias=True)
        )
        (pool_q): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          )
        )
        (pool_k): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,

In [26]:
#Set up Model (Swin)
model_swin = swin_v2_t(weights="IMAGENET1K_V1")
for param in model_swin.parameters():
    param.requires_grad = False

for param in model_swin.features[7].parameters():
    param.requires_grad = True

model_swin.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(768,12),
            nn.Sigmoid()
        )
print(model_swin)

SwinTransformer(
  (features): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (1): Permute()
      (2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
    )
    (1): Sequential(
      (0): SwinTransformerBlockV2(
        (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (attn): ShiftedWindowAttentionV2(
          (qkv): Linear(in_features=96, out_features=288, bias=True)
          (proj): Linear(in_features=96, out_features=96, bias=True)
          (cpb_mlp): Sequential(
            (0): Linear(in_features=2, out_features=512, bias=True)
            (1): ReLU(inplace=True)
            (2): Linear(in_features=512, out_features=3, bias=False)
          )
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (0): Linear(in_features=96, out_features=384, bias=True)
          (1): GELU(approximate='

In [27]:
#Set up Model (ResNet)
model_resnet = resnet152(weights="IMAGENET1K_V2")
for param in model_resnet.parameters():
    param.requires_grad = False

for param in model_resnet.layer4.parameters():
    param.requires_grad = True

model_resnet.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(1000,12),
            nn.Sigmoid()
        )
print(model_resnet)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [15]:
model_mvit = model_mvit.to(device)
model_swin = model_swin.to(device)
model_resnet = model_resnet.to(device)

In [16]:
#Create Loss Function and Optimizer
loss_fn = nn.BCELoss()

def create_optimizer(model, learning_rate=1e-3):
    return torch.optim.Adam(model.parameters(), lr=learning_rate)

optimizer_mvit = create_optimizer(model_mvit)
optimizer_swin = create_optimizer(model_swin)
optimizer_resnet = create_optimizer(model_resnet)


In [21]:
def scoring(y_true,prediction):

    acc = accuracy_score(y_true,prediction)

    f1s = f1_score(y_true,prediction,average = 'samples', zero_division=0)

    rec_s = recall_score(y_true,prediction,average = 'samples', zero_division=0)

    preci_s = precision_score(y_true,prediction,average = 'samples', zero_division=0)

    ap_s = average_precision_score(y_true,prediction,average = 'samples')
    return acc,f1s,rec_s,preci_s,ap_s

In [22]:
def train(train_dataset,valid_dataset, model, loss_fn, optimizer):

    num_label = 12
    class_label = list(range(num_label))
    label_binarizer = MultiLabelBinarizer(classes=class_label)

    model.train()

    train_loss = 0.
    valid_loss = 0.
    train_y_true_lst = []
    train_y_pred_lst = []

    valid_y_true_lst = []
    valid_y_pred_lst = []

    for data in train_dataset:
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))
        X, y = data
        X, y = X.to(device), y.to(device)

        #  clear the grad
        optimizer.zero_grad()

        # predict
        pred = model(X)
        # compute loss
        loss = loss_fn(pred, y.float())

        # Backpropagation
        loss.backward()
        # update new weight
        optimizer.step()

        round_pred = np.round(pred.detach().cpu())

        train_loss += loss.item()*X.size(0)
        train_y_true_lst.append(y.cpu())
        train_y_pred_lst.append(round_pred)

        # delete locals
        del X
        del y
        del loss
        del pred

        # clean the cache
        if device == "cuda":
            torch.cuda.empty_cache()
        elif device == "mps":
            torch.mps.empty_cache()
        # collect the garbage
        gc.collect()

    # evaluation
    model.eval()
    for data in valid_dataset:
        X, y = data
        X, y = X.to(device), y.to(device)
        # predict
        pred = model(X)
        # compute loss
        loss = loss_fn(pred, y.float())

        round_pred = np.round(pred.detach().cpu())

        valid_loss += loss.item()*X.size(0)
        valid_y_true_lst.append(y.cpu())
        valid_y_pred_lst.append(round_pred)

        # delete locals
        del X
        del y
        del loss
        del pred

        # clean the cache
        if device == "cuda":
            torch.cuda.empty_cache()
        elif device == "mps":
            torch.mps.empty_cache()
        # collect the garbage
        gc.collect()

    # calculate average loss over an epoch
    train_loss = train_loss/len(train_dataset.sampler)
    valid_loss = valid_loss/len(valid_dataset.sampler)

    train_y_pred = torch.cat(train_y_pred_lst, dim=0).numpy()
    train_y_pred = label_binarizer.fit_transform(train_y_pred)

    train_y_true = torch.cat(train_y_true_lst, dim=0).numpy()
    train_y_true = label_binarizer.fit_transform(train_y_true)

    valid_y_pred = torch.cat(valid_y_pred_lst, dim=0).numpy()
    valid_y_pred = label_binarizer.fit_transform(valid_y_pred)

    valid_y_true = torch.cat(valid_y_true_lst, dim=0).numpy()
    valid_y_true = label_binarizer.fit_transform(valid_y_true)

    t_acc,t_f1s,t_rec_s,t_preci_s,t_ap_s = scoring(train_y_true,train_y_pred)
    v_acc,v_f1s,v_rec_s,v_preci_s,v_ap_s = scoring(valid_y_true,valid_y_pred)
    print(f"AVG Training Loss: {round(train_loss,4)}\nAVG Validation Loss: {round(valid_loss,4)}")
    print(f"Accuracy: {round(t_acc,4)}\nF1: {round(t_f1s,4)}\nRecall: {round(t_rec_s,4)}\nPrecision: {round(t_preci_s,4)}\nAP: {round(t_ap_s,4)}")
    return train_loss,valid_loss

In [23]:
def test(test_dataset, model, loss_fn):

  num_label = 12
  class_label = list(range(num_label))
  label_binarizer = MultiLabelBinarizer(classes=class_label)

  test_loss = 0.
  test_y_true_lst = []
  test_y_pred_lst = []

  with torch.no_grad():
    model.eval()
    for data in valid_dataset_mvit:
      X, y = data
      X, y = X.to(device), y.to(device)
      pred = model(X)
      loss = loss_fn(pred, y.float())

      round_pred = np.round(pred.detach().cpu())

      test_loss += loss.item()*X.size(0)
      test_y_true_lst.append(y.cpu())
      test_y_pred_lst.append(round_pred)

    test_loss = test_loss/len(train_dataset_mvit.sampler)

    y_pred = torch.cat(test_y_pred_lst, dim=0).numpy()
    y_pred = label_binarizer.fit_transform(y_pred)

    y_true = torch.cat(test_y_true_lst, dim=0).numpy()
    y_true = label_binarizer.fit_transform(y_true)

    acc,f1s,rec_s,preci_s,ap_s = scoring(y_true,y_pred)

    print(f"AVG Test Loss: {round(test_loss,4)}")
    print(f"Accuracy: {round(acc,4)}\nF1: {round(f1s,4)}\nRecall: {round(rec_s,4)}\nPrecision: {round(preci_s,4)}\nAP: {round(ap_s,4)}")
    return test_loss

In [24]:
#Train MViT
all_train_loss_mvit = []
all_valid_loss_mvit = []
epochs = 1
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss_mvit, valid_loss_mvit = train(train_dataset_mvit,valid_dataset_mvit, model_mvit, loss_fn, optimizer_mvit)
    all_train_loss_mvit.append(train_loss_mvit)
    all_valid_loss_mvit.append(all_valid_loss_mvit)
print("Done!")

Epoch 1
-------------------------------
AVG Training Loss: 0.2081
AVG Validation Loss: 0.1774
Accuracy: 0.4858
F1: 0.8286
Recall: 0.7429
Precision: 1.0
AP: 0.7858
Done!


In [25]:
#Test MViT
test_loss_mvit = test(test_dataset_mvit,model_mvit,loss_fn)

AVG Test Loss: 0.0223
Accuracy: 0.6
F1: 0.8667
Recall: 0.8
Precision: 1.0
AP: 0.8333


In [None]:
#Train Swin
all_train_loss_swin = []
all_valid_loss_swin = []
epochs = 1
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss_swin, valid_loss_swin = train(train_dataset_swin,valid_dataset_swin, model_swin, loss_fn, optimizer_swin)
    all_train_loss_swin.append(train_loss_swin)
    all_valid_loss_swin.append(all_valid_loss_swin)
print("Done!")

In [None]:
#Test Swin
test_loss_swin = test(test_dataset_swin,model_swin,loss_fn)

In [None]:
#Train ResNet152
all_train_loss_resnet = []
all_valid_loss_resnet = []
epochs = 1
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss_resnet, valid_loss_resnet = train(train_dataset_resnet,valid_dataset_resnet, model_resnet, loss_fn, optimizer_resnet)
    all_train_loss_resnet.append(train_loss_resnet)
    all_valid_loss_resnet.append(all_valid_loss_resnet)
print("Done!")

In [None]:
#Test ResNet152
test_loss_resnet = test(test_dataset_resnet,model_resnet,loss_fn)