## Install dependencies

In [None]:
# # !wget -q https://www.dropbox.com/s/g5t24we9gl5yk88/TimeSformer_divST_8x32_224_K400.pyth
# !pip install -q simplejson einops timm scikit-learn tensorboard psutil einops
!pip install -q git+https://github.com/facebookresearch/fvcore
# !git clone https://github.com/down-shift/TimeSformer.git
# !pip install -q ./TimeSformer

In [None]:
%mkdir rzd_video
%cd rzd_video
!wget https://zaborshicov.ru/hiden/videos.zip
!unzip -q videos.zip
%rm videos.zip
%cd ..

## Import libraries and set up constants

In [None]:
import numpy as np
import pandas as pd
import cv2
import os
import torch

from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
# from timesformer.models.vit import TimeSformer


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

SEED = 42
# Seed the global NumPy and PyTorch generators so that weight initialisation and
# DataLoader shuffling are repeatable across "Restart & Run All".
np.random.seed(SEED)
torch.manual_seed(SEED)

input_size = 224  # presumably the frame side length in pixels fed to the model
DATA_DIR = '/kaggle/working/rzd_video'
# index -> class name. One class per sub-directory of DATA_DIR; names are sorted so
# the ids are stable, and stray files (e.g. a leftover archive) are ignored.
CLASSES = dict(enumerate(sorted(
    entry for entry in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, entry))
)))

In [None]:
# Display the index -> class-name mapping built from the entries of DATA_DIR.
CLASSES

In [None]:
# Fetch kinetics_400_labels.csv into the working directory.
# NOTE(review): nothing in the visible notebook reads this file - confirm it is still needed.
!wget https://gist.githubusercontent.com/willprice/f19da185c9c5f32847134b87c1960769/raw/9dc94028ecced572f302225c49fcdee2f3d748d8/kinetics_400_labels.csv

In [None]:
!wget https://gist.githubusercontent.com/willprice/f19da185c9c5f32847134b87c1960769/raw/9dc94028ecced572f302225c49fcdee2f3d748d8/kinetics_700_labels.csv
kin700 = pd.read_csv('kinetics_700_labels.csv')['name'].tolist()
found_ids = set()
for i, cl in CLASSES.items():
    for kin_cl in kin700:
        for word in cl.split():
            if word in kin_cl:
                print(i, cl, kin_cl)
                found_ids.add(i)

In [None]:
# Class ids with no Kinetics-700 match. The ids come from CLASSES itself, so this
# stays correct if the number of classes changes.
set(CLASSES) - found_ids

## Utils

In [None]:
def freeze(model, freeze_head=True):
    """Disable gradients for every parameter whose name does not contain 'head'.

    Parameters whose name contains ``'head'`` get ``requires_grad`` set to the
    value of ``freeze_head`` itself. Despite the flag's name, this means
    ``freeze_head=True`` (the default) leaves the head *trainable*, while
    ``freeze_head=False`` switches gradients off for the head as well.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
        freeze_head: value assigned to ``requires_grad`` of the head parameters.

    Returns:
        The same ``model`` object, modified in place.
    """
    for param_name, tensor in model.named_parameters():
        tensor.requires_grad = freeze_head if 'head' in param_name else False
    return model

In [None]:
from IPython.display import clear_output


def show_history(epoch, history):
    """Clear the cell output, then plot the training curves and print a summary.

    Six panels are drawn on a 2x3 grid, one per entry of ``history``: train
    loss, validation loss, precision, recall, F1 and accuracy. Afterwards the
    mean of all recorded train / validation losses and the most recent value
    of each validation metric are printed.

    Args:
        epoch: epoch number shown in the header line.
        history: dict with the keys 'train_loss', 'val_loss', 'precision',
            'recall', 'f1' and 'acc', each mapping to a sequence of numbers.
    """
    panels = (
        ('Train loss graph', 'train_loss'),
        ('Val loss graph', 'val_loss'),
        ('Precision graph', 'precision'),
        ('Recall graph', 'recall'),
        ('F1-score graph', 'f1'),
        ('Accuracy graph', 'acc'),
    )

    clear_output()
    print(f'--- Epoch {epoch} ---')

    plt.figure(figsize=(20, 10))
    for position, (panel_title, key) in enumerate(panels, start=1):
        plt.subplot(2, 3, position)
        plt.title(panel_title)
        plt.plot(history[key])
        plt.grid()
    plt.show()

    print("\n--- Validation losses ---")
    mean_train = np.mean(history['train_loss'])
    mean_val = np.mean(history['val_loss'])
    print(f"\nAverage train loss: {mean_train}, average test loss: {mean_val}\n")

    print('--- Validation metrics ---')
    last_acc = history['acc'][-1]
    last_precision = history['precision'][-1]
    last_recall = history['recall'][-1]
    last_f1 = history['f1'][-1]
    print(f"\nAccuracy: {last_acc}, precision: {last_precision}, recall: {last_recall}, f1-score: {last_f1}\n")


## Define dataset

In [None]:
def read_video(path, frames_num=8):
    """Sample up to ``frames_num`` evenly spaced RGB frames from a video file.

    The stride between sampled frames is ``frame_count // frames_num``, where
    ``frame_count`` is the value the container reports. Frames are read
    sequentially; reading stops as soon as enough frames were collected.

    Args:
        path: path of the video file, passed to ``cv2.VideoCapture``.
        frames_num: number of frames to sample.

    Returns:
        List of frames converted from BGR to RGB. The list can be shorter than
        ``frames_num`` when the stream yields fewer readable frames than the
        reported frame count.

    Raises:
        AssertionError: if the video reports fewer than ``frames_num`` frames.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = length // frames_num
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if step <= 0:
            raise AssertionError('Too many frames requested')

        next_wanted = 0
        for index in range(length):
            if len(frames) >= frames_num:
                break  # nothing left to sample; skip decoding the tail
            ret, frame = cap.read()
            # `>=` instead of `==`: if the frame at a sampling position cannot be
            # read, the next readable frame is taken instead of sampling
            # silently stopping for the remainder of the video.
            if ret and index >= next_wanted:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                next_wanted += step
    finally:
        # Release the capture on every path, including the error above.
        cap.release()
    return frames

In [None]:
class VideoDataset(Dataset):
    """Dataset of video clips stored as ``root/<class name>/<video file>``.

    Each item is a ``(frames, label)`` pair: ``frames`` is the stack of frames
    returned by ``read_video`` (optionally passed through ``transform``) and
    ``label`` is the integer key of the class in ``classes``.
    """

    def __init__(self, root, classes, num_frames, transform=None):
        """Index every file found in the class folders.

        Args:
            root: directory that contains one sub-directory per class.
            classes: mapping ``label index -> class sub-directory name``.
            num_frames: number of frames requested from every video.
            transform: optional callable applied to the stacked frame tensor.
        """
        self.num_frames = num_frames
        self.video_paths = []
        self.labels = []
        self.transform = transform
        for idx, c in classes.items():
            class_dir = os.path.join(root, c)
            # List each folder exactly once so paths and labels cannot get out of
            # step, and sort so the sample order (and therefore any seeded split)
            # does not depend on the filesystem's enumeration order.
            file_names = sorted(os.listdir(class_dir))
            self.video_paths.extend(os.path.join(class_dir, f) for f in file_names)
            self.labels.extend([idx] * len(file_names))

    def __len__(self):
        """Return the number of indexed videos."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Load, stack and (optionally) transform the frames of video ``idx``."""
        frames = read_video(self.video_paths[idx], frames_num=self.num_frames)
        # np.array builds a fresh array, so from_numpy can wrap it without another copy.
        frames = torch.from_numpy(np.array(frames))
        if len(frames) != self.num_frames:
            # Only reported, not raised: the (shorter) clip is still returned.
            print(f'len mismatch: {len(frames)} vs {self.num_frames}')
        if self.transform:
            frames = self.transform(frames)
        label = self.labels[idx]
        return (frames, label)

In [None]:
# Per-clip preprocessing. VideoDataset hands over the stacked frames laid out as
# (T, H, W, C) - frames first, channels last.
transform = transforms.Compose([
    transforms.Lambda(lambda x: x / 255.),  # rescale pixel values by 1/255
    transforms.Lambda(lambda x: x.permute(3, 0, 1, 2)),  # (T, H, W, C) -> (C, T, H, W)
    # On a 4-D input, interpolate treats the first two dims as batch/channels, so
    # only the trailing H and W dims are resized.
    transforms.Lambda(lambda x: torch.nn.functional.interpolate(x, (input_size, input_size))),
    transforms.RandomHorizontalFlip(p=0.5),
])

# NOTE(review): train and validation subsets share this dataset object, so the
# random flip is applied to validation clips as well - confirm this is intended.
full_dataset = VideoDataset(root=DATA_DIR, classes=CLASSES, num_frames=16, transform=transform)
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
# A dedicated, seeded generator keeps the 80/20 split identical between runs.
train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

## Get dataloaders

In [None]:
# Both loaders use the same batch size. Only the training loader shuffles and
# drops the final incomplete batch.
BATCH_SIZE = 4
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

## Download the pre-trained TimeSformer

In [None]:
# model = TimeSformer(img_size=224, num_classes=len(CLASSES), num_frames=4, attention_type='divided_space_time')
# model.load_state_dict(torch.load('/content/drive/MyDrive/Olymps/leadersofdigital/stavropol/timesformer_full_ep1_0.9036.pt'))
# model = model.to(device)
# dummy_video = torch.zeros(2, 3, 4, 224, 224).to(device) # (batch x channels x frames x height x width)

# pred = model(dummy_video,)
# pred.shape

## X3D

In [None]:
# Load the pre-trained X3D-M network from the PyTorchVideo hub.
model = torch.hub.load('facebookresearch/pytorchvideo', 'x3d_m', pretrained=True)
# Replace the final projection with a fresh layer sized to this dataset. Both
# dimensions are derived (input width from the layer being replaced, output
# width from CLASSES) instead of being hard-coded.
# NOTE(review): the hub head may already apply an activation (softmax) after
# `proj`; check `model.blocks[5].activation` before training with
# CrossEntropyLoss, which expects raw logits.
model.blocks[5].proj = nn.Linear(
    in_features=model.blocks[5].proj.in_features,
    out_features=len(CLASSES),
    bias=True,
)
model = model.to(device)

## Prepare for training

In [None]:
# # freeze model
# model = freeze(model)
# Print the name of every parameter that will receive gradient updates.
for trainable_name in (n for n, p in model.named_parameters() if p.requires_grad):
    print(trainable_name)

In [None]:
# Cross-entropy loss over the class scores, optimised with AdamW.
criterion = nn.CrossEntropyLoss()
adamw_settings = {
    'lr': 3e-5,
    'betas': (0.9, 0.999),
    'weight_decay': 0.01,
}
optimizer = torch.optim.AdamW(model.parameters(), **adamw_settings)

In [None]:
# 'train_loss' / 'val_loss' collect one value per batch; the metric lists collect
# one value per epoch. show_history() plots all of them.
history = {'train_loss': [], 'val_loss': [], 'acc': [], 'f1': [], 'precision': [], 'recall': []}

epochs = 10
best_loss = 0.9  # NOTE(review): not read anywhere in the visible notebook
best_f1 = 0.65   # a checkpoint is written only when the weighted F1 exceeds this

for epoch in range(epochs):
    print(f'--- Epoch {epoch} ---\n')

    print('Training...')
    # training loop
    model.train()
    progress_bar = tqdm(train_loader)
    train_loss, val_loss = 0, 0  # summed batch losses of the current epoch
    for videos, labels in progress_bar:
        videos = videos.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        out = model(videos)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        history['train_loss'].append(batch_loss)
        progress_bar.set_postfix({'loss': batch_loss})

    print('\nValidating...')
    # validation loop
    model.eval()
    progress_bar = tqdm(val_loader)
    preds, gt = [], []
    # No gradients are needed for evaluation; disabling them avoids building the
    # autograd graph and lowers memory use.
    with torch.no_grad():
        for videos, labels in progress_bar:
            videos = videos.to(device)
            labels = labels.to(device)

            out = model(videos)
            preds.extend(out.argmax(dim=1).tolist())
            gt.extend(labels.tolist())

            batch_loss = criterion(out, labels).item()
            val_loss += batch_loss
            history['val_loss'].append(batch_loss)
            progress_bar.set_postfix({'loss': batch_loss})

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
    # precision and recall trade places and the 'weighted' average is weighted by
    # predicted instead of true class frequencies.
    history['acc'].append(accuracy_score(gt, preds))
    history['f1'].append(f1_score(gt, preds, average='weighted'))
    history['precision'].append(precision_score(gt, preds, average='weighted'))
    history['recall'].append(recall_score(gt, preds, average='weighted'))

    # save weights
    if history['f1'][-1] > best_f1:
        print(f"New best model, f1_score: {history['f1'][-1]}")
        best_f1 = history['f1'][-1]
        # Build the path once so the log line names the file that was written.
        checkpoint_path = f"x3d_m_ep{epoch}_{best_f1:.4f}.pt"
        torch.save(model.state_dict(), checkpoint_path)
        print('Saved to:', checkpoint_path)

    show_history(epoch, history)


In [None]:
# Per-class accuracy on the validation predictions of the last epoch.
# class_res[label] = [correct predictions, number of samples with that label]
class_res = {i: [0, 0] for i in range(len(CLASSES))}
class_ratios = {cls: 0 for _, cls in CLASSES.items()}

for true_label, predicted_label in zip(gt, preds):
    class_res[true_label][0] += int(true_label == predicted_label)
    class_res[true_label][1] += 1
for i, cls in CLASSES.items():
    correct, total = class_res[i]
    # A class can be absent from the validation split; report NaN for it
    # instead of failing with ZeroDivisionError.
    class_ratios[cls] = correct / total if total else float('nan')
class_ratios

In [None]:
# Weighted validation metrics of the last epoch. scikit-learn expects
# (y_true, y_pred), i.e. ground truth first.
print(f1_score(gt, preds, average='weighted'))
print(precision_score(gt, preds, average='weighted'))
print(recall_score(gt, preds, average='weighted'))