In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from einops import rearrange
from decord import VideoReader
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from segmentation_models_pytorch.losses import FocalLoss
from transformers import AutoModel, AutoImageProcessor, AutoConfig
from skmultilearn.model_selection import iterative_train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorchvideo.transforms.transforms_factory import create_video_transform



In [2]:
import math
from torch.optim import Optimizer

class AdaMod(Optimizer):
    """Implements AdaMod algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
    It has been proposed in `Adaptive and Momental Bounds for Adaptive Learning Rate Methods`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        beta3 (float, optional): smoothing coefficient for adaptive learning rates (default: 0.9999)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), beta3=0.999,
                 eps=1e-8, weight_decay=0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= beta3 < 1.0:
            raise ValueError("Invalid beta3 parameter: {}".format(beta3))
        defaults = dict(lr=lr, betas=betas, beta3=beta3, eps=eps,
                        weight_decay=weight_decay)
        super(AdaMod, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdaMod, self).__setstate__(state)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'AdaMod does not support sparse gradients')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    # Exponential moving average of actual learning rates
                    state['exp_avg_lr'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq, exp_avg_lr = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_lr']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                if group['weight_decay'] != 0:
                    p.data.add_(-group['weight_decay'] * group['lr'], p.data)

                # Applies momental bounds on actual learning rates
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom)
                exp_avg_lr.mul_(group['beta3']).add_(1 - group['beta3'], step_size)
                step_size = torch.min(step_size,  exp_avg_lr)
                step_size.mul_(exp_avg)

                p.data.add_(-step_size)

        return loss

In [3]:
#https://github.com/issamemari/pytorch-multilabel-balanced-sampler/blob/master/sampler.py
import random
import numpy as np

from torch.utils.data.sampler import Sampler


class MultilabelBalancedRandomSampler(Sampler):
    """
    MultilabelBalancedRandomSampler: Given a multilabel dataset of length n_samples and
    number of classes n_classes, samples from the data with equal probability per class
    effectively oversampling minority classes and undersampling majority classes at the
    same time. Note that using this sampler does not guarantee that the distribution of
    classes in the output samples will be uniform, since the dataset is multilabel and
    sampling is based on a single class. This does however guarantee that all classes
    will have at least batch_size / n_classes samples as batch_size approaches infinity
    """

    def __init__(self, labels, indices=None, class_choice="least_sampled"):
        """
        Parameters:
        -----------
            labels: a multi-hot encoding numpy array of shape (n_samples, n_classes)
            indices: an arbitrary-length 1-dimensional numpy array representing a list
            of indices to sample only from
            class_choice: a string indicating how class will be selected for every
            sample:
                "least_sampled": class with the least number of sampled labels so far
                "random": class is chosen uniformly at random
                "cycle": the sampler cycles through the classes sequentially
        """
        self.labels = labels
        self.indices = indices
        if self.indices is None:
            self.indices = range(len(labels))

        self.num_classes = self.labels.shape[1]

        # List of lists of example indices per class
        self.class_indices = []
        for class_ in range(self.num_classes):
            lst = np.where(self.labels[:, class_] == 1)[0]
            lst = lst[np.isin(lst, self.indices)]
            self.class_indices.append(lst)

        self.counts = [0] * self.num_classes

        assert class_choice in ["least_sampled", "random", "cycle"]
        self.class_choice = class_choice
        self.current_class = 0

    def __iter__(self):
        self.count = 0
        return self

    def __next__(self):
        if self.count >= len(self.indices):
            raise StopIteration
        self.count += 1
        return self.sample()

    def sample(self):
        class_ = self.get_class()
        class_indices = self.class_indices[class_]
        chosen_index = np.random.choice(class_indices)
        if self.class_choice == "least_sampled":
            for class_, indicator in enumerate(self.labels[chosen_index]):
                if indicator == 1:
                    self.counts[class_] += 1
        return chosen_index

    def get_class(self):
        if self.class_choice == "random":
            class_ = random.randint(0, self.labels.shape[1] - 1)
        elif self.class_choice == "cycle":
            class_ = self.current_class
            self.current_class = (self.current_class + 1) % self.labels.shape[1]
        elif self.class_choice == "least_sampled":
            min_count = self.counts[0]
            min_classes = [0]
            for class_ in range(1, self.num_classes):
                if self.counts[class_] < min_count:
                    min_count = self.counts[class_]
                    min_classes = [class_]
                if self.counts[class_] == min_count:
                    min_classes.append(class_)
            try:
                class_ = np.random.choice(min_classes)
            except: pass
        return class_

    def __len__(self):
        return len(self.indices)

In [4]:
config = {
    "seed":42,
    "model_name":"facebook/timesformer-base-finetuned-k400",
    "batch_size":5,
    "learning_rate":1e-5,
    "data_dir":'../data/CarCollision',
    "checkpoint_dir":'./checkpoint',
    "submission_dir":'./submission',
    "n_classes":(2,3,4,3),
    "label_dict":{
        -1:[-1,-1,-1,-1],
        0:[0,0,0,0],
        1:[1,1,1,1],
        2:[1,1,1,2],
        3:[1,1,2,1],
        4:[1,1,2,2],
        5:[1,1,3,1],
        6:[1,1,3,2],
        7:[1,2,1,1],
        8:[1,2,1,2],
        9:[1,2,2,1],
        10:[1,2,2,2],
        11:[1,2,3,1],
        12:[1,2,3,2]
    },
    "label_reverse_dict":{
        (0,0,0,0):0,
        (1,1,1,1):1,
        (1,1,1,2):2,
        (1,1,2,1):3,
        (1,1,2,2):4,
        (1,1,3,1):5,
        (1,1,3,2):6,
        (1,2,1,1):7,
        (1,2,1,2):8,
        (1,2,2,1):9,
        (1,2,2,2):10,
        (1,2,3,1):11,
        (1,2,3,2):12,
    }
}

In [5]:
pl.seed_everything(config['seed'])

Global seed set to 42


42

In [6]:
train_df = pd.read_csv(f"{config['data_dir']}/train.csv")
test_df = pd.read_csv(f"{config['data_dir']}/test.csv")

In [7]:
train_df['sample_id'] = train_df['sample_id'].apply(lambda x: int(x.split('_')[1]))
test_df['sample_id'] = test_df['sample_id'].apply(lambda x: int(x.split('_')[1]))

In [8]:
train_df['video_path'] = train_df['video_path'].apply(lambda x: config['data_dir'] + x[1:])
test_df['video_path'] = test_df['video_path'].apply(lambda x: config['data_dir'] + x[1:])

In [9]:
test_df['label']=-1
test_df['label_split'] = test_df['label'].apply(config['label_dict'].get)

In [10]:
train_df['label_split'] = train_df['label'].apply(config['label_dict'].get)
train_label_split = np.array(train_df['label_split'].tolist())

In [11]:
train_label_multi_hot = np.hstack([np.eye(n_class, dtype=np.int32)[train_label_split[:,idx]] for idx, n_class in enumerate(config['n_classes'])])
train_df['label_multi_hot'] = train_label_multi_hot.tolist()

In [12]:
train_df

Unnamed: 0,sample_id,video_path,label,label_split,label_multi_hot
0,0,../data/CarCollision/train/TRAIN_0000.mp4,7,"[1, 2, 1, 1]","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0]"
1,1,../data/CarCollision/train/TRAIN_0001.mp4,7,"[1, 2, 1, 1]","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0]"
2,2,../data/CarCollision/train/TRAIN_0002.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"
3,3,../data/CarCollision/train/TRAIN_0003.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"
4,4,../data/CarCollision/train/TRAIN_0004.mp4,1,"[1, 1, 1, 1]","[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]"
...,...,...,...,...,...
2693,2693,../data/CarCollision/train/TRAIN_2693.mp4,3,"[1, 1, 2, 1]","[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0]"
2694,2694,../data/CarCollision/train/TRAIN_2694.mp4,5,"[1, 1, 3, 1]","[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]"
2695,2695,../data/CarCollision/train/TRAIN_2695.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"
2696,2696,../data/CarCollision/train/TRAIN_2696.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"


In [13]:
train_df_for_dataset, _ , val_df_for_dataset, _  = iterative_train_test_split(X=train_df.values, y=train_label_multi_hot, test_size=0.2)
test_df_for_dataset = test_df.values

In [14]:
train_multi_hot_for_sampler = np.array(train_df_for_dataset[:,4].tolist())

In [15]:
class VideoDataset(Dataset):
    def __init__(self, df_for_dataset, transform=None):
        self.sample_id = df_for_dataset[:,0]
        self.video_path = df_for_dataset[:,1]
        self.label = df_for_dataset[:,2]
        self.label_split = np.array(df_for_dataset[:,3].tolist())
        self.transform = transform

    def __len__(self):
        return len(self.sample_id)

    def __getitem__(self, idx):
        sample_id = self.sample_id[idx]
        video_path = self.video_path[idx]
        vr = VideoReader(video_path)
        video = torch.from_numpy(vr.get_batch(range(50)).asnumpy())
        video = rearrange(video, 't h w c -> c t h w')
        label = self.label[idx]
        label_split = self.label_split[idx]
        
        if self.transform:
            video = self.transform(video)
        video = rearrange(video, 'c t h w -> t c h w')

        sample = {
            'sample_id':sample_id,
            'video':video,
            'label':label,
            'label_split':label_split
        }
        
        return sample

In [16]:
model_config = AutoConfig.from_pretrained(config['model_name'])
image_processor_config = AutoImageProcessor.from_pretrained(config['model_name'])

In [17]:
train_transform = create_video_transform(
    mode='train',
    num_samples=model_config.num_frames,
    video_mean = tuple(image_processor_config.image_mean),
    video_std = tuple(image_processor_config.image_std),
    crop_size = tuple(image_processor_config.crop_size.values())
)

val_transform = create_video_transform(
    mode='val',
    num_samples=model_config.num_frames,
    video_mean = tuple(image_processor_config.image_mean),
    video_std = tuple(image_processor_config.image_std),
    crop_size = tuple(image_processor_config.crop_size.values())
)

In [18]:
train_dataset = VideoDataset(train_df_for_dataset, transform=train_transform)
val_dataset = VideoDataset(val_df_for_dataset, transform=val_transform)
test_dataset = VideoDataset(test_df_for_dataset, transform=val_transform)

In [19]:
train_sampler = MultilabelBalancedRandomSampler(train_multi_hot_for_sampler)
train_dataloader = DataLoader(train_dataset, batch_size= config['batch_size'], sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size = config['batch_size']*2)
test_dataloader = DataLoader(test_dataset, batch_size = config['batch_size']*2)

In [20]:
class PLVideoModel(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.learning_rate = config['learning_rate']
        self.model = AutoModel.from_pretrained(config['model_name'])
        self.classifiers = nn.ModuleList([
            nn.LazyLinear(n_class) for n_class in config['n_classes']
        ])
        self.loss = FocalLoss('multiclass')

    def forward(self, x):
        x = self.model(x).last_hidden_state.mean(dim=1)
        x_out = [classifier(x) for classifier in self.classifiers]
        return x_out

    def training_step(self, batch, batch_idx):
        video, label, label_split = batch['video'], batch['label'], batch['label_split']
        y_hats = self.forward(batch["video"])
        loss = sum([self.loss(y_hats[i], batch["label_split"][:,i]) for i in range(len(self.config['n_classes']))])
        loss = loss/len(self.config['n_classes'])
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        video, label, label_split = batch['video'], batch['label'], batch['label_split']
        y_hats = self.forward(batch["video"])
        step_output = [*y_hats, label]
        return step_output
    
    def predict_step(self, batch, batch_idx):
        video, _, _ = batch['video'], batch['label'], batch['label_split']
        y_hats = self.forward(batch["video"])
        step_output = y_hats
        return step_output

    def validation_epoch_end(self, step_outputs):
        pred1, pred2, pred3, pred4, label = [], [], [], [], []
        for step_output in step_outputs:
            pred1.append(step_output[0])
            pred2.append(step_output[1])
            pred3.append(step_output[2])
            pred4.append(step_output[3])
            label.append(step_output[4])
            
        pred1 = torch.cat(pred1).argmax(1)
        pred2 = torch.cat(pred2).argmax(1)
        pred3 = torch.cat(pred3).argmax(1)
        pred4 = torch.cat(pred4).argmax(1)
        label = torch.cat(label).tolist()

        pred = torch.stack([pred1,pred2,pred3,pred4],dim=1).cpu().detach().numpy().tolist()
        pred = list(map(lambda x: self.config['label_reverse_dict'].get(tuple(x),0),pred))
        
        score = f1_score(label,pred, average='macro')
        self.log("val_score", score)
        return score
    
    def post_preproc(self, step_outputs):
        pred1, pred2, pred3, pred4 = [], [], [], []
        for step_output in step_outputs:
            pred1.append(step_output[0])
            pred2.append(step_output[1])
            pred3.append(step_output[2])
            pred4.append(step_output[3])
            
        pred1 = torch.cat(pred1).argmax(1)
        pred2 = torch.cat(pred2).argmax(1)
        pred3 = torch.cat(pred3).argmax(1)
        pred4 = torch.cat(pred4).argmax(1)

        pred = torch.stack([pred1,pred2,pred3,pred4],dim=1).cpu().detach().numpy().tolist()
        pred = list(map(lambda x: self.config['label_reverse_dict'].get(tuple(x),0),pred))

        return pred
            
    def configure_optimizers(self):
        optimizer = AdaMod(self.parameters(), lr=self.learning_rate)
        return [optimizer]

In [21]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_score',
    dirpath=config['checkpoint_dir'],
    filename=f'{config["model_name"]}'+'-{epoch:02d}-{train_loss:.4f}-{val_score:.4f}',
    mode='max'
)
early_stop_callback = EarlyStopping(
    monitor="train_loss",
    patience=3,
    verbose=False,
    mode="min"
)

pl_video_model = PLVideoModel(config)

trainer = pl.Trainer(
    max_epochs=100,
    accelerator='auto', 
    precision=16,
    callbacks=[early_stop_callback, checkpoint_callback]  
    # callbacks=[checkpoint_callback]
)
trainer.fit(pl_video_model, train_dataloader, val_dataloader)

Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k400 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using 16bit None Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1420.)
  exp_avg.mul_(beta1).add_(1 - beta1, grad)


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [22]:
pl_video_model_pretrained = PLVideoModel.load_from_checkpoint(
    "checkpoint/facebook/timesformer-base-finetuned-k400-epoch=06-train_loss=0.0033-val_score=0.5175.ckpt",
    config=config
)

trainer = pl.Trainer(accelerator='auto')
pred = trainer.predict(pl_video_model_pretrained, test_dataloader)

Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k400 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision

Predicting: 0it [00:00, ?it/s]

In [23]:
pred_post_proc = pl_video_model_pretrained.post_preproc(pred)

In [24]:
submit = pd.read_csv(f"{config['data_dir']}/sample_submission.csv")

In [25]:
submit['label'] = pred_post_proc

In [26]:
submit.to_csv(f"{config['submission_dir']}/submission13.csv", index=False)