In [None]:
import os
import torch
import yaml

# CUDA-related environment knobs for this kernel session.
# NOTE(review): these are assigned after `import torch`; presumably that is fine
# as long as nothing has touched CUDA yet — confirm, or move them above the import.
os.environ.update(
    {
        # Options string handed to PyTorch's CUDA allocator.
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,garbage_collection_threshold:0.8",
        # NOTE(review): the next two look like debugging switches (blocking
        # launches / device-side assertions) — confirm they should stay enabled
        # for full training runs.
        'CUDA_LAUNCH_BLOCKING': "1",
        'TORCH_USE_CUDA_DSA': "1",
    }
)

# Drop any cached GPU memory still held by this process before starting.
torch.cuda.empty_cache()

In [None]:
# Load the experiment configuration; later cells read dataset paths, the
# pretrained checkpoint name and the log directory from config['vit'].
#
# The file is decoded explicitly as UTF-8: without an `encoding` argument,
# open() falls back to the platform locale (for example a legacy code page on
# Windows), which can mis-decode or reject non-ASCII values in the YAML.
# yaml.safe_load only builds plain Python objects (dicts, lists, scalars).
with open("./RSL.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [None]:
from torchvision import transforms
from VideoMAE import MixedGestureDataset, VideoGestureDataset, collate_fn, plot_class_distribution, train, show_clip_from_dataset, evaluate_test

In [None]:
# Per-frame preprocessing, shared by the train, validation and test splits.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Scale pixel values by 1.3 and clip the result back into [0, 1].
    transforms.Lambda(lambda frame: torch.clamp(frame * 1.3, 0, 1)),
    # NOTE(review): 0.5 / 0.5 normalisation on all three channels — confirm this
    # matches what the pretrained checkpoint expects.
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
transform = transforms.Compose(preprocessing_steps)

# Every dataset path lives under the 'vit' section of the YAML config.
vit_paths = config['vit']

# Clip length passed to every dataset.
# NOTE(review): the model cell uses num_frames=8; presumably the two values
# must stay equal — confirm.
CLIP_LEN = 8

# Training data combines video clips and still images.
train_dataset = MixedGestureDataset(
    video_frames_dir=vit_paths['train_vid'],
    video_ann_path=vit_paths['train_vid_anno'],
    image_dir=vit_paths['train_img'],
    image_ann_path=vit_paths['train_img_anno'],
    clip_len=CLIP_LEN,
    transform=transform,
)

# Validation and test reuse the training label mapping so that class ids agree
# across the three splits.
eval_dataset_kwargs = dict(
    clip_len=CLIP_LEN,
    transform=transform,
    label2id=train_dataset.label2id,
)

val_dataset = VideoGestureDataset(
    frames_dir=vit_paths['val_vid'],
    ann_path=vit_paths['val_vid_anno'],
    **eval_dataset_kwargs,
)

test_dataset = VideoGestureDataset(
    frames_dir=vit_paths['test_vid'],
    ann_path=vit_paths['test_vid_anno'],
    **eval_dataset_kwargs,
)

In [None]:
from torch.utils.data import DataLoader

batch_size = 8
num_workers = 0

# Settings shared by all three loaders.
# Page-locked (pinned) host memory is only requested when CUDA is available:
# the notebook falls back to the CPU when no GPU is present, and asking for
# pinned memory there brings no benefit and can trigger a DataLoader warning.
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=torch.cuda.is_available(),
    collate_fn=collate_fn,
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

In [None]:
from transformers import VideoMAEForVideoClassification
import torch.nn as nn

# One output per class known to the training set.
num_classes = len(train_dataset.label2id)

# Load the pretrained VideoMAE checkpoint named in the config, overriding the
# label count, frame count and dropout rates.
# NOTE(review): ignore_mismatched_sizes=True is presumably needed because the
# checkpoint's head does not match num_labels — confirm.
# NOTE(review): num_frames=8 mirrors clip_len=8 used for the datasets;
# presumably the two must stay equal — confirm.
model = VideoMAEForVideoClassification.from_pretrained(
    config['vit']['pr_model'],
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
    num_frames=8,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.2,
)

# Replace the classification head with a two-layer MLP:
# hidden_size -> 256 -> num_labels, with ReLU and dropout in between.
head_hidden_dim = 256
head_layers = [
    nn.Linear(model.config.hidden_size, head_hidden_dim),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(head_hidden_dim, model.config.num_labels),
]
model.classifier = nn.Sequential(*head_layers)

In [None]:
# Display names passed to plot_class_distribution: key "l<N>" maps to the N-th
# letter of the 33-letter Russian alphabet, always in upper case (including
# 'l23' -> 'Х', so every label is rendered consistently).
# NOTE(review): the "l<N>" keys presumably match the class names in the
# annotation files — verify against the datasets' label2id.
_RUSSIAN_ALPHABET = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"

lbls = {
    f"l{position}": letter
    for position, letter in enumerate(_RUSSIAN_ALPHABET, start=1)
}

In [None]:
# Class balance of the validation split.
# The caption is the Russian adjective for "validation"; presumably it is
# inserted into the plot title — verify in plot_class_distribution.
val_split_caption = 'валидационном'
plot_class_distribution(val_dataset, lbls, val_split_caption)

In [None]:
# Class balance of the training split.
# The caption is the Russian adjective for "training"; presumably it is
# inserted into the plot title — verify in plot_class_distribution.
train_split_caption = "обучающем"
plot_class_distribution(train_dataset, lbls, train_split_caption)

In [None]:
# Class balance of the test split.
# The caption is the Russian adjective for "test"; presumably it is inserted
# into the plot title — verify in plot_class_distribution.
test_split_caption = "тестовом"
plot_class_distribution(test_dataset, lbls, test_split_caption)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model onto that device. Kept as the cell's last expression, so the
# notebook still displays the returned module.
model.to(device)

In [None]:
# Fine-tuning hyper-parameters, named so they are easy to find and tweak.
num_epochs = 10
learning_rate = 1e-5

# Where the training helper is told to write its logs (taken from the config).
training_log_dir = config['vit']['log_dir']

# Run fine-tuning with the train loader and the validation loader.
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=num_epochs,
    lr=learning_rate,
    log_dir=training_log_dir,
)

In [None]:
# Visual sanity check of one training sample.
# NOTE(review): the index is hard-coded; presumably the training set must hold
# at least 4001 samples for this to work — confirm, or derive it from
# len(train_dataset).
clip_index = 4000
show_clip_from_dataset(train_dataset, idx=clip_index)

In [None]:
# Final evaluation: run the model, as left by the cells above, over the test
# loader on the selected device.
# NOTE(review): what evaluate_test reports or returns is defined in the
# VideoMAE module — check there.
evaluate_test(model, test_loader, device)