In [1]:
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
import cv2
import mediapipe as mp
import torch
import pandas as pd
from scipy.fft import fft, ifft
from torchvision.transforms import v2
import os
import timm


In [2]:
timm.list_models()

['bat_resnext26ts',
 'beit_base_patch16_224',
 'beit_base_patch16_384',
 'beit_large_patch16_224',
 'beit_large_patch16_384',
 'beit_large_patch16_512',
 'beitv2_base_patch16_224',
 'beitv2_large_patch16_224',
 'botnet26t_256',
 'botnet50ts_256',
 'caformer_b36',
 'caformer_m36',
 'caformer_s18',
 'caformer_s36',
 'cait_m36_384',
 'cait_m48_448',
 'cait_s24_224',
 'cait_s24_384',
 'cait_s36_384',
 'cait_xs24_384',
 'cait_xxs24_224',
 'cait_xxs24_384',
 'cait_xxs36_224',
 'cait_xxs36_384',
 'coat_lite_medium',
 'coat_lite_medium_384',
 'coat_lite_mini',
 'coat_lite_small',
 'coat_lite_tiny',
 'coat_mini',
 'coat_small',
 'coat_tiny',
 'coatnet_0_224',
 'coatnet_0_rw_224',
 'coatnet_1_224',
 'coatnet_1_rw_224',
 'coatnet_2_224',
 'coatnet_2_rw_224',
 'coatnet_3_224',
 'coatnet_3_rw_224',
 'coatnet_4_224',
 'coatnet_5_224',
 'coatnet_bn_0_rw_224',
 'coatnet_nano_cc_224',
 'coatnet_nano_rw_224',
 'coatnet_pico_rw_224',
 'coatnet_rmlp_0_rw_224',
 'coatnet_rmlp_1_rw2_224',
 'coatnet_rmlp_1_r

In [3]:
transform = v2.Compose(
    [
    v2.ToImage(),
    v2.Resize([100,160]),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToTensor(),
    ])



In [4]:
# function get frame video


            

# tes = walk(dir='../../dataset/pervideo')
# print(len(tes[0]))
        

In [5]:
# create dataset
from torch.nn.utils.rnn import pad_sequence

class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, dir, transform ):
        super().__init__()
        self.dir = dir
        self.transform = transform
        self.list_of_frames, self.list_of_class_id, self.list_of_class_name = self.walk(self.dir)
        
        self.list_map_class = self.makeDictClass(dir=self.dir)
    
    def video2frame(self,videopath):
        _video = cv2.VideoCapture(videopath)
        _fps_in = _video.get(cv2.CAP_PROP_FPS)
        _count = 0
        _success = 1
        _frames = []
        
        _fps_out = 30

        _index_in = -1
        _index_out = -1
        
        # get class name
        _path = os.path.split(videopath)
        _path2class = os.path.split(_path[0])
        _className = _path2class[1]
        print(_className)
        
        # convert classname to id
        _map_class = self.makeDictClass(dir=self.dir)
        _classId = self.class2id(classname=_className, dictclass=_map_class)
        
        
        # extract video to frame
        while _success:
            _success = _video.grab()
            if not _success: break
            _index_in += 1
            _out_due = int(_index_in / _fps_in * _fps_out)
            if _out_due > _index_out:
                _success, _frame = _video.read()
                if(_success==True):
                    # print(_success)
                    _transformed = transform(_frame)
                    # print(_transformed.shape)
                    _frames.append(_transformed)
                else:
                    continue
        
        _video.release()
        
            
        return _frames, _classId, _className
    
    def makeDictClass(self, dir):
        dictClass = {}
        i = 0
        for _i in os.listdir(dir):
            if (os.path.isdir(os.path.join(dir, _i)) == True):
                # print(_i)
                dictClass[i] = _i
                i = i + 1
        return dictClass

    def class2id(self, classname, dictclass):
        id = 0
        for i in dictclass:
            if (dictclass[i] == classname):
                id = i
                break
        return id
        
    def walk(self, dir):
        list_of_frames = []
        list_of_class_name = []
        list_of_class_id = []
        
        for _i in os.listdir(dir):
           
            for _j in os.listdir(os.path.join(dir, _i)):
                _pathFile = os.path.join(dir, _i, _j)
                _pathFile = _pathFile.replace(os.sep, "/")
                _frames, _classId, _classname = self.video2frame(_pathFile)
                _frames = torch.stack(_frames)
                list_of_frames.append(_frames)
                # print(_frames.shape)
                # print(len(_frames))
                # list_of_frames = torch.stack(_frames)
                list_of_class_name.append(_classname)
                list_of_class_id.append(_classId)
        
        # list_of_frames = torch.stack(list_of_frames)
        list_of_frames_pad = pad_sequence(list_of_frames, batch_first=True)
        return list_of_frames_pad, list_of_class_id, list_of_class_name
        
    
    def __len__(self):
        return len(self.list_of_frames)
    
    def __getitem__(self, idx):
        class_name = self.list_of_class_name[idx]
        class_id = self.list_of_class_id[idx]
        frames = self.list_of_frames[idx]
        # print(frames)
        
        return frames, class_id-1, class_name
                
        
    
    
        

In [6]:
dir='../../dataset/WLASL100/'
data_torch = VideoDataset(dir=dir, transform=transform)


abdomen
abdomen
abdomen
abdomen
abdomen
accent
accent
accent
accent
accent
accept
accept
accept
accept
accept
accept
accept
accept
accident
accident
accident
accident
accident
accident
accident
accident
accident
accident
accident
accident
accident
affect
affect
affect
affect
affect
affect
again
again
again
again
again
again
again
again
ago
ago
ago
ago
ago
ago
ago
ago
ago
aim
aim
aim
aim
aim
already
already
already
already
already
already
already
already
annoy
annoy
annoy
annoy
appear
appear
appear
appear
appear
appear
appear
appointment
appointment
appointment
appointment
appointment
appointment
appointment
appointment
appointment
appointment
approve
approve
approve
approve
approve
approve
approve
approve
arm
arm
arm
arm
arm
arm
arm
arrest
arrest
arrest
arrest
arrest
arrest
article
article
article
article
authority
authority
authority
authority
authority
aware
aware
aware
aware
babysitter
babysitter
babysitter
babysitter
bad
bad
bad
bad
bad
bad
bad
bad
bad
bad
balance
balance
balance
b

In [7]:
o = 0
for i,j,k in data_torch:
    o = o+1
    # print(o)
    print(j)
    print(k)
    print(i)
    print(o)

-1
abdomen
tensor([[[[ 69,  69,  68,  ...,  57,  56,  58],
          [ 69,  69,  68,  ...,  60,  59,  61],
          [ 71,  70,  70,  ...,  67,  66,  66],
          ...,
          [ 39,  40,  40,  ...,  48,  46,  44],
          [ 37,  37,  37,  ...,  47,  45,  43],
          [ 37,  36,  36,  ...,  42,  41,  39]],

         [[126, 126, 125,  ..., 103, 102, 104],
          [126, 126, 125,  ..., 106, 105, 107],
          [126, 126, 125,  ..., 115, 114, 113],
          ...,
          [ 84,  85,  85,  ...,  95,  93,  91],
          [ 82,  82,  82,  ...,  94,  92,  90],
          [ 82,  81,  81,  ...,  89,  88,  86]],

         [[253, 253, 252,  ..., 221, 220, 222],
          [253, 253, 252,  ..., 224, 223, 225],
          [254, 253, 252,  ..., 232, 231, 231],
          ...,
          [213, 214, 214,  ..., 197, 195, 193],
          [211, 211, 211,  ..., 196, 194, 192],
          [211, 210, 210,  ..., 191, 190, 188]]],


        [[[ 69,  69,  69,  ...,  59,  58,  60],
          [ 69,  69,  69

In [8]:
import torch.utils.data.dataloader

train_data, test_data = torch.utils.data.random_split(dataset=data_torch, lengths=([0.8,0.2]))

train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=3, drop_last=True)
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=3, drop_last=True)
print(len(train_data_loader))
print(len(test_data_loader))

186
46


In [9]:
# arsitektur model
from torch import nn
import timm

class arsitekturModel(nn.Module):
    def __init__(self, num_classes, hidden_size, num_lstm_layer):
        super().__init__()
        self.resnet = timm.create_model('mobilenetv4_hybrid_medium', pretrained=True, features_only=True)
        self.avgPool = nn.AdaptiveAvgPool2d((2,2))
        self.lstm = nn.LSTM(128, hidden_size, num_lstm_layer, batch_first = True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x):
        batch, num_frames, channels, height, width = x.shape
        x = torch.reshape(x, (-1, *x.shape[2:]))
        x1,x2,x3,x4,x5 = self.resnet(x)
        
        x = self.avgPool(x1)
        print(x.shape)
        x = nn.Flatten()(x)
        x = torch.reshape(x, (batch, num_frames, -1))
        x, (h0,c0) = self.lstm(x)
        x = h0[-1, ...]
        
        x = self.fc(x)
        x = self.softmax(x)
        
        return x
        

In [10]:
model = arsitekturModel(num_classes=100, hidden_size=20, num_lstm_layer=1)

Unexpected keys (classifier.bias, classifier.weight, conv_head.weight, norm_head.bias, norm_head.num_batches_tracked, norm_head.running_mean, norm_head.running_var, norm_head.weight) found while loading pretrained weights. This may be expected if model is being adapted.


In [11]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)
model.to(device=device)

cuda:0


arsitekturModel(
  (resnet): MobileNetV3Features(
    (conv_stem): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU(inplace=True)
    (blocks): Sequential(
      (0): Sequential(
        (0): EdgeResidual(
          (conv_exp): Conv2d(32, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn1): BatchNormAct2d(
            128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): ReLU(inplace=True)
          )
          (aa): Identity()
          (se): Identity()
          (conv_pwl): Conv2d(128, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn2): BatchNormAct2d(
            48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): Identity()
          )
          (drop_path): Identity()
     

In [26]:
# train
num_classes = 5
batch_size = 4
num_frames = 50
hidden_size = 128
num_lstm_layers = 2
epoch = 100

num_steps = len(train_data_loader)
iterator = iter(train_data_loader)
count_steps = 1   
loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4) 

for i in range(1, epoch):
    running_loss = 0.
    train_correct = 0
    img_batch, label_batch, label_name = next(iterator)
    img_batch, label_batch = img_batch.to(device), label_batch.to(device)
    optimizer.zero_grad()
    output_batch = model(img_batch.float())
    print(output_batch.shape)
    print(label_batch.shape)
    loss = loss_fn(output_batch, label_batch.long())
    
    

torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2])
torch.Size([3, 100])
torch.Size([3])
torch.Size([291, 32, 2, 2

In [15]:
print(len(iterator))

186
