In [14]:
dataset_root_path = "/data/ZHO/formats/ucf101_letters_only"


In [15]:
import pathlib
dataset_root_path = pathlib.Path(dataset_root_path)

In [16]:
video_count_train = len(list(dataset_root_path.glob("train/*/*.mp4")))
video_count_val = len(list(dataset_root_path.glob("val/*/*.mp4")))
video_count_test = len(list(dataset_root_path.glob("test/*/*.mp4")))
video_total = video_count_train + video_count_val + video_count_test
print(f"Total videos: {video_total}")

Total videos: 128


In [17]:
all_video_file_paths = (
    list(dataset_root_path.glob("train/*/*.mp4"))
    + list(dataset_root_path.glob("val/*/*.mp4"))
    + list(dataset_root_path.glob("test/*/*.mp4"))
)
all_video_file_paths[:5]

[PosixPath('/data/ZHO/formats/ucf101_letters_only/train/haa/Original_haa.mp4'),
 PosixPath('/data/ZHO/formats/ucf101_letters_only/train/haa/Ganzo_haa.mp4'),
 PosixPath('/data/ZHO/formats/ucf101_letters_only/train/khaa/Original_khaa.mp4'),
 PosixPath('/data/ZHO/formats/ucf101_letters_only/train/khaa/Ganzo_khaa.mp4'),
 PosixPath('/data/ZHO/formats/ucf101_letters_only/train/aeen/Original_aeen.mp4')]

In [18]:
class_labels = sorted({str(path).split("/")[6] for path in all_video_file_paths})
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")
print(len(id2label))

Unique classes: ['aeen', 'alif', 'baa', 'daad', 'daal', 'faa', 'gaen', 'haa', 'haa_1', 'hamza_wow', 'hamza_yaa', 'jeem', 'kaaf', 'khaa', 'laa', 'laam', 'meem', 'noon', 'qaaf', 'raa', 'saa', 'saad', 'seen', 'sheen', 'taa', 'taa_1', 'tua', 'wow', 'yaa', 'zaai', 'zaal', 'zua'].
32


In [19]:
import pytorchvideo.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

In [7]:
dataset_root_path = "/data/ZHO/formats/ucf101_letters_only"


In [20]:
import os
from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification, VideoMAEImageProcessor, VideoMAEModel
import numpy as np
import torch

feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")

image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

mean = feature_extractor.image_mean
std = feature_extractor.image_std
resize_to = feature_extractor.size
print(resize_to)
num_frames_to_sample = model.config.num_frames
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps

print(resize_to,clip_duration,num_frames_to_sample)

Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEModel: ['mask_token', 'decoder.decoder_layers.3.layernorm_after.weight', 'decoder.decoder_layers.1.intermediate.dense.bias', 'decoder.decoder_layers.2.attention.attention.value.weight', 'decoder.decoder_layers.2.attention.attention.query.weight', 'decoder.decoder_layers.1.attention.attention.query.weight', 'decoder.decoder_layers.3.attention.output.dense.bias', 'decoder.decoder_layers.2.intermediate.dense.weight', 'decoder.decoder_layers.3.intermediate.dense.bias', 'decoder.decoder_layers.0.attention.attention.value.weight', 'decoder.decoder_layers.1.attention.attention.value.weight', 'decoder.decoder_layers.3.output.dense.bias', 'decoder.decoder_layers.0.layernorm_before.weight', 'decoder.decoder_layers.0.output.dense.bias', 'decoder.decoder_layers.0.intermediate.dense.weight', 'decoder.decoder_layers.3.layernorm_before.bias', 'decoder.norm.bias', 'decoder.decoder_layers.0.attention.o

{'shortest_edge': 224}
{'shortest_edge': 224} 2.1333333333333333 16


In [21]:
batch_size = 16
num_workers = 1

In [22]:
# Training dataset transformations.
train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    RandomShortSideScale(min_size=256, max_size=320),
                    Resize((resize_to, resize_to)),
                    #RandomCrop(resize_to),
                    #RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)
# Training dataset.
train_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "train"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration),
    decode_audio=False,
    transform=train_transform,
)
train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size= batch_size ,
            num_workers= num_workers,
        )
# Validation and evaluation datasets' transformations.
val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize((resize_to, resize_to)),
                ]
            ),
        ),
    ]
)

# Validation and evaluation datasets.
val_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "val"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)
test_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "test"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)

test_dataloader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size = batch_size,
            num_workers = num_workers,
        )

In [1]:
from transformers import VideoMAEConfig, VideoMAEModel

# Initializing a VideoMAE videomae-base style configuration
configuration = VideoMAEConfig()

# Randomly initializing a model from the configuration
model = VideoMAEModel(configuration)

# Accessing the model configuration
configuration = model.config

2022-12-05 07:38:55.910327: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-05 07:38:56.022468: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-05 07:38:56.538005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-12-05 07:38:56.538037: W tensorflow/

In [2]:
configuration

VideoMAEConfig {
  "attention_probs_dropout_prob": 0.0,
  "decoder_hidden_size": 384,
  "decoder_intermediate_size": 1536,
  "decoder_num_attention_heads": 6,
  "decoder_num_hidden_layers": 4,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "videomae",
  "norm_pix_loss": true,
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_frames": 16,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.25.0.dev0",
  "tubelet_size": 2,
  "use_mean_pooling": true
}

In [3]:
#model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
from decord import VideoReader, cpu
import numpy as np

from transformers import VideoMAEImageProcessor, VideoMAEModel
from huggingface_hub import hf_hub_download


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))

# sample 16 frames
videoreader.seek(0)
indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
video = videoreader.get_batch(indices).asnumpy()

image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

# prepare video for the model
inputs = image_processor(list(video), return_tensors="pt")

# forward pass
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEModel: ['decoder.decoder_layers.2.layernorm_after.weight', 'decoder.norm.weight', 'decoder.decoder_layers.2.layernorm_before.weight', 'decoder.decoder_layers.0.attention.attention.key.weight', 'decoder.decoder_layers.3.attention.attention.key.weight', 'decoder.decoder_layers.1.output.dense.weight', 'decoder.decoder_layers.0.attention.attention.q_bias', 'decoder.decoder_layers.3.layernorm_before.bias', 'decoder.decoder_layers.2.attention.attention.key.weight', 'decoder.decoder_layers.2.intermediate.dense.weight', 'decoder.decoder_layers.0.layernorm_after.weight', 'decoder.decoder_layers.1.intermediate.dense.bias', 'decoder.decoder_layers.1.attention.attention.q_bias', 'decoder.decoder_layers.3.attention.output.dense.bias', 'mask_token', 'decoder.decoder_layers.3.layernorm_after.bias', 'decoder.decoder_layers.0.output.dense.weight', 'decoder.decoder_layers.3.output.dense.weight', 'decode

[1, 1568, 768]

In [4]:
model

VideoMAEModel(
  (embeddings): VideoMAEEmbeddings(
    (patch_embeddings): VideoMAEPatchEmbeddings(
      (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
  )
  (encoder): VideoMAEEncoder(
    (layer): ModuleList(
      (0): VideoMAELayer(
        (attention): VideoMAEAttention(
          (attention): VideoMAESelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=False)
            (key): Linear(in_features=768, out_features=768, bias=False)
            (value): Linear(in_features=768, out_features=768, bias=False)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): VideoMAESelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): VideoMAEIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation

In [9]:
outputs.last_hidden_state.shape

torch.Size([1, 1568, 768])

In [15]:
torch.backends.cudnn.version()

8302

In [16]:
torch.backends.cudnn.enabled 

True

In [12]:
import torch

In [13]:

optimizer = torch.optim.AdamW(model.parameters())


In [14]:
for i , _ in enumerate(train_dataloader):
    print(i)

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 32, in fetch
    data.append(next(self.dataset_iter))
  File "/usr/local/lib/python3.8/dist-packages/pytorchvideo/data/labeled_video_dataset.py", line 227, in __next__
    sample_dict = self._transform(sample_dict)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/transforms.py", line 94, in __call__
    img = t(img)
  File "/usr/local/lib/python3.8/dist-packages/pytorchvideo/transforms/transforms.py", line 30, in __call__
    x[self._key] = self._transform(x[self._key])
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/transforms.py", line 94, in __call__
    img = t(img)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/transforms.py", line 349, in forward
    return F.resize(img, self.size, self.interpolation, self.max_size, self.antialias)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/functional.py", line 432, in resize
    return F_t.resize(img, size=size, interpolation=interpolation.value, max_size=max_size, antialias=antialias)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/functional_tensor.py", line 496, in resize
    img = interpolate(img, size=[new_h, new_w], mode=interpolation, align_corners=align_corners, antialias=antialias)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py", line 3938, in interpolate
    return torch._C._nn.upsample_bilinear2d(input, output_size, align_corners, scale_factors)
TypeError: upsample_bilinear2d() received an invalid combination of arguments - got (Tensor, list, bool, NoneType), but expected one of:
 * (Tensor input, tuple of ints output_size, bool align_corners, tuple of floats scale_factors)
      didn't match because some of the arguments have invalid types: (Tensor, !list!, bool, !NoneType!)
 * (Tensor input, tuple of ints output_size, bool align_corners, float scales_h, float scales_w, *, Tensor out)



In [23]:
from pytorch_metric_learning import miners, losses
miner = miners.MultiSimilarityMiner()
loss_func = losses.TripletMarginLoss()

# your training loop
for i, (data, labels) in enumerate(train_dataloader):
    optimizer.zero_grad()
    embeddings = model(data)

    hard_pairs = miner(embeddings, labels)
    loss = loss_func(embeddings, labels, hard_pairs)
    print(loss)
    loss.backward()
    optimizer.step()

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 32, in fetch
    data.append(next(self.dataset_iter))
  File "/usr/local/lib/python3.8/dist-packages/pytorchvideo/data/labeled_video_dataset.py", line 227, in __next__
    sample_dict = self._transform(sample_dict)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/transforms.py", line 94, in __call__
    img = t(img)
  File "/usr/local/lib/python3.8/dist-packages/pytorchvideo/transforms/transforms.py", line 30, in __call__
    x[self._key] = self._transform(x[self._key])
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/transforms.py", line 94, in __call__
    img = t(img)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/transforms.py", line 349, in forward
    return F.resize(img, self.size, self.interpolation, self.max_size, self.antialias)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/functional.py", line 432, in resize
    return F_t.resize(img, size=size, interpolation=interpolation.value, max_size=max_size, antialias=antialias)
  File "/usr/local/lib/python3.8/dist-packages/torchvision/transforms/functional_tensor.py", line 496, in resize
    img = interpolate(img, size=[new_h, new_w], mode=interpolation, align_corners=align_corners, antialias=antialias)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py", line 3938, in interpolate
    return torch._C._nn.upsample_bilinear2d(input, output_size, align_corners, scale_factors)
TypeError: upsample_bilinear2d() received an invalid combination of arguments - got (Tensor, list, bool, NoneType), but expected one of:
 * (Tensor input, tuple of ints output_size, bool align_corners, tuple of floats scale_factors)
      didn't match because some of the arguments have invalid types: (Tensor, !list!, bool, !NoneType!)
 * (Tensor input, tuple of ints output_size, bool align_corners, float scales_h, float scales_w, *, Tensor out)



In [17]:
print(train_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7ff380090e50>


In [None]:
def train_dataloader(self):
    """
    Create the Kinetics train partition from the list of video labels
    in {self._DATA_PATH}/train.csv. Add transform that subsamples and
    normalizes the video before applying the scale, crop and flip augmentations.
    """
    train_transform = Compose(
        [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                UniformTemporalSubsample(8),
                Lambda(lambda x: x / 255.0),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                RandomShortSideScale(min_size=256, max_size=320),
                RandomCrop(244),
                RandomHorizontalFlip(p=0.5),
                ]
            ),
            ),
        ]
    )
    train_dataset = pytorchvideo.data.Ucf101(
        data_path=os.path.join(self._DATA_PATH, "train.csv"),
        clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
        transform=train_transform
    )
    return torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )


In [1]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("torchlogs/")
#model = Net()
writer.add_graph(model, X)
writer.close()

2022-12-05 13:59:44.390603: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-05 13:59:44.491413: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-05 13:59:44.958666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-12-05 13:59:44.958699: W tensorflow/

NameError: name 'model' is not defined

In [7]:
img_path = '/data/ZHO/formats/ucf101_letters_only/'
annnotation_path = '/data/ZHO/formats/ucf101_letters_only/annotations/'

batch_size =24


In [5]:
import torchvision

In [11]:
import os
from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification, VideoMAEImageProcessor, VideoMAEModel
import numpy as np
import torch

feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")

image_processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

mean = feature_extractor.image_mean
std = feature_extractor.image_std
resize_to = feature_extractor.size
print(resize_to)
num_frames_to_sample = model.config.num_frames
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps

print(resize_to,clip_duration,num_frames_to_sample)

Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEModel: ['mask_token', 'decoder.decoder_layers.3.layernorm_after.weight', 'decoder.decoder_layers.1.intermediate.dense.bias', 'decoder.decoder_layers.2.attention.attention.value.weight', 'decoder.decoder_layers.2.attention.attention.query.weight', 'decoder.decoder_layers.1.attention.attention.query.weight', 'decoder.decoder_layers.3.attention.output.dense.bias', 'decoder.decoder_layers.2.intermediate.dense.weight', 'decoder.decoder_layers.3.intermediate.dense.bias', 'decoder.decoder_layers.0.attention.attention.value.weight', 'decoder.decoder_layers.1.attention.attention.value.weight', 'decoder.decoder_layers.3.output.dense.bias', 'decoder.decoder_layers.0.layernorm_before.weight', 'decoder.decoder_layers.0.output.dense.bias', 'decoder.decoder_layers.0.intermediate.dense.weight', 'decoder.decoder_layers.3.layernorm_before.bias', 'decoder.norm.bias', 'decoder.decoder_layers.0.attention.o

{'shortest_edge': 224}
{'shortest_edge': 224} 2.1333333333333333 16


In [12]:

# Training dataset transformations.
train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    RandomShortSideScale(min_size=256, max_size=320),
                    Resize((resize_to, resize_to)),
                    #RandomCrop(resize_to),
                    #RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)


val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize((resize_to, resize_to)),
                ]
            ),
        ),
    ]
)

In [13]:
train_dataset = torchvision.datasets.UCF101(root = img_path, annotation_path= annnotation_path, frames_per_clip=16 , transform=train_transform)
test_dataset = torchvision.datasets.UCF101(root = img_path, annotation_path= annnotation_path, frames_per_clip=16 ,train = False, transform=val_transform)




FileNotFoundError: Found no valid file for the classes annotations, test, train, val. Supported extensions are: avi