In [26]:
import torch
import time
import numpy as np

# Choose the `x3d_s` model
model_name = 'x3d_xs'
model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)

Using cache found in /data2/tungtx2/.cache/torch/hub/facebookresearch_pytorchvideo_main
Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/X3D_XS.pyth" to /data2/tungtx2/.cache/torch/hub/checkpoints/X3D_XS.pyth


  0%|          | 0.00/29.4M [00:00<?, ?B/s]

In [28]:
# Set to GPU or CPU
device = "cuda"
model = model.eval()
model = model.to(device)
print(sum(p.numel() for p in model.parameters()))

3794274


In [29]:
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)

In [30]:
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)

In [31]:
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

In [35]:
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30
model_transform_params  = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 9,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,   # có tất cả 13 frames sau khi sample
        "sampling_rate": 6,  # 6 frame sample 1 lần
    },
    "x3d_m": {
        "side_size": 256,
        "crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# Get transform parameters based on model
transform_params = model_transform_params[model_name]

# Note that this transform is specific to the slow_R50 model.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=transform_params["side_size"]),
            CenterCropVideo(
                crop_size=(transform_params["crop_size"], transform_params["crop_size"])
            )
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"])/frames_per_second

In [36]:
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
try: urllib.URLopener().retrieve(url_link, video_path)
except: urllib.request.urlretrieve(url_link, video_path)

In [37]:
# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data_transformed = transform(video_data)

# Move the inputs to the desired device
inputs = video_data_transformed["video"]
inputs = inputs.to(device)
print(device)
print(inputs.shape)

cuda
torch.Size([3, 9, 182, 182])


In [29]:
# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=9)

In [30]:
print(video_data['video'].shape)

torch.Size([3, 270, 240, 320])


In [31]:
from PIL import Image

img = video_data['video'][:, 150, :, :].squeeze()
img = img.permute(1, 2, 0).cpu().numpy()
img = (img).astype(np.uint8)
Image.fromarray(img).save('a.jpg')

In [38]:
# Pass the input clip through the model
preds = model(inputs[None, ...])
print(preds.shape)
# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

torch.Size([1, 400])
Top 5 predicted labels: archery, swinging on something, catching fish, abseiling, tying knot (not on a tie)


In [39]:
inputs[None, ...].shape

torch.Size([1, 3, 9, 182, 182])

In [40]:
import time

for _ in range(100):
    preds = model(inputs[None, ...])

ls_time = []
inp = inputs[None, ...]
for _ in range(200):
    start = time.perf_counter()
    preds = model(inp)
    duration = time.perf_counter() - start
    ls_time.append(duration)
    print('time: ', duration)

time:  0.010584637522697449
time:  0.010587416589260101
time:  0.010571297258138657
time:  0.010591801255941391
time:  0.010642018169164658
time:  0.010588988661766052
time:  0.010532133281230927
time:  0.010558705776929855
time:  0.010587714612483978
time:  0.010678272694349289
time:  0.010515183210372925
time:  0.010525703430175781
time:  0.010584317147731781
time:  0.01051100343465805
time:  0.010592576116323471
time:  0.010563179850578308
time:  0.010516002774238586
time:  0.010569557547569275
time:  0.010610073804855347
time:  0.010912492871284485
time:  0.010790873318910599
time:  0.010601475834846497
time:  0.011474568396806717
time:  0.01069733127951622
time:  0.010512519627809525
time:  0.010529927909374237
time:  0.010599169880151749
time:  0.010744195431470871
time:  0.010731671005487442
time:  0.010596714913845062
time:  0.01057353988289833
time:  0.010655093938112259
time:  0.010532494634389877
time:  0.010617826133966446
time:  0.010543085634708405
time:  0.01060834899544

In [18]:
time.perf_counter() - time.perf_counter()

-2.8312206268310547e-07

In [41]:
print(model)

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv2plus1d(
        (conv_t): Conv3d(3, 24, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
        (conv_xy): Conv3d(24, 24, kernel_size=(5, 1, 1), stride=(1, 1, 1), padding=(2, 0, 0), groups=24, bias=False)
      )
      (norm): BatchNorm3d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(24, 24, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(24, 54, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
            (norm_a): BatchNorm3d(54, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(54, 54, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), groups=54, bias=False)
            (nor

In [47]:
model.blocks[4].res_blocks[-1]

ResBlock(
  (branch2): BottleneckBlock(
    (conv_a): Conv3d(192, 432, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
    (norm_a): BatchNorm3d(432, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act_a): ReLU()
    (conv_b): Conv3d(432, 432, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=432, bias=False)
    (norm_b): Sequential(
      (0): BatchNorm3d(432, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): SqueezeExcitation(
        (block): Sequential(
          (0): Conv3d(432, 32, kernel_size=(1, 1, 1), stride=(1, 1, 1))
          (1): ReLU()
          (2): Conv3d(32, 432, kernel_size=(1, 1, 1), stride=(1, 1, 1))
          (3): Sigmoid()
        )
      )
    )
    (act_b): Swish()
    (conv_c): Conv3d(432, 192, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
    (norm_c): BatchNorm3d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (activation): ReLU()
)

In [48]:
import copy

cl_model = copy.deepcopy(model)

In [50]:
out = model.blocks(inp)

NotImplementedError: Module [ModuleList] is missing the required "forward" function