In [50]:
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

import time
import gluoncv as gcv
from gluoncv.utils import try_import_cv2
# OpenCV is obtained through gluoncv's helper instead of a direct `import cv2`.
# NOTE(review): presumably the helper raises a descriptive error when OpenCV
# is not installed — confirm against the gluoncv documentation.
cv2 = try_import_cv2()

import warnings
# Suppress all Python warnings from this point on. Be aware that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

#### A real-time webcam implementation requires some code changes: capture the video in 32-frame clips and pass each clip to the model for inference, then use `cv2.putText` to overlay the predicted action on the video. Playback will lag by an amount that depends on the GPU and the inference speed. Because no suitable GPU was available, real-time inference is not implemented here.

In [None]:
### Sketch of real-time webcam capture (left commented out; not implemented).

# clip_input = []
# cap = cv2.VideoCapture(0)
# num_frames = 32

# for i in range(num_frames):
#     ret, frame = cap.read()
#     cv2.imshow('frame', frame)
#     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
#     clip_input.append(frame)
    
# # After the loop release the cap object
# cap.release()
# # Destroy all the windows
# cv2.destroyAllWindows()

### Use the webcam to record a sample video of me drinking, to serve as the input for the inference test

In [None]:
# Record the default webcam to 'output.avi' until the user presses 'q'
# in the preview window (or the camera stops delivering frames).

# Open the default webcam (device index 0) and fail early if it is unavailable.
vid = cv2.VideoCapture(0)
if not vid.isOpened():
    raise RuntimeError('Could not open the webcam (device index 0).')
time.sleep(1)  # brief pause before the first frame is read

# Size the writer from what the camera actually reports instead of assuming
# 640x480: a VideoWriter expects every frame to match its declared size.
frame_width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the codec and create the VideoWriter object (XVID, 20 fps).
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('output.avi', fourcc, 20.0, (frame_width, frame_height))

try:
    while True:
        # Grab the next frame; `ret` is False when no frame could be read.
        ret, frame = vid.read()
        if not ret:
            # Stop instead of passing an invalid frame to write()/imshow().
            break

        # Save the frame as captured, then show it in the preview window.
        out.write(frame)
        cv2.imshow('frame', frame)

        # 'q' is the quit key; waitKey also lets the preview window refresh.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the camera and the writer, and close the preview
    # window, even if the loop above is interrupted by an exception.
    vid.release()
    out.release()
    cv2.destroyAllWindows()

### Extract 32 frames from the video (every second frame of the first 64)

In [68]:
from gluoncv.utils.filesystem import try_import_decord
decord = try_import_decord()

# Sample every second frame among the first 64 frames of the recording,
# which yields the 32-frame clip that the later cells reshape for the model.
vr = decord.VideoReader('output.avi')
frame_id_list = range(0, 64, 2)

# Fail with a clear message when the recording is too short, instead of
# letting get_batch() raise an out-of-range error on the missing frames.
if len(vr) <= frame_id_list[-1]:
    raise ValueError(
        "'output.avi' has only %d frames but at least %d are required; "
        "record a longer clip." % (len(vr), frame_id_list[-1] + 1)
    )

video_data = vr.get_batch(frame_id_list).asnumpy()
# Split the batch along its first axis into a list with one array per frame.
clip_input = list(video_data)

### Transform the frames to 224x224, then stack and reshape them to match the model's input requirements

In [69]:
# Validation-time transform: 224x224 output with per-channel mean/std
# normalization.
transform_fn = video.VideoGroupValTransform(
    size=224,
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Transform the frame list, stack it into a single array, group it as
# (clips, 32, 3, 224, 224) and finally swap axes 1 and 2 so the result is
# laid out as (clips, 3, 32, 224, 224).
clip_input = (
    np.stack(transform_fn(clip_input), axis=0)
    .reshape((-1, 32, 3, 224, 224))
    .transpose((0, 2, 1, 3, 4))
)

### Load model

In [76]:
# Name of the pretrained network to fetch from the gluoncv model zoo.
# Alternative kept for reference:
# model_name = 'i3d_inceptionv1_kinetics400'
model_name = 'i3d_resnet50_v1_kinetics400'

# 400 output classes, with pretrained weights (downloaded on first use, as
# the cell output below shows).
model_kwargs = dict(nclass=400, pretrained=True)
net = get_model(model_name, **model_kwargs)

Downloading C:\Users\lizar/.mxnet/models\i3d_resnet50_v1_kinetics400-568a722e.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/i3d_resnet50_v1_kinetics400-568a722e.zip...


100%|████████████████████████████████████████████████████████████████████████| 208483/208483 [00:24<00:00, 8379.80KB/s]


In [79]:
# Run the network on the prepared clip and report the top-K predicted classes.
pred = net(nd.array(clip_input))

classes = net.classes
topK = 5

# Indices of the K highest-scoring classes for the first (only) clip.
ind = nd.topk(pred, k=topK)[0].astype('int')

# Compute the softmax once and copy both results to NumPy up front, rather
# than re-running softmax and calling asscalar() twice on every iteration.
probabilities = nd.softmax(pred)[0].asnumpy()
top_class_ids = ind.asnumpy()

print('The input video clip is classified to be')
for class_id in top_class_ids:
    print('\t[%s], with probability %.3f.'%
          (classes[class_id], probabilities[class_id]))

The input video clip is classified to be
	[drinking], with probability 0.851.
	[drinking_beer], with probability 0.148.
	[drinking_shots], with probability 0.001.
	[tasting_beer], with probability 0.001.
	[gargling], with probability 0.000.
