# This is a live demo of video action recognition using two-stream architecture


This notebook clones my repo, downloads the trained models from Google Drive, and uses them to run inference in a live frame-level demo. An output video is then generated showing the predictions for each frame.

I suggest running all the cells, taking a 5-minute break, and then viewing the output video :D

# Starting with the installation process


This will clone my repo and download the models from Google Drive.
I used the gdown.pl tool to download my public checkpoints from Drive without any authentication overhead.

In [1]:
import os
!git clone https://github.com/mohammed-elkomy/two-stream-action-recognition.git
os.chdir("/content/two-stream-action-recognition")

!git clone https://github.com/circulosmeos/gdown.pl.git
!./gdown.pl/gdown.pl https://drive.google.com/file/d/1djGzpxAYFvNX-UaQ7ONqDHGgnzc8clBK/view "spatial.zip" 
!./gdown.pl/gdown.pl https://drive.google.com/file/d/1kvslNL8zmZYaHRmhgAM6-l_pNDDA0EKZ/view "motion.zip"
!unzip spatial.zip
!unzip motion.zip

!pip install -U -q PyDrive 2> s.txt >> s.txt
!pip install opencv-python 2> s.txt >> s.txt
!pip install imgaug 2> s.txt >> s.txt
!pip install scikit-video 2> s.txt >> s.txt

Cloning into 'two-stream-action-recognition'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects:  16% (1/6)   [Kremote: Counting objects:  33% (2/6)   [Kremote: Counting objects:  50% (3/6)   [Kremote: Counting objects:  66% (4/6)   [Kremote: Counting objects:  83% (5/6)   [Kremote: Counting objects: 100% (6/6)   [Kremote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 224 (delta 2), reused 6 (delta 2), pack-reused 218[K
Receiving objects: 100% (224/224), 50.15 MiB | 47.02 MiB/s, done.
Resolving deltas: 100% (46/46), done.
Cloning into 'gdown.pl'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 81 (delta 22), reused 32 (delta 14), pack-reused 40[K
Unpacking objects: 100% (81/81), done.
Cannot open cookies file ‘gdown.cookie.temp’: No such file or directory
--2019-04-08 17:36:57--  https://d

# Showing the demo


The demo starts by selecting one random video from the **testing video samples** folder stored in my repo. The folder contains 100 videos from the test dataset, and you can add more. Originally it was possible to fetch a single video by name with an HTTP request, but the UCF101 site has changed a little and this is no longer possible.

In [0]:
# When True the demo video is rendered with the larger figure size and larger
# fonts; set it to False for a smaller output video.
high_resolution_video = True # for good internet :D

In [0]:
# importing those only once
import cv2
from imgaug import augmenters as iaa

from evaluation import legacy_load_model
from evaluation.evaluation import *

import random
from frame_dataloader import DataUtil

import matplotlib.pyplot as plt
import numpy as np

import skvideo.io

import io
import base64
from IPython.display import HTML

from matplotlib import gridspec

In [0]:
# Shared configuration and lookup tables used by the helper functions and cells below

# Dataset metadata read from the UCF split lists (split "01")
data_util = DataUtil(path= './UCF_list/', split="01")
# data_util maps action name -> label; invert it so a predicted label can be shown by name
action_names = {label: action for action, label in data_util.action_to_label.items()}

# number of consecutive optical-flow frames stacked into one motion-stream sample
stacked_frames = 10

# Resizes every frame to 299x299 before it is fed to the networks
resize_to_network_input = iaa.Scale({"height": 299, "width": 299})
augmenter = iaa.Sequential([resize_to_network_input])


def convert_to_image(flow_image, lower_bound=-20, upper_bound=20):
    """
    Quantise one optical-flow component into an 8-bit grayscale image.

    Based on the cpp version of extracting optical flow
    https://github.com/feichtenhofer/gpu_flow/blob/master/compute_flow.cpp

    Flow values are first clamped to [lower_bound, upper_bound] and then mapped
    linearly onto [0, 255]. Without the clamp, displacements outside the bounds
    would leave the 0..255 range and wrap around when cast to uint8, turning
    very fast motion into arbitrary gray levels.

    :param flow_image: numpy array holding one flow component (u or v)
    :param lower_bound: flow value mapped to 0 (default -20)
    :param upper_bound: flow value mapped to 255 (default 20)
    :return: uint8 numpy array with the same shape as flow_image
    """
    # NOTE(review): the referenced C++ tool appears to round to nearest; the
    # truncating cast is kept here so in-range outputs are unchanged - confirm.
    clamped = np.clip(flow_image, lower_bound, upper_bound)
    return (255 * (clamped - lower_bound) / (upper_bound - lower_bound)).astype(np.uint8)


def stack_opticalflow(start_frame_index, stacked_frames):  # returns numpy (h,w,stacked*2) = one sample
    """
    Builds one motion-stream sample from "stacked_frames" consecutive flow frames.

    Reads the module-level lists original_u_frames / original_v_frames starting at
    start_frame_index. The result is a uint8 array of shape (h,w,stacked*2) whose first
    "stacked_frames" channels hold the u frames and whose remaining channels hold the
    matching v frames.
    """
    leading_u = original_u_frames[start_frame_index]  # horizontal
    leading_v = original_v_frames[start_frame_index]  # vertical

    # channel axis is laid out as [u_0 .. u_{n-1}, v_0 .. v_{n-1}]
    sample = np.zeros(leading_u.shape + (2 * stacked_frames,), dtype=np.uint8)

    sample[..., 0] = leading_u
    sample[..., stacked_frames] = leading_v

    # remaining frames of the window; offset doubles as the u-channel index
    for offset in range(1, stacked_frames):
        frame_id = start_frame_index + offset
        sample[..., offset] = original_u_frames[frame_id]
        sample[..., offset + stacked_frames] = original_v_frames[frame_id]

    return sample


def get_image_from_fig(fig):
    """
    Converts a matplotlib figure into a numpy array for demo video generation.

    The figure is rendered first, then its pixel buffer is returned as a
    writable uint8 array of shape (height, width, 3) in RGB order.

    :param fig: figure whose canvas provides draw(), get_width_height() and
                either tostring_rgb() or buffer_rgba()
    :return: numpy uint8 array (height, width, 3)
    """
    canvas = fig.canvas
    canvas.draw()

    if hasattr(canvas, "tostring_rgb"):
        width, height = canvas.get_width_height()
        # np.frombuffer replaces the deprecated binary mode of np.fromstring;
        # it returns a read-only view, so copy to hand back a writable frame.
        pixels = np.frombuffer(canvas.tostring_rgb(), dtype=np.uint8)
        return pixels.reshape((height, width, 3)).copy()

    # Newer matplotlib canvases no longer offer tostring_rgb; read the RGBA
    # buffer instead and drop the alpha channel.
    rgba = np.asarray(canvas.buffer_rgba())
    return rgba[..., :3].copy()


### loading keras models just downloaded from drive (loaded once)

In [5]:
# legacy_load_model is an older version of keras load_model, needed because the keras API
# changed a little bit while these checkpoints were being trained

# Both checkpoints were saved with the same custom loss / metrics, which must be
# supplied by name when the models are restored.
stream_custom_objects = {
    'sparse_categorical_cross_entropy_loss': sparse_categorical_cross_entropy_loss,
    "acc_top_1": acc_top_1,
    "acc_top_5": acc_top_5,
}

# spatial stream -> RAM (each call receives its own copy of the mapping)
print("Loading Spatial stream")
spatial_model_restored = legacy_load_model(filepath="spatial.h5", custom_objects=dict(stream_custom_objects))
spatial_model_restored.summary()


# motion stream -> RAM
print("Loading Motion stream")
motion_model_restored = legacy_load_model(filepath="motion.h5", custom_objects=dict(stream_custom_objects))
motion_model_restored.summary()

Loading Spatial stream
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_spatial (InputLayer)   (None, 299, 299, 3)       0         
_________________________________________________________________
data_norm (BatchNormalizatio (None, 299, 299, 3)       6         
_________________________________________________________________
xception (Model)             (None, 10, 10, 2048)      20861480  
_________________________________________________________________
avg_pool (GlobalAveragePooli (None, 2048)              0         
_________________________________________________________________
predictions (Dense)          (None, 101)               206949    
Total params: 21,068,435
Trainable params: 21,013,901
Non-trainable params: 54,534
________________________________________________

### Selecting one video and process it for RGB frames and optical flow frames
Optical flow frames are computed using TVL1, which is never real-time on a CPU and might take a few minutes for long videos. (I process them on the CPU because the GPU version requires building OpenCV from source and some other fiddly setup that is not helpful for a short demo.)

In [19]:
# Pick one clip at random from the sample folder shipped with the repo
samples_dir = "testing video samples"
video_name = random.choice(os.listdir(samples_dir))
selected_video = os.path.join(samples_dir, video_name)
print("selected_video:", selected_video)

# Open the clip and report the frame rate stored in the file
vidcap = cv2.VideoCapture(selected_video)
print("frame rate for demo:", vidcap.get(cv2.CAP_PROP_FPS))

# Read the first frame now; the next cell keeps reading while success stays True
success, image = vidcap.read()

selected_video: testing video samples/v_HammerThrow_g23_c05.avi
frame rate for demo: 25.0


In [20]:
# make the rgb frames
# (frames come straight from OpenCV, i.e. in BGR channel order; they are
# converted to RGB further down where needed)
original_rgb_frames = []

while success:
    original_rgb_frames.append(image)
    success, image = vidcap.read()

# every frame is in memory now, so free the decoder and the file handle
vidcap.release()

print("frames count in video", len(original_rgb_frames))

# make the optical flow frames
original_v_frames = []
original_u_frames = []

# TVL1 runs on grayscale float frames scaled to [0, 1]
frames = [
    cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
    for frame in original_rgb_frames
]
optical_flow = cv2.optflow.DualTVL1OpticalFlow_create()

# one flow field per pair of consecutive frames -> len(frames) - 1 flow frames
for frame_index, (previous_frame, next_frame) in enumerate(zip(frames, frames[1:])):
    if frame_index % 10 == 0:
        print("processing tvl flow of frame ",frame_index)

    flow = optical_flow.calc(previous_frame, next_frame, None)
    original_u_frames.append(convert_to_image(flow[..., 0]))  # horizontal component
    original_v_frames.append(convert_to_image(flow[..., 1]))  # vertical component

print("original_rgb_frames:", len(original_rgb_frames), "original_u_frames:", len(original_u_frames), "original_v_frames:", len(original_v_frames))

# generate spatial batch as done in the dataloader
spatial_batch = [cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB) for bgr_frame in original_rgb_frames]
spatial_batch = np.array(augmenter.augment_images(spatial_batch), dtype=np.float32) / 255.0

# generate motion batch as done in the dataloader
# each sample is (h,w,stacked*2) and starts at a different flow frame
# NOTE(review): the last full window (start = len(original_u_frames) - stacked_frames)
# is not included by this range - confirm this matches the training dataloader.
motion_batch = [
    stack_opticalflow(start_frame_index=first_optical_frame_id, stacked_frames=stacked_frames)
    for first_optical_frame_id in range(len(original_u_frames) - stacked_frames)
]
motion_batch = np.array(augmenter.augment_images(motion_batch), dtype=np.float32) / 255.0

frames count in video 139
processing tvl flow of frame  0
processing tvl flow of frame  10
processing tvl flow of frame  20
processing tvl flow of frame  30
processing tvl flow of frame  40
processing tvl flow of frame  50
processing tvl flow of frame  60
processing tvl flow of frame  70
processing tvl flow of frame  80
processing tvl flow of frame  90
processing tvl flow of frame  100
processing tvl flow of frame  110
processing tvl flow of frame  120
processing tvl flow of frame  130
original_rgb_frames: 139 original_u_frames: 138 original_v_frames: 138


### Predict the output of each frame organized in the batch

In [0]:
# For every stream: *_pred holds the per-frame scores over all classes,
# *_classes the 5 best class ids per frame (best first) and *_scores the
# matching 5 scores (highest first).

# predict spatial stream output
spatial_pred = spatial_model_restored.predict(spatial_batch)
spatial_classes = np.argsort(spatial_pred,axis=1)[:,:-6:-1]
spatial_scores = np.sort(spatial_pred,axis=1)[:,:-6:-1]

# predict motion stream output
motion_pred = motion_model_restored.predict(motion_batch)
motion_classes = np.argsort(motion_pred,axis=1)[:,:-6:-1]
motion_scores = np.sort(motion_pred,axis=1)[:,:-6:-1]

# get the average output prediction
# There are fewer motion samples than rgb frames, so the spatial predictions are
# cut to the same length. Dividing by 2 makes this a real average: the fused
# scores stay on the same scale as the single-stream scores (and inside the
# 0..1 axis of the bar charts) instead of being a sum of the two.
average_pred = (motion_pred + spatial_pred[:motion_pred.shape[0]]) / 2.0
average_classes = np.argsort(average_pred,axis=1)[:,:-6:-1]
average_scores = np.sort(average_pred,axis=1)[:,:-6:-1]

In [0]:
def make_bar_chart(classes,scores):
    """
    Draws one bar per predicted class on the current matplotlib axes.

    classes: class ids, translated to names through action_names
    scores: numpy array with the score of each class, used as bar heights
    The y axis is fixed to [0, 1] and every class name is written rotated at
    mid-height, centred on its bar.
    """
    labels = [action_names[class_index] for class_index in classes]
    positions = np.arange(len(labels))

    rectangles = plt.bar(positions, scores.tolist(), color=['yellow', 'red', 'green', 'blue', 'cyan'])
    plt.ylim(bottom=0, top=1)

    # the labels are drawn inside the plot rather than as x tick labels
    label_font = {'fontsize': 13 if high_resolution_video else 10}
    for rect, label in zip(rectangles, labels):
        plt.text(rect.get_x() + rect.get_width()/2.0, .5, label, ha='center', va='center', rotation=75,fontdict=label_font)

    

In [23]:
# Define the codec and create VideoWriter object.The output is stored in 'demo.mp4' file.

writer = skvideo.io.FFmpegWriter("demo.mp4", inputdict={
      '-r': '16',
    })

# 2x3 grid: top row shows the images, bottom row shows the score bar charts
gs = gridspec.GridSpec(2, 3,
                       width_ratios=[1, 1, 1],
                       height_ratios=[1.5, 1]
                       )

gs.update(wspace=0.2, hspace=0)

# figure size and font sizes depend only on the resolution flag
if high_resolution_video:
    figure_size, suptitle_size, headline_size, panel_title_size = (16, 12), 24, 18, 16
else:
    figure_size, suptitle_size, headline_size, panel_title_size = (9, 6), 16, 13, 13

# The video-level predictions (mean score over all frames) are the same for
# every frame, so they are computed once here instead of inside the loop.
headlines = [
    (y_position, template.format(action_names[np.mean(predictions, axis=0).argmax()]), color)
    for y_position, template, predictions, color in (
        (0.91, "Average Prediction from spatial stream: {}", spatial_pred, 'r'),
        (.87, "Average Prediction from motion stream: {}", motion_pred, 'g'),
        (.83, "Average Prediction from both streams: {}", average_pred, 'b'),
    )
]

# generating output video
# try/finally: the writer is closed (and the file finalised) even if a frame fails
try:
    for frame_index in range(motion_classes.shape[0]):
        if frame_index % 10 == 0:
            print("processing frame ",frame_index)

        fig = plt.figure(figsize=figure_size)
        try:
            fig.suptitle("Demo for {}".format(video_name), fontsize=suptitle_size)
            for y_position, headline, color in headlines:
                fig.text(.125, y_position, headline, color=color, fontsize=headline_size)

            ##########################################################
            # top row: rgb frame and the two optical flow components
            image_panels = (
                # frames are stored in BGR order, matplotlib expects RGB
                ("RGB frame", cv2.cvtColor(original_rgb_frames[frame_index], cv2.COLOR_BGR2RGB), None),
                ("TVL1 Optical flow u-frame", original_u_frames[frame_index], "inferno"),  # viridis,inferno,plasma,magma
                ("TVL1 Optical flow v-frame", original_v_frames[frame_index], "inferno"),
            )
            for cell, (title, picture, colormap) in enumerate(image_panels):
                ax = plt.subplot(gs[cell])
                ax.set_title(title, fontsize=panel_title_size)
                ax.get_yaxis().set_visible(False)
                ax.get_xaxis().set_visible(False)
                ax.imshow(picture, cmap=colormap)

            ##########################################################
            # bottom row: prediction scores of each stream and of their fusion
            score_panels = (
                ("Spatial Stream Output scores", spatial_classes, spatial_scores),
                ("Motion Stream Output scores", motion_classes, motion_scores),
                ("Average Output scores", average_classes, average_scores),
            )
            for cell, (title, classes, scores) in enumerate(score_panels, start=3):
                ax = plt.subplot(gs[cell])
                ax.set_title(title, fontsize=panel_title_size)
                # draws on the subplot just created (the current axes)
                make_bar_chart(classes[frame_index], scores[frame_index])

            ##########################################################
            fig.tight_layout(pad=0, h_pad=0, w_pad=0)
            writer.writeFrame(get_image_from_fig(fig))
        finally:
            # figures are not garbage collected by pyplot until closed
            plt.close(fig)
finally:
    writer.close()

processing frame  0




processing frame  10
processing frame  20
processing frame  30
processing frame  40
processing frame  50
processing frame  60
processing frame  70
processing frame  80
processing frame  90
processing frame  100
processing frame  110
processing frame  120


In [24]:
# Embed the generated video in the notebook as a base64 data URI.
# The file is only read, so open it read-only and close it right away.
with open("demo.mp4", "rb") as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)

# The <source> tag must be terminated before </video> for the markup to be valid.
HTML(data='''<video controls autoplay loop>
    <source type="video/mp4" src="data:video/mp4;base64,{}" />
</video>'''.format(encoded.decode('ascii')))