### Set up the environment

In [None]:
import os
import sagemaker
import boto3

from sagemaker import get_execution_role

# SageMaker session and the IAM role the notebook instance runs under.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Local working directories; both are removed again by the cleanup cell.
staging_dir = 'stage'
export_dir = 'export'

# S3 layout: the source videos are read from s3://<bucket>/<prefix>/<dataset>.
# 'YOUR_BUCKET' is a placeholder and must be replaced before running.
bucket = 'YOUR_BUCKET'
prefix = 'sagemaker/3d-densenet'
dataset = 'kth'

### Prepare the data

In [None]:
import cv2
import errno
import random
import tensorflow as tf

from io import BytesIO
from PIL import Image

print('Using Tensorflow {0}'.format(tf.__version__))

s3 = boto3.client('s3')

# --- Clip sampling ---
# Number of frames in one video clip.
num_frames_per_clip = 16
# Step, in frames, used by get_clips when placing each additional clip.
skip_frames = 10
# Fraction of each class's videos written to the train set (3/4);
# the remainder (1/4) goes to the eval set.
train_eval_split_factor = 0.75

# --- Frame encoding ---
# Every frame is resized to width x height before being JPEG-encoded.
width, height = 128, 128
# JPEG quality passed to PIL when saving a frame.
quality = 100
# Number of color channels; 1 makes the frames grayscale.
channel = 3

def get_clips(image_list, clip_length=None, stride=None):
    """Cut a sequence of frames into fixed-length, overlapping clips.

    Args:
        image_list: Sequence of frames (any sliceable sequence).
        clip_length: Number of consecutive frames per clip. Defaults to the
            module-level ``num_frames_per_clip``.
        stride: Step, in frames, between sampled clips. Defaults to the
            module-level ``skip_frames``.

    Returns:
        A list of clips, each a slice of ``image_list`` holding exactly
        ``clip_length`` frames. Empty when ``image_list`` is shorter than
        one clip.

    Raises:
        ValueError: If ``stride`` is not positive.
    """
    # Resolve the defaults at call time so the notebook-level settings are
    # still honoured when they are changed between calls.
    if clip_length is None:
        clip_length = num_frames_per_clip
    if stride is None:
        stride = skip_frames
    if stride <= 0:
        raise ValueError(
            'stride must be a positive number of frames, got {0}'.format(stride))

    images_len = len(image_list)
    if images_len < clip_length:
        return []

    # The first clip always starts at frame 0.
    video_clips = [image_list[:clip_length]]

    # Only count windows that fit entirely inside the sequence, so every
    # clip has exactly clip_length frames.
    num_of_extra_clip = (images_len - clip_length) // stride
    for i in range(1, num_of_extra_clip + 1):
        # NOTE(review): the "- 1" makes the second clip start at stride - 1
        # rather than stride (later clips are then stride apart). Kept as-is
        # so existing datasets stay reproducible -- confirm whether intended.
        start = i * stride - 1
        video_clips.append(image_list[start:start + clip_length])

    return video_clips

def download_videos(origin_videos_location):
    """Download every .avi/.mp4 object under an S3 prefix into the staging dir.

    Objects are read from the module-level ``bucket`` and written to
    ``<staging_dir>/<object key>``, recreating the key's directory structure
    locally.

    Args:
        origin_videos_location: S3 key prefix to list.
    """
    # list_objects_v2 returns the listing in pages; iterate all of them so
    # that large datasets are not silently truncated to the first page.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix=origin_videos_location)

    downloaded = 0
    for page in pages:
        # 'Contents' is missing from the response when no key matches.
        for obj in page.get('Contents', []):
            path, filename = os.path.split(obj['Key'])
            if not filename.endswith(('.avi', '.mp4')):
                continue
            # Create the target directory and all intermediate directories,
            # tolerating the case where they already exist.
            try:
                os.makedirs('{0}/{1}'.format(staging_dir, path))
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            s3.download_file(bucket, obj['Key'], '{0}/{1}'.format(staging_dir, obj['Key']))
            downloaded += 1

    if downloaded == 0:
        # Make the empty case visible instead of failing later on a missing
        # local directory.
        print('No .avi/.mp4 files found under s3://{0}/{1}'.format(
            bucket, origin_videos_location))
        
def process_dataset(train_writer, eval_writer, origin_videos_location):
    """Download the dataset and write its clips to train/eval record writers.

    Each sub-directory of the downloaded location is treated as one class.
    Classes are numbered from 0 in sorted directory-name order, and within a
    class the first ``train_eval_split_factor`` share of the (sorted) files
    goes to ``train_writer`` and the rest to ``eval_writer``.

    Args:
        train_writer: Writer receiving the serialized train examples.
        eval_writer: Writer receiving the serialized eval examples.
        origin_videos_location: S3 key prefix of the dataset; also the
            relative path of the local copy below the staging directory.
    """
    download_videos(origin_videos_location)
    data_dir = os.path.join(os.getcwd(), staging_dir, origin_videos_location)
    label = -1
    # os.listdir returns entries in arbitrary order; sort so that the
    # class -> label mapping is the same on every run and machine.
    for class_dir in sorted(os.listdir(data_dir)):
        class_path = os.path.join(data_dir, class_dir)
        if not os.path.isdir(class_path):
            continue
        # Set the label value for this class, starting from 0.
        label += 1
        print("Processing class: " + str(label) + ", name: " + os.path.basename(class_path))

        # Sorted for the same reason: a reproducible train/eval split.
        video_filenames = sorted(os.listdir(class_path))
        split_index = int(train_eval_split_factor * len(video_filenames))

        for video_filename in video_filenames[:split_index]:
            process_video(train_writer, class_path, video_filename, label)
        for video_filename in video_filenames[split_index:]:
            process_video(eval_writer, class_path, video_filename, label)

def process_video(writer, class_path, video_filename, label):
    """Write every clip of one video file to ``writer`` as a serialized example.

    Files whose name does not end in .avi or .mp4 are ignored.

    Args:
        writer: Object with a ``write(bytes)`` method.
        class_path: Directory containing the video.
        video_filename: File name of the video inside ``class_path``.
        label: Integer class label stored with each clip.
    """
    video_filename_path = os.path.join(class_path, video_filename)
    if not video_filename_path.endswith(('.avi', '.mp4')):
        return

    # One record per clip.
    for clip in _convert_video_to_clips(video_filename_path):
        serialized = create_tf_example(raw=clip, label=label).SerializeToString()
        writer.write(serialized)
            
def _convert_video_to_clips(video_path):
    """Decode a video file into clips of JPEG-encoded frames.

    Every frame is resized to ``width`` x ``height`` (converted to grayscale
    when ``channel == 1``), encoded as JPEG with the configured ``quality``,
    and the resulting byte strings are grouped into clips by ``get_clips``.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A list of clips; each clip is a list of JPEG byte strings.
    """
    video_images_list = []
    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ok, frame = cap.read()
            # read() reports failure at the end of the stream (or on a
            # decode error); stop instead of relying on an exception.
            if not ok or frame is None:
                break
            # NOTE(review): OpenCV delivers frames in BGR channel order while
            # PIL interprets the array as RGB; harmless for grayscale footage
            # -- confirm before using color video.
            pil_image = Image.fromarray(frame).resize((width, height), Image.NEAREST)
            if channel == 1:
                pil_image = pil_image.convert('L')
            # Encode the image to JPEG
            with BytesIO() as buffer:
                pil_image.save(buffer, format="JPEG", quality=quality)
                video_images_list.append(buffer.getvalue())
    finally:
        # Always free the capture handle, even if encoding fails.
        cap.release()

    # Group the encoded frames into fixed-length clips.
    return get_clips(image_list=video_images_list)

def _bytelist_feature(value):
    """Return a tf.train.Feature holding ``value`` as its bytes list."""
    bytes_list = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=bytes_list)

def _floatlist_feature(value):
    """Return a tf.train.Feature holding ``value`` as its float list."""
    # A FloatList must go into the ``float_list`` field; passing it as
    # ``bytes_list`` is rejected by the Feature message.
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _bytes_feature(value):
    """Return a tf.train.Feature holding ``value`` as its bytes list."""
    # NOTE(review): ``value`` is passed through unwrapped, exactly like
    # _bytelist_feature, so it must already be an iterable of byte strings.
    bytes_list = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Return a tf.train.Feature holding the single integer ``value``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def create_tf_example(raw, label):
    """Build the tf.train.Example for one clip.

    Args:
        raw: The clip's frames, stored under 'clip/raw' as a bytes list.
        label: Integer class label, stored under 'clip/label'.

    Returns:
        A tf.train.Example that also records the configured frame width,
        height and channel count.
    """
    feature = {
        'clip/width': _int64_feature(width),
        'clip/height': _int64_feature(height),
        'clip/channel': _int64_feature(channel),
        'clip/raw': _bytelist_feature(raw),
        'clip/label': _int64_feature(label),
    }
    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features)

def get_total_video_clip_number(data_path):
    """Return the number of records stored in the TFRecord file ``data_path``."""
    return sum(1 for _ in tf.python_io.tf_record_iterator(data_path))

In [None]:
data_path = '{0}/{1}'.format(staging_dir, 'tfrecords')
train_data_path = '{0}/{1}'.format(data_path, 'train.tfrecord') # Path to output train TFRecord
eval_data_path  = '{0}/{1}'.format(data_path, 'eval.tfrecord')  # Path to output eval TFRecord

# Tolerate an existing directory so the cell can be re-run.
if not os.path.isdir(data_path):
    os.makedirs(data_path)

# Write the dataset. The writers are closed even if processing fails, so
# no file handle is leaked and whatever was written is flushed to disk.
train_writer = tf.python_io.TFRecordWriter(train_data_path)
try:
    eval_writer = tf.python_io.TFRecordWriter(eval_data_path)
    try:
        process_dataset(
            train_writer=train_writer,
            eval_writer=eval_writer,
            origin_videos_location='{0}/{1}'.format(prefix, dataset))
    finally:
        eval_writer.close()
finally:
    train_writer.close()

# Count the records that were written to each file
train_total_video_clip = get_total_video_clip_number(train_data_path)
print("Total clips in train dataset: " + str(train_total_video_clip))

eval_total_video_clip = get_total_video_clip_number(eval_data_path)
print("Total clips in eval dataset: " + str(eval_total_video_clip))

### Upload the dataset

In [None]:
# Both TFRecord files go under the same S3 key prefix; they are told apart
# by their file names (train.tfrecord / eval.tfrecord).
data_key_prefix = '{0}/train_data'.format(prefix)

train_input = sagemaker_session.upload_data(
    path=train_data_path,
    bucket=bucket,
    key_prefix=data_key_prefix)
eval_input = sagemaker_session.upload_data(
    path=eval_data_path,
    bucket=bucket,
    key_prefix=data_key_prefix)

### Initialize script variables

In [None]:
# S3 locations for the uploaded training code and the training output.
custom_code_upload_location = 's3://{0}/{1}/train_code'.format(bucket, prefix)
model_artifacts_location = 's3://{0}/{1}/train_output'.format(bucket, prefix)

# Instance type used for the training job.
train_instance_type = 'ml.m5.large'

### Submit script for training

In [None]:
from sagemaker.tensorflow import TensorFlow

hparams = {
    # --- Dataset / input shape ---
    'num_classes': 3,  # Number of classes in the dataset
    'num_frames_per_clip': num_frames_per_clip,  # Length of a video clip, in frames
    'width': width,
    'height': height,
    'channel': channel,
    'train_total_video_clip': train_total_video_clip,  # Record count of the train TFRecord
    'eval_total_video_clip': eval_total_video_clip,  # Record count of the eval TFRecord

    # --- Optimization ---
    'batch_size': 10,
    'initial_learning_rate': 0.1,
    'decay_step': 1000,
    'lr_decay_factor': 0.1,  # Learning rate decays by this factor every decay_step
    'weight_decay': 1e-4,  # Weight decay for the L2 loss
    'keep_prob': 0.9,  # Keep probability for dropout

    # --- Architecture ---
    'model_type': 'DenseNet3D',
    'growth_rate': 12,  # Growth rate for every layer [12, 24, 40]
    'network_depth': 20,  # Depth of the whole network [20, 40, 250]
    'total_blocks': 3,  # Total blocks of stacked layers
    'reduction': 0.5,  # Reduction rate at the transition layers
    'bc_mode': True,
}

action_estimator = TensorFlow(
    entry_point='densenet_3d_estimator.py',
    source_dir='estimator_source',
    role=role,
    framework_version='1.11.0',
    train_instance_type=train_instance_type,
    train_instance_count=1,
    train_volume_size=30,
    code_location=custom_code_upload_location,
    output_path=model_artifacts_location,
    training_steps=1,
    evaluation_steps=1,
    hyperparameters=hparams)

In [None]:
%%time

# S3 prefix that the train and eval TFRecord files were uploaded to.
train_data_location = 's3://{0}/{1}/train_data'.format(bucket, prefix)

# Start the training job on that data, without a local TensorBoard.
action_estimator.fit(
    train_data_location,
    run_tensorboard_locally=False,
)

### Create endpoint

In [None]:
# Deploy the trained model to an endpoint backed by one ml.m4.xlarge instance.
action_predictor = action_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

### Invoke endpoint

In [None]:
import numpy as np

# Cut the test video into clips of JPEG-encoded frames, as for training.
raw_video_clips = _convert_video_to_clips('test_videos/person02_running_d1_uncomp.avi')

for raw_video_clip in raw_video_clips:
    # Decode each JPEG back into an image array (flag 1 = load as color).
    video_clip = [
        cv2.imdecode(np.frombuffer(raw_image, dtype=np.uint8), 1)
        for raw_image in raw_video_clip
    ]
    # Send one clip per request and show the endpoint's response.
    output_dict = action_predictor.predict({'video_clips': [video_clip]})
    print(output_dict)

### Cleanup

In [None]:
import shutil

# Remove the local working directories if they exist.
for scratch_dir in (export_dir, staging_dir):
    if os.path.exists(os.path.join(os.getcwd(), scratch_dir)):
        shutil.rmtree(scratch_dir)