In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import seaborn as sns
from IPython.display import YouTubeVideo
import matplotlib.pyplot as plt
import plotly.plotly as py
import multiprocessing as mp # if we want to parallelize i/o

# keras imports
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate
from keras.callbacks import TensorBoard
from keras.models import load_model
from keras.models import Model
import operator
import time 
import gc
import os

import os
from glob import glob
from tqdm import tqdm
import sys
import timeit


from pyspark import SparkConf, SparkContext
from pyspark.sql.types import *
import pyspark
import numpy as np
from elephas.spark_model import SparkModel

Using TensorFlow backend.




In [2]:
# Spark configuration: name the application and register the
# spark-tensorflow-connector jar via `spark.jars` (presumably what makes the
# "tfrecords" data source available to the reader -- confirm against the
# connector's documentation).
conf = (
    SparkConf()
    .setAppName('Youtube-8M')
    .set(
        "spark.jars",
        "ecosystem/spark/spark-tensorflow-connector/target/spark-tensorflow-connector_2.11-1.10.0.jar",
    )
)

# One SparkContext for the notebook, wrapped in a SparkSession for the DataFrame API.
sc = SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

In [3]:
# List the converted TFRecord files available under the bucket directory.
!ls mys3bucket/converted_records_for_spark/

train0093.tfrecord-converted.tfrecord  train0208.tfrecord-converted.tfrecord
train0111.tfrecord-converted.tfrecord  train0434.tfrecord-converted.tfrecord


In [151]:
# Load every file in the converted-records directory into one Spark DataFrame,
# reading each record with the "tfrecords" source as a SequenceExample.
df = spark.read.format("tfrecords").option("recordType", "SequenceExample").load('mys3bucket/converted_records_for_spark/*')

In [152]:
# Total number of records loaded across all matched files.
df.count()

4029

In [153]:
# Preview the first rows to check the column names and their order.
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|          mean_audio|              labels|            mean_rgb|           frame_rgb|         frame_audio|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|[-1.185913, -0.84...|[0.0, 0.0, 1.0, 0...|[0.53471446, 1.00...|[WrappedArray(238...|[WrappedArray(86....|
|[0.9230174, -0.34...|[0.0, 0.0, 0.0, 0...|[-0.10382305, -1....|[WrappedArray(51....|[WrappedArray(192...|
|[-0.32460678, -0....|[0.0, 0.0, 0.0, 0...|[-0.5026991, -1.6...|[WrappedArray(0.0...|[WrappedArray(173...|
|[0.9312072, -0.73...|[0.0, 0.0, 0.0, 0...|[-0.6032936, -0.4...|[WrappedArray(0.0...|[WrappedArray(163...|
|[0.11735498, 1.13...|[0.0, 0.0, 0.0, 0...|[0.61926347, 0.20...|[WrappedArray(121...|[WrappedArray(121...|
|[-0.83108944, -1....|[0.0, 0.0, 0.0, 0...|[0.48174715, 0.67...|[WrappedArray(73....|[WrappedArray(75....|
|[-0.6435339, -0.6...|[1.0, 1.0, 0.0,

In [154]:
# Expose the DataFrame as an RDD of rows so each record can be reshaped with a plain Python function.
train_rdd = df.rdd

In [155]:
# Reorder each row into ((inputs...), labels) so the tuple order matches the
# input order of the multi-input model:
#   (frame_rgb, frame_audio, mean_rgb, mean_audio), labels
# Fields are read by column name instead of by position so the mapping does
# not silently break if the reader returns the columns in a different order.

# NOTE: ELEPHAS DOES NOT SUPPORT MULTI-INPUT OR THE FORMATTING IS NOT CORRECT.
train_rdd_new = train_rdd.map(
    lambda row: (
        (
            np.array(row['frame_rgb']),
            np.array(row['frame_audio']),
            np.array(row['mean_rgb']),
            np.array(row['mean_audio']),
        ),
        np.array(row['labels']),
    )
)

In [156]:
# this works - only frame-level RGB
# Each record becomes a (features, labels) pair of numpy arrays. Columns are
# addressed by name so the pairing does not depend on the column order.
train_rdd_simple = train_rdd.map(
    lambda row: (np.array(row['frame_rgb']), np.array(row['labels']))
)

In [157]:
# Pull one transformed record back to the driver to inspect its structure.
sample = train_rdd_new.take(1)

In [158]:
# sample[0] is the (inputs, labels) pair; its first element is the tuple of input arrays.
len(sample[0][0])

4

In [159]:
###########
# MODEL HYPER-PARAMETERS (module-level; read by the model-building functions)
###########

# Size of the sigmoid output layer, i.e. the number of label classes predicted.
# NOTE(review): an earlier comment here said "1000 class problem" but the value
# is 10 -- confirm which is intended and that it matches the labels array length.
label_feature_size = 10

# Frame-level RGB input: number of frames per video and features per frame.
# NOTE(review): the RDD mapping cells do not visibly truncate/pad sequences to
# this length -- confirm the data already has this shape.
max_frame_rgb_sequence_length = 10
frame_rgb_embedding_size = 1024

# Frame-level audio input: number of audio steps per video and features per step.
max_frame_audio_sequence_length = 10
frame_audio_embedding_size = 128

# Width of the last hidden dense layer; earlier dense layers use half of it.
number_dense_units = 1000
# Units per direction in each bidirectional LSTM.
number_lstm_units = 100
# Used for both `dropout` and `recurrent_dropout` of the LSTM layers.
rate_drop_lstm = 0.2
# Dropout rate applied after each batch-normalisation step.
rate_drop_dense = 0.2
# Activation for all hidden dense layers.
activation_function='relu'
# 0 means no data is held out for validation; not referenced in the visible cells.
validation_split_ratio = 0 # to use all

def create_model_simple():
    """Build and compile the single-input (frame-level RGB only) bi-LSTM classifier.

    Architecture:
        frame RGB sequence -> Bidirectional LSTM
        -> BatchNorm -> Dropout -> Dense(number_dense_units // 2)
        -> BatchNorm -> Dropout -> Dense(number_dense_units)
        -> BatchNorm -> Dropout -> Dense(label_feature_size, sigmoid)

    Reads the module-level hyper-parameters (`number_lstm_units`,
    `rate_drop_lstm`, `rate_drop_dense`, `number_dense_units`,
    `activation_function`, `label_feature_size`,
    `max_frame_rgb_sequence_length`, `frame_rgb_embedding_size`).

    Side effects:
        Prints the model summary to stdout.

    Returns:
        The compiled Keras `Model`. Nothing is written to disk here.
    """
    # Bidirectional LSTM that summarises the RGB frame sequence into one vector.
    rgb_bilstm = Bidirectional(
        LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
    )

    # Frame-level RGB input: (time steps, features per frame).
    frame_rgb_sequence_input = Input(
        shape=(max_frame_rgb_sequence_length, frame_rgb_embedding_size),
        dtype='float32',
    )
    frame_features = rgb_bilstm(frame_rgb_sequence_input)

    # Normalise + regularise the LSTM output, then project to half the dense width.
    frame_features = BatchNormalization()(frame_features)
    frame_features = Dropout(rate_drop_dense)(frame_features)
    frame_dense = Dense(number_dense_units // 2, activation=activation_function)(frame_features)

    # Classifier head.
    merged = BatchNormalization()(frame_dense)
    merged = Dropout(rate_drop_dense)(merged)

    merged = Dense(number_dense_units, activation=activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(rate_drop_dense)(merged)
    # Independent sigmoid per class: a video may carry several labels at once.
    preds = Dense(label_feature_size, activation='sigmoid')(merged)

    model = Model(inputs=[frame_rgb_sequence_input], outputs=preds)

    # summary() prints by itself and returns None, so it must not be wrapped in print().
    model.summary()

    # Sigmoid outputs over multi-hot labels call for per-class binary
    # cross-entropy; categorical cross-entropy assumes exactly one true class.
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

    return model


def create_model():
    """Build and compile the four-input bi-LSTM + dense classifier.

    Inputs, in the order expected by the returned model:
        1. frame-level RGB sequence   (max_frame_rgb_sequence_length, frame_rgb_embedding_size)
        2. frame-level audio sequence (max_frame_audio_sequence_length, frame_audio_embedding_size)
        3. video-level RGB vector     (1024,)
        4. video-level audio vector   (128,)

    Each frame-level sequence goes through its own bidirectional LSTM and each
    video-level vector through its own dense layer. The two frame branches
    are concatenated, as are the two video branches; each merged branch gets
    BatchNorm -> Dropout -> Dense(number_dense_units // 2). Both results are
    concatenated again and fed to the classifier head ending in a sigmoid
    layer of size `label_feature_size`.

    Reads the module-level hyper-parameters (`number_lstm_units`,
    `rate_drop_lstm`, `rate_drop_dense`, `number_dense_units`,
    `activation_function`, `label_feature_size` and the sequence/embedding sizes).

    Side effects:
        Prints the model summary to stdout.

    Returns:
        The compiled Keras `Model`. Nothing is written to disk here.
    """
    half_dense_units = number_dense_units // 2

    # One bidirectional LSTM per frame-level modality (RGB and audio).
    lstm_layer_1 = Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))
    lstm_layer_2 = Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))

    # Frame-level inputs and their sequence encodings.
    frame_rgb_sequence_input = Input(
        shape=(max_frame_rgb_sequence_length, frame_rgb_embedding_size), dtype='float32')
    frame_audio_sequence_input = Input(
        shape=(max_frame_audio_sequence_length, frame_audio_embedding_size), dtype='float32')
    frame_x1 = lstm_layer_1(frame_rgb_sequence_input)
    frame_x2 = lstm_layer_2(frame_audio_sequence_input)

    # Video-level inputs. The Dense layers are applied to Input tensors, so
    # they need no `input_shape` argument of their own.
    vid_shape = (1024,)
    video_rgb_input = Input(shape=vid_shape)
    video_rgb_dense = Dense(half_dense_units, activation=activation_function)(video_rgb_input)

    aud_shape = (128,)
    video_audio_input = Input(shape=aud_shape)
    video_audio_dense = Dense(half_dense_units, activation=activation_function)(video_audio_input)

    # Merge the two frame-level encodings, then normalise, regularise and project.
    merged_frame = concatenate([frame_x1, frame_x2])
    merged_frame = BatchNormalization()(merged_frame)
    merged_frame = Dropout(rate_drop_dense)(merged_frame)
    merged_frame_dense = Dense(half_dense_units, activation=activation_function)(merged_frame)

    # Merge the two video-level projections. Batch-norm must be applied to the
    # concatenated tensor; applying it to `video_rgb_dense` alone would drop
    # the audio branch and leave `video_audio_input` disconnected from the output.
    merged_video = concatenate([video_rgb_dense, video_audio_dense])
    merged_video = BatchNormalization()(merged_video)
    merged_video = Dropout(rate_drop_dense)(merged_video)
    merged_video_dense = Dense(half_dense_units, activation=activation_function)(merged_video)

    # Combine frame-level and video-level representations for the classifier head.
    merged = concatenate([merged_frame_dense, merged_video_dense])
    merged = BatchNormalization()(merged)
    merged = Dropout(rate_drop_dense)(merged)

    merged = Dense(number_dense_units, activation=activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(rate_drop_dense)(merged)
    # Independent sigmoid per class: a video may carry several labels at once.
    preds = Dense(label_feature_size, activation='sigmoid')(merged)

    model = Model(
        inputs=[frame_rgb_sequence_input, frame_audio_sequence_input, video_rgb_input, video_audio_input],
        outputs=preds,
    )

    # summary() prints by itself and returns None, so it must not be wrapped in print().
    model.summary()

    # Sigmoid outputs over multi-hot labels call for per-class binary
    # cross-entropy; categorical cross-entropy assumes exactly one true class.
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

    return model

In [160]:
# Build the frame-RGB-only model; swap to the commented call below to build
# the four-input model instead.
keras_model= create_model_simple()
#keras_model= create_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_28 (InputLayer)        (None, 10, 1024)          0         
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 200)               900000    
_________________________________________________________________
batch_normalization_34 (Batc (None, 200)               800       
_________________________________________________________________
dropout_34 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_46 (Dense)             (None, 500)               100500    
_________________________________________________________________
batch_normalization_35 (Batc (None, 500)               2000      
_________________________________________________________________
dropout_35 (Dropout)         (None, 500)               0         
__________

In [161]:
# Wrap the compiled Keras model in an Elephas SparkModel, configured with
# frequency='batch' and mode='synchronous'.
spark_model = SparkModel(keras_model, frequency='batch', mode='synchronous')

In [162]:

# Train on the frame-RGB-only RDD: 10 epochs, batch size 2, no progress output.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# no completed training result is captured in this notebook.
history = spark_model.fit(train_rdd_simple, epochs=10, batch_size=2, verbose=0)

>>> Fit model


KeyboardInterrupt: 