In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import seaborn as sns
from IPython.display import YouTubeVideo
import matplotlib.pyplot as plt
import plotly.plotly as py

In [2]:
import os
from glob import glob
from tqdm import tqdm

Let's try using only the video-level data.

In [63]:
# Load the video-level features (id, labels, mean_rgb, mean_audio) of every
# training record into parallel Python lists.
# glob() returns matches in arbitrary order; sorting keeps the sample order
# (and therefore the preview below) stable between runs.
video_files = sorted(glob("mys3bucket/yt8pm_100th_shard/v2/video/train*"))

vid_ids = []
labels = []
mean_rgb = []
mean_audio = []

for file in video_files:
    for example in tf.python_io.tf_record_iterator(file):
        tf_example = tf.train.Example.FromString(example)
        features = tf_example.features.feature

        vid_ids.append(features['id'].bytes_list.value[0].decode(encoding='UTF-8'))
        # Copy the repeated fields so the lists hold plain Python lists
        # instead of protobuf containers tied to the parsed message.
        labels.append(list(features['labels'].int64_list.value))
        mean_rgb.append(list(features['mean_rgb'].float_list.value))
        mean_audio.append(list(features['mean_audio'].float_list.value))

print('Number of videos in Sample data set: %s' % str(len(vid_ids)))

# Preview one record; guard the index so a small sample does not raise IndexError.
sample_idx = 13
if len(vid_ids) > sample_idx:
    print('Picking a youtube video id: %s' % vid_ids[sample_idx])
    print('List of label ids for youtube video id %s, are - %s' % (vid_ids[sample_idx], str(labels[sample_idx])))
    print('First 20 rgb feature of a youtube video (',vid_ids[sample_idx],'): are - %s' % str(mean_rgb[sample_idx][:20]))
else:
    print('Fewer than %d videos loaded; skipping the sample preview' % (sample_idx + 1))

Number of videos in Sample data set: 41393
Picking a youtube video id: YwbF
List of label ids for youtube video id YwbF, are - [0, 12]
First 20 rgb feature of a youtube video ( YwbF ): are - [0.7414522171020508, -1.0128370523452759, -0.2103247493505478, -0.6396752595901489, -0.8331801295280457, -0.0706188753247261, 1.1849571466445923, -0.6760722994804382, -0.969638466835022, 0.08704043924808502, 1.6186580657958984, 0.8913909196853638, 0.05266544222831726, 0.7622855305671692, -1.2969056367874146, 1.1235600709915161, 0.08802083134651184, -0.42791053652763367, 1.0219056606292725, -0.6768688559532166]


## Bi-LSTM video classification

In [None]:
# FILIP : not really bi-LSTM without the temporal aspect of frame-level features...

In [2]:
# keras imports
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate
from keras.callbacks import TensorBoard
from keras.models import load_model
from keras.models import Model
import operator
import time 
import gc
import os

Using TensorFlow backend.


In [4]:
## creating training and dev set

In [6]:
def create_train_dev_dataset(video_rgb, video_audio, labels, validation_split=None, seed=None):
    """
    Shuffle the samples and split them into a training and a validation set.

    Args:
        video_rgb: sequence of per-video RGB feature vectors.
        video_audio: sequence of per-video audio feature vectors.
        labels: sequence of per-video label collections; the collections may
            have different lengths.
        validation_split: fraction of the samples held out for validation.
            When None, the module-level `validation_split_ratio` is used.
        seed: optional seed for the shuffle. When None, the global NumPy
            random state is used.

    Returns:
        Tuple (train_video_rgb, train_video_audio, train_labels,
        val_video_rgb, val_video_audio, val_labels). The validation part is
        taken from the end of the shuffled data and spans
        max(1, int(n * validation_split)) samples.

    Raises:
        ValueError: if the three inputs do not have the same length.
    """
    if validation_split is None:
        # Fall back to the notebook-level setting so existing calls keep working.
        validation_split = validation_split_ratio

    video_rgb = np.asarray(video_rgb)
    video_audio = np.asarray(video_audio)
    try:
        labels_arr = np.array(labels)
    except ValueError:
        # Label lists of different lengths cannot form a regular array
        # (recent NumPy raises here); keep them as a 1-D array of objects.
        labels_arr = np.empty(len(labels), dtype=object)
        for position, video_labels in enumerate(labels):
            labels_arr[position] = video_labels

    n_samples = len(labels_arr)
    if len(video_rgb) != n_samples or len(video_audio) != n_samples:
        raise ValueError(
            'video_rgb, video_audio and labels must have the same length, got %d, %d and %d'
            % (len(video_rgb), len(video_audio), n_samples))

    if seed is None:
        shuffle_indices = np.random.permutation(np.arange(n_samples))
    else:
        shuffle_indices = np.random.RandomState(seed).permutation(np.arange(n_samples))

    # Rebinding the names drops the unshuffled copies as soon as they are
    # no longer needed.
    video_rgb = video_rgb[shuffle_indices]
    video_audio = video_audio[shuffle_indices]
    labels_arr = labels_arr[shuffle_indices]

    dev_idx = max(1, int(n_samples * validation_split))

    return (video_rgb[:-dev_idx], video_audio[:-dev_idx], labels_arr[:-dev_idx],
            video_rgb[-dev_idx:], video_audio[-dev_idx:], labels_arr[-dev_idx:])

In [7]:
## Defining Model parameters and creating architecture

In [3]:
###TOMMY: Try specifying an input shape for each of these

# Frame-level input dimensions per modality: (sequence length, embedding size).
max_frame_rgb_sequence_length, frame_rgb_embedding_size = 10, 1024
max_frame_audio_sequence_length, frame_audio_embedding_size = 10, 128

# Layer widths.
number_lstm_units = 100
number_dense_units = 1000

# Regularisation and non-linearity.
rate_drop_lstm = rate_drop_dense = 0.2
activation_function = 'relu'

# Data split and size of the label vector.
validation_split_ratio = 0.2
label_feature_size = 20

def create_model():
    """Build and compile the video-level multi-label classifier.

    The network is a feed-forward stack on the 1024-dimensional video-level
    RGB vector:

        Dense(number_dense_units / 2) -> BatchNorm -> Dropout
        -> Dense(number_dense_units / 2) -> BatchNorm -> Dropout
        -> Dense(number_dense_units) -> BatchNorm -> Dropout
        -> Dense(label_feature_size, sigmoid)

    No (bi-)LSTM layer is built here: that would need frame-level sequences,
    and only video-level features are used.

    Reads the module-level settings `number_dense_units`,
    `activation_function`, `rate_drop_dense` and `label_feature_size`.

    Returns:
        The compiled Keras `Model` with a single input of shape (1024,).
    """
    # Filip: without the frame-level data, we don't actually have a bi-LSTM.
    # TODO (frame-level branch, not implemented): feed inputs of shape
    #   (max_frame_*_sequence_length, frame_*_embedding_size) through
    #   Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm,
    #   recurrent_dropout=rate_drop_lstm)), one per modality, concatenate,
    #   then BatchNorm -> Dropout -> Dense(number_dense_units / 2) and merge
    #   with the video-level branch below.
    # TODO (audio branch, not implemented): Input(shape=(128,)) ->
    #   Dense(number_dense_units / 2), concatenated with `video_rgb_dense`.

    half_dense_units = int(number_dense_units / 2)

    # Video-level RGB input; `input_shape` on the Dense layer is not needed
    # because the Input layer already fixes the shape.
    vid_shape = (1024,)
    video_rgb_input = Input(shape=vid_shape)
    video_rgb_dense = Dense(half_dense_units, activation=activation_function)(video_rgb_input)

    merged_video = BatchNormalization()(video_rgb_dense)
    merged_video = Dropout(rate_drop_dense)(merged_video)
    merged_video_dense = Dense(half_dense_units, activation=activation_function)(merged_video)

    merged = BatchNormalization()(merged_video_dense)
    merged = Dropout(rate_drop_dense)(merged)

    merged = Dense(number_dense_units, activation=activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(rate_drop_dense)(merged)
    # One independent sigmoid per label: a video can carry several labels.
    preds = Dense(label_feature_size, activation='sigmoid')(merged)

    model = Model(inputs=video_rgb_input, outputs=preds)
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    # Multi-hot targets with sigmoid outputs call for a per-label binary
    # cross-entropy; categorical cross-entropy assumes exactly one class per sample.
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

    return model

In [36]:
def get_train_data(video_rgb, video_audio, labels):
    """Return the train/validation split produced by `create_train_dev_dataset`."""
    train_dev_split = create_train_dev_dataset(video_rgb, video_audio, labels)
    return train_dev_split

In [64]:
# Expose the video-level features under the names the training cells use.
# `vid_ids` and `labels` already have the expected names, so they need no
# re-assignment.
video_rgb = mean_rgb
video_audio = mean_audio

# frame_rgb = feat_rgb
# frame_audio = feat_audio

In [65]:
#labels

In [6]:
# Sanity check: round-trip one RGB feature value through TensorFlow.
number = tf.convert_to_tensor_or_indexed_slices(video_rgb[0][1], dtype=tf.float32)
# Run inside a context manager so the session is closed, and the resources it
# owns released, as soon as the value has been fetched.
with tf.Session() as sess:
    number_value = sess.run(number)
number_value

NameError: name 'video_rgb' is not defined

In [4]:
model = create_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               512500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 500)               2000      
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 500)               250500    
_________________________________________________________________
batch_normalization_2

In [5]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import *
import pyspark

In [6]:
# Local Spark configuration with the spark-tensorflow-connector jar, which
# provides the "tfrecords" data source used below.
conf = (
    SparkConf()
    .setMaster('local')
    .setAppName('TFrecords_loading')
    .set("spark.jars", "ecosystem/spark/spark-tensorflow-connector/target/spark-tensorflow-connector_2.11-1.10.0.jar")
)
# getOrCreate() reuses a SparkContext that is already running, so re-running
# this cell does not fail on a second context. Note that `conf` only takes
# effect when a new context is actually created.
sc = SparkContext.getOrCreate(conf=conf)

spark = pyspark.sql.SparkSession(sc)

In [7]:
train_path = "mys3bucket/yt8pm_100th_shard/v2/video/train*.tfrecord"
df = spark.read.format("tfrecords").option("recordType", "Example").load(train_path)


In [11]:
test_path = "mys3bucket/yt8pm_100th_shard/v2/video/test*.tfrecord"
df_test = spark.read.format("tfrecords").option("recordType", "Example").load(test_path)

val_path = "mys3bucket/yt8pm_100th_shard/v2/video/validate*.tfrecord"
df_val = spark.read.format("tfrecords").option("recordType", "Example").load(val_path)


In [8]:
from elephas.spark_model import SparkModel



In [9]:
df.show()

+--------------------+--------------------+--------------------+----+
|          mean_audio|              labels|            mean_rgb|  id|
+--------------------+--------------------+--------------------+----+
|[-1.2556146, 0.17...|             [0, 12]|[0.5198898, 0.301...|eXbF|
|[-0.32460678, -0....|[16, 25, 189, 645...|[-0.5026991, -1.6...|BFbF|
|[-1.7352352, 1.83...|[2, 44, 64, 113, ...|[0.24258906, 0.97...|GqbF|
|[0.7349236, 1.268...|                 [3]|[-0.026906455, -0...|XabF|
|[1.2375641, -0.14...|              [1, 5]|[-0.45482802, -1....|3mbF|
|[0.50689745, 0.02...|                [14]|[0.45552492, 0.64...|S6bF|
|[0.71223485, 1.41...|      [3, 4, 13, 54]|[-0.022711225, 0....|mXbF|
|[-0.662256, -0.97...|           [11, 579]|[0.3059462, 0.947...|7sbF|
|[0.9852263, 0.145...|[2, 76, 227, 474,...|[0.24360302, 0.40...|H1bF|
|[0.10193015, -0.9...| [49, 80, 265, 2063]|[0.60755104, 0.43...|fxbF|
|[0.2003511, -1.10...|     [0, 1, 36, 132]|[-0.4298068, -1.0...|w1bF|
|[-0.5392725, -0.9..

In [10]:
train_df = df.select('mean_rgb', 'labels')

In [11]:
train_df.show()

+--------------------+--------------------+
|            mean_rgb|              labels|
+--------------------+--------------------+
|[0.5198898, 0.301...|             [0, 12]|
|[-0.5026991, -1.6...|[16, 25, 189, 645...|
|[0.24258906, 0.97...|[2, 44, 64, 113, ...|
|[-0.026906455, -0...|                 [3]|
|[-0.45482802, -1....|              [1, 5]|
|[0.45552492, 0.64...|                [14]|
|[-0.022711225, 0....|      [3, 4, 13, 54]|
|[0.3059462, 0.947...|           [11, 579]|
|[0.24360302, 0.40...|[2, 76, 227, 474,...|
|[0.60755104, 0.43...| [49, 80, 265, 2063]|
|[-0.4298068, -1.0...|     [0, 1, 36, 132]|
|[0.17366871, 0.95...|       [39, 50, 503]|
|[0.089333594, 0.0...|           [3, 2195]|
|[0.7414522, -1.01...|             [0, 12]|
|[-0.25524384, 0.0...|      [61, 227, 474]|
|[-0.1797692, -1.2...|           [5, 3226]|
|[-0.19473058, 0.6...|   [21, 23, 24, 758]|
|[0.68875366, 0.31...|      [39, 156, 202]|
|[-0.0593875, -0.7...|       [5, 644, 769]|
|[-0.9477386, 0.26...|[21, 23, 2

In [12]:
train_rdd = train_df.rdd

In [13]:
train_rdd.take(1)[0][1]

[0, 12]

In [14]:
train_rdd = train_rdd.map(lambda x: (x[0], x[1]))

In [15]:
train_rdd.take(1)

[([0.519889771938324,
   0.30175963044166565,
   -0.51358562707901,
   0.41406428813934326,
   -0.08573680371046066,
   -0.9000590443611145,
   1.1310861110687256,
   -0.7618379592895508,
   -0.5756487250328064,
   -0.6951144933700562,
   0.836911678314209,
   1.244982123374939,
   -0.7594509124755859,
   0.55865079164505,
   -1.3553019762039185,
   -0.7772969007492065,
   0.3180142641067505,
   0.7014186382293701,
   0.6312850713729858,
   0.0931776612997055,
   -1.0130456686019897,
   0.28573235869407654,
   0.9530810713768005,
   -0.10574248433113098,
   0.2004808783531189,
   0.6593612432479858,
   -0.2724944055080414,
   0.21014270186424255,
   0.6034362316131592,
   0.5530810356140137,
   0.054757654666900635,
   0.20218589901924133,
   0.975360095500946,
   0.4282728433609009,
   -0.6646512746810913,
   0.38973918557167053,
   0.24117425084114075,
   0.104885533452034,
   -0.035267919301986694,
   0.6658403277397156,
   0.47544535994529724,
   0.6839136481285095,
   -0.479144006

In [16]:
def convert_labels(labels, num_classes=20):
    """Encode a collection of label ids as a multi-hot vector.

    Args:
        labels: iterable of integer label ids for one video.
        num_classes: length of the output vector; only ids in
            [0, num_classes) are encoded, all other ids are ignored.

    Returns:
        1-D float array of length `num_classes` with 1.0 at every encoded id.
    """
    one_hot = np.zeros(num_classes)
    label_ids = np.asarray(labels, dtype=np.int64).reshape(-1)
    # Drop ids outside the encoded range in one vectorised step; negative ids
    # must be removed too, otherwise they would index from the end.
    in_range = label_ids[(label_ids >= 0) & (label_ids < num_classes)]
    one_hot[in_range] = 1
    return one_hot

In [17]:
# train_rdd = train_rdd.map(lambda x: np.array([]))

In [18]:
train_rdd = train_rdd.map(lambda x: (np.array(x[0]), convert_labels(x[1])))

In [19]:
train_rdd.take(2)

[(array([ 0.51988977,  0.30175963, -0.51358563, ...,  0.44089007,
          0.39803699, -0.48050806]),
  array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0.])),
 (array([-0.50269908, -1.64767921,  0.25519568, ..., -0.00722509,
          0.06011974,  0.2744202 ]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0.]))]

In [73]:
train_rdd.mapPartitions(lambda x: x).collect()

KeyboardInterrupt: 

In [20]:
spark_model = SparkModel(model, frequency='epoch', mode='synchronous')

In [21]:
spark_model.fit(train_rdd, epochs=10, batch_size=32, verbose=0)

>>> Fit model


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 10 in stage 6.0 failed 1 times, most recent failure: Lost task 10.0 in stage 6.0 (TID 56, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/elephas/worker.py", line 45, in train
    self.model.fit(x_train, y_train, **self.train_config)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/engine/training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1439, in __call__
    run_metadata_ptr)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[32,500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node batch_normalization_1_10/batchnorm/mul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node metrics_21/acc/Mean}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:467)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/elephas/worker.py", line 45, in train
    self.model.fit(x_train, y_train, **self.train_config)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/engine/training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1439, in __call__
    run_metadata_ptr)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[32,500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node batch_normalization_1_10/batchnorm/mul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node metrics_21/acc/Mean}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [68]:
shuffle_indices = np.random.permutation(np.arange(len(labels)))
vid_sample = np.array(video_rgb)[shuffle_indices]

In [69]:
train_video_rgb, train_video_audio, train_labels, val_video_rgb, val_video_audio, val_labels =\
get_train_data(video_rgb, video_audio, labels)

In [70]:
#Setting callbacks
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (number_lstm_units, number_dense_units, rate_drop_lstm, rate_drop_dense)
checkpoint_dir = 'checkpoints/' + str(int(time.time())) + '/'
# Create the run directory up front: the checkpoint file is written inside
# it, and saving fails if the directory does not exist.
os.makedirs(checkpoint_dir, exist_ok=True)
bst_model_path = checkpoint_dir + STAMP + '.h5'
# Stop once val_loss has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
# Keep only the best model seen so far (full model, not just the weights).
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=False)
tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

In [71]:
# Need to multi-hot encode the labels into 20 columns: the top 19 categories plus a final dummy column
all_cats = train_labels.copy()

In [72]:
all_cats

array([[3, 4, 13], [106], [3, 8, 89], ..., [5, 2411], [315, 505],
       [3, 6, 13]], dtype=object)

In [73]:
# Flatten the per-video label collections into one flat list of label ids.
all_labels = [label for video_labels in all_cats for label in video_labels]

In [74]:
# np.unique returns the distinct label ids in ascending order together with
# the number of times each one occurs.
# NOTE: this rebinds `labels`, which earlier in the notebook holds the raw
# per-video label lists; cells that need the raw lists must run before this one.
results = np.unique(all_labels, return_counts=True)
labels = results[0]
counts = results[1]

In [77]:
counts[:20],labels[:20]

(array([6797, 4675, 3571, 3141, 2433, 2062, 1688, 1643, 1518, 1322, 1237,
        1189, 1175, 1081,  950,  888,  856,  831,  732,  713]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]))

In [79]:
train_labels[0]

[3, 4, 13]

In [88]:
x = np.array([1,2,3])
np.where(x==1)

(array([0]),)

In [91]:
arr = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])

In [111]:
arr

0.0

In [123]:
# Build a multi-hot target matrix: each input video gets a vector of 20 zeros and ones
# indicating which labels are present and which are not.

def create_y(raw_labels=None, label_size=20, labels_vocab=None):
    """Encode per-video label collections as multi-hot target vectors.

    The first `label_size - 1` entries of `labels_vocab` each get their own
    column; the last column is a dummy that is set to 1 when a video carries
    none of those labels.

    Args:
        raw_labels: iterable of per-video label collections. When None, the
            module-level `train_labels` is used.
        label_size: length of each target vector (kept labels + 1 dummy).
        labels_vocab: ordered sequence of known label ids (list or array).
            When None, the module-level `labels` is used.

    Returns:
        List with one 1-D float array of length `label_size` per video.

    Raises:
        ValueError: if `label_size` is smaller than 1.
    """
    if label_size < 1:
        raise ValueError('label_size must be at least 1, got %r' % (label_size,))
    # Resolve the defaults at call time so defining the function does not
    # depend on the notebook state at definition time.
    if raw_labels is None:
        raw_labels = train_labels
    if labels_vocab is None:
        labels_vocab = labels

    # np.asarray lets a plain list work as vocabulary too.
    kept_labels = np.asarray(labels_vocab)[:label_size - 1]  # last column is the dummy

    # Map each kept label to its column(s) once instead of scanning the
    # vocabulary for every label of every video.
    columns_of = {}
    for column, label in enumerate(kept_labels.tolist()):
        columns_of.setdefault(label, []).append(column)

    output = []
    for set_of_labels in raw_labels:
        sequence = np.zeros(label_size)
        for this_label in set_of_labels:
            columns = columns_of.get(this_label)
            if columns is not None:
                sequence[columns] = 1
        # None of the kept labels is present: flag the dummy column.
        if sequence.sum() == 0:
            sequence[-1] = 1
        output.append(sequence)
    return output

In [124]:
seq = create_y(train_labels,20,labels)

In [125]:
train_label_seq = np.stack(seq)

In [126]:
train_label_seq.shape

(33115, 20)

In [127]:
val_labels_seq = np.stack(create_y(val_labels,20,labels))

In [128]:
val_labels_seq.shape

(8278, 20)

In [134]:
# The model built by create_model() has a single input (the video-level RGB
# vector), so only the RGB features are fed; passing the audio array as a
# second input does not match the model's input signature.
model.fit(train_video_rgb, train_label_seq,
              validation_data=(val_video_rgb, val_labels_seq),
              epochs=5, batch_size=64, shuffle=True, callbacks=[early_stopping, tensorboard]) # got rid of checkpoints

Train on 33115 samples, validate on 8278 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa3e5be2d68>