In [1]:
import numpy as np
import keras
import re
import os

import pyspark
from pyspark.sql.session import SparkSession

from keras.models import Sequential
from keras import layers
from keras.optimizers import SGD

import horovod.keras as hvd

Using TensorFlow backend.


In [2]:
# Notebook-wide constants.
timesteps = 10    # NOTE(review): not referenced in the visible cells — confirm it is still needed
dim = 3           # NOTE(review): not referenced in the visible cells — confirm it is still needed
samples = 300     # NOTE(review): not referenced in the visible cells — confirm it is still needed
batch_size = 128  # mini-batch size passed to model.fit
n_epochs = 10     # number of epochs passed to model.fit
n_classes = 9     # NOTE(review): the model is built with get_model(10), not this value — confirm which is intended

## Setting up Spark

In [3]:
def spark_session_setup():
    """
    Create (or reuse) the SparkContext for this notebook.

    The context is configured with 50G of executor memory and 50G of driver
    memory, and its log level is set to ERROR.  ``SparkContext.getOrCreate``
    is used instead of the constructor so that re-running this cell in a
    live kernel returns the context that is already running rather than
    failing because a second context cannot be started in the same process.

    Returns:
        pyspark.SparkContext: the active Spark context.

    >>> sc = spark_session_setup()
    """

    # logConf makes Spark log the effective configuration when the context starts
    conf = pyspark.SparkConf()
    conf.setAppName('word_count')
    conf.set('spark.logConf', 'true')
    conf.set('spark.executor.memory', '50G')
    conf.set('spark.driver.memory', '50G')
    # conf.set('spark.driver.maxResultSize', '10G')

    # reuse the running context if there is one, otherwise create it
    sc = pyspark.SparkContext.getOrCreate(conf=conf)

    # change log level to ERROR
    sc.setLogLevel("ERROR")
    return sc

# Module-level SparkContext used by the data-loading cell.
sc = spark_session_setup()

## Helper functions to load the dataset (adapted from Marcus's code)

In [4]:
def append_train_label(document):
    """
    Attach the integer class label to a training record.

    Looks up ``document[0]`` in the broadcast list ``train_names`` and, on
    the first match, reads the label stored at the same position in the
    broadcast list ``train_labels``.

    Args:
        document: indexable record whose first item is the file name and
            whose second item is the file content.

    Returns:
        ``(name, content, int(label))`` for the first matching name, or
        ``None`` when the name is not present in ``train_names``.
    """
    for position, candidate in enumerate(train_names.value):
        if document[0] == candidate:
            return document[0], document[1], int(train_labels.value[position])
    # no matching name: callers receive None
    return None

def match_test_label(document):
    """
    Return the integer class label of a test record.

    Looks up ``document[0]`` in the broadcast list ``test_names`` and, on
    the first match, reads the label stored at the same position in the
    broadcast list ``test_labels``.

    Args:
        document: indexable record whose first item is the file name.

    Returns:
        ``int(label)`` for the first matching name, or ``None`` when the
        name is not present in ``test_names``.
    """
    for position, candidate in enumerate(test_names.value):
        if document[0] == candidate:
            return int(test_labels.value[position])
    # no matching name: callers receive None
    return None

def remove_train_line_id(document):
    """
    Drop every token longer than two characters from a training record's text.

    The text (``document[1]``) is split on whitespace; only tokens of at
    most two characters are kept, each followed by a single space (so a
    non-empty result ends with a trailing space).

    Args:
        document: ``(name, text, label)`` record.

    Returns:
        ``(name, filtered_text, label)``.
    """
    short_tokens = [token for token in document[1].split() if len(token) <= 2]
    filtered = ''.join(token + ' ' for token in short_tokens)
    return document[0], filtered, document[2]

def remove_test_line_id(document):
    """
    Drop every token longer than two characters from a test record's text.

    The text (``document[1]``) is split on whitespace; only tokens of at
    most two characters are kept, each followed by a single space (so a
    non-empty result ends with a trailing space).

    Args:
        document: ``(name, text)`` record.

    Returns:
        ``(name, filtered_text)``.
    """
    short_tokens = [token for token in document[1].split() if len(token) <= 2]
    filtered = ''.join(token + ' ' for token in short_tokens)
    return document[0], filtered


## Loading the dataset

In [5]:
# Creating the SparkSession is also what makes RDD.toDF() (used below) available.
spark = SparkSession(sc)


def _read_tokens(path):
    """Return the whitespace-separated tokens of a text file, closing the file afterwards."""
    with open(path) as fp:
        return fp.read().split()


# wholeTextFiles yields (file URI, file content) pairs. The name lists read
# below are expanded to the same 'file:<absolute dir>/<name>.bytes' form so
# they can be compared against those URIs.
data = sc.wholeTextFiles('../dataset/data/bytes') #sys.argv[1]
file_path = 'file:' + os.path.realpath('../dataset/data/bytes') + '/' #sys.argv[1]

#Training Set
train_names = sc.broadcast(
    [file_path + name + '.bytes' for name in _read_tokens('../dataset/files/X_small_train.txt')]
)

#Training Labels (kept as strings; position i belongs to train_names.value[i])
train_labels = sc.broadcast(_read_tokens('../dataset/files/y_small_train.txt'))

#Convert Training Data into a Data Frame
train_data = data.filter(lambda x: x[0] in train_names.value)
train_data = train_data.map(append_train_label)
train_data = train_data.map(remove_train_line_id)
train_df = train_data.toDF(['id', 'features', 'label'])

#Testing Set
test_names = sc.broadcast(
    [file_path + name + '.bytes' for name in _read_tokens('../dataset/files/X_small_test.txt')]
)

#Testing Labels (kept as strings; position i belongs to test_names.value[i])
test_labels = sc.broadcast(_read_tokens('../dataset/files/y_small_test.txt'))

#Convert Testing Data into a Data Frame
test_data = data.filter(lambda x: x[0] in test_names.value)
test_data = test_data.map(remove_test_line_id)
test_df = test_data.toDF(['id', 'text'])
# Labels are collected to the driver in the order of the filtered test RDD.
matched_test_labels = test_data.map(match_test_label).collect()

In [6]:
# Show one row to sanity-check the columns of the training DataFrame.
train_df.show(1)

+--------------------+--------------------+-----+
|                  id|            features|label|
+--------------------+--------------------+-----+
|file:/home/saed/P...|EC 83 EC 28 B8 B4...|    3|
+--------------------+--------------------+-----+
only showing top 1 row



## Using the Horovod library
https://docs.databricks.com/applications/deep-learning/distributed-deep-learning/example/mnist-keras.html

In [7]:
def get_model(num_classes, input_shape=(28, 28, 1)):
    """
    Build the (uncompiled) convolutional classifier.

    Architecture: two 3x3 convolutions (32 and 64 filters, ReLU), 2x2 max
    pooling, dropout 0.25, flatten, a 128-unit ReLU dense layer, dropout
    0.5 and a softmax output layer.

    Args:
        num_classes: number of units in the softmax output layer.
        input_shape: shape of a single input sample, passed to the first
            convolution.  Defaults to ``(28, 28, 1)``, the value that was
            previously hard-coded, so existing calls are unaffected.

    Returns:
        The Keras ``Sequential`` model; the caller is responsible for
        compiling it.
    """
    model = keras.models.Sequential()
    model.add(layers.Conv2D(32, kernel_size=(3, 3),
                   activation='relu',
                   input_shape=input_shape))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(layers.Dropout(0.25))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))
    return model

### Initializing Horovod

In [8]:
# Initialise Horovod before hvd.size() and the hvd wrappers are used in the following cells.
hvd.init()

0

In [9]:
# Base learning rate; it is multiplied by hvd.size() when the optimizer is created.
learning_rate=1.0

In [10]:
# Adadelta with the base learning rate scaled by the number of Horovod processes.
optimizer = keras.optimizers.Adadelta(learning_rate * hvd.size())

Instructions for updating:
Colocations handled automatically by placer.


In [11]:
# Wrap the Keras optimizer in Horovod's DistributedOptimizer so gradients are combined across workers.
optimizer = hvd.DistributedOptimizer(optimizer)

In [12]:
# NOTE(review): 10 is hard-coded here while n_classes is set to 9 in the configuration cell — confirm which is intended.
model = get_model(10)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [13]:
# Compile with the Horovod-wrapped optimizer; categorical cross-entropy expects one-hot encoded labels.
model.compile(optimizer=optimizer,
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [14]:
# Broadcast the initial variables from rank 0 so every worker starts training from the same state.
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]

### We have to convert our dataset: Keras expects NumPy arrays, not a Spark RDD

In [15]:
# Check what kind of object train_data is before handing it to Keras.
type(train_data)

pyspark.rdd.PipelinedRDD

In [16]:
# NOTE(review): this call fails as written. train_data / test_data are Spark
# RDDs and train_labels / test_labels are Spark broadcast variables, whereas
# Keras needs array-like inputs (the AttributeError is raised when it looks
# for `.ndim`). The records have to be collected and converted into NumPy
# arrays matching the model's input shape, with one-hot encoded labels,
# before they can be passed to fit().
model.fit(train_data, train_labels, 
            batch_size=batch_size,
            epochs=n_epochs,
            callbacks=callbacks, 
            verbose=2,
            validation_data=(test_data, test_labels))

AttributeError: 'PipelinedRDD' object has no attribute 'ndim'

### Sample dataset (MNIST) that works with the model

In [None]:
# def get_dataset(num_classes=10, rank=0, size=1):
#     (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data('MNIST-data-%d' % rank)

#     # Get subset of data based on Horovod rank and size (gets full dataset for default arguments)
#     x_train = x_train[rank::size]
#     y_train = y_train[rank::size]
#     x_test = x_test[rank::size]
#     y_test = y_test[rank::size]

#     # Add channel dimension so it works with keras.layers.Conv2D
#     x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
#     x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)

#     # Convert uint8 to float32 and scale to [0, 1]
#     x_train = x_train.astype('float32')
#     x_test = x_test.astype('float32')
#     x_train /= 255
#     x_test /= 255

#     # Convert class vectors to binary class matrices
#     y_train = keras.utils.to_categorical(y_train, num_classes)
#     y_test = keras.utils.to_categorical(y_test, num_classes)

#     return (x_train, y_train), (x_test, y_test)

# (x_train, y_train), (x_test, y_test) = get_dataset(10)