In [9]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
from datetime import datetime
import argparse
import subprocess
import shlex
from imp import reload
from tensorflowonspark import TFCluster
from pyspark import SparkContext
from pyspark.conf import SparkConf

In [None]:
# Remove existing models/artifacts if any
subprocess.check_output('hdfs dfs -rm -R -f -skipTrash mnist', shell=True)
subprocess.check_output('hdfs dfs -rm -R -f -skipTrash mnist_model', shell=True)
subprocess.check_output('hdfs dfs -rm -R -f -skipTrash predictions', shell=True)
subprocess.check_output('rm -Rf mnist tensorflowonspark', shell=True)
subprocess.check_output('rm -f mnist.zip', shell=True)

In [None]:
# Clone the repo with the adjusted TF 1.11 APIs in mnist_dist/mnist_spark
subprocess.check_output('cd $MESOS_SANDBOX && git clone --single-branch -b leewyang_update_examples https://github.com/yahoo/tensorflowonspark', shell=True)

In [None]:
# Download the mnist example 
subprocess.check_output('cd $MESOS_SANDBOX && curl -fsSL -O https://downloads.mesosphere.com/data-science/assets/mnist.zip && unzip mnist.zip', shell=True)

In [None]:
# Create mnist data in csv2 format
subprocess.check_output('eval spark-submit ${SPARK_OPTS} --verbose tensorflowonspark/examples/mnist/mnist_data_setup.py --output mnist/tfr --format tfr', shell=True)

In [10]:
# Set the number of executors to the number of available GPU agents
num_ps = 0
num_executors = 5

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100)
parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
parser.add_argument("--driver_ps_nodes", help="""run tensorflow PS node on driver locally.
    You will need to set cluster_size = num_executors + num_ps""", default=False)
parser.add_argument("--epochs", help="number of epochs", type=int, default=1)
parser.add_argument("--format", help="example format: (csv2|tfr)", choices=["csv2", "tfr"], default="tfr")
parser.add_argument("--images_labels", help="HDFS path to MNIST image_label files in parallelized format")
parser.add_argument("--mode", help="train|inference", default="train")
parser.add_argument("--model", help="HDFS path to save/load model during train/test", default="mnist_model")
parser.add_argument("--num_ps", help="number of ps nodes", default=num_ps)
parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("--rdma", help="use rdma connection", default=False)
parser.add_argument("--readers", help="number of reader/enqueue threads per worker", type=int, default=10)
parser.add_argument("--shuffle_size", help="size of shuffle buffer", type=int, default=1000)
parser.add_argument("--steps", help="maximum number of steps", type=int, default=500)
parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")

_StoreTrueAction(option_strings=['--tensorboard'], dest='tensorboard', nargs=0, const=True, default=False, type=None, choices=None, help='launch tensorboard process', metavar=None)

In [None]:
# # CPU Config
# conf = SparkConf().setAppName('Mnist-CPU') \
#                   .set('spark.mesos.executor.docker.image', 'fabianbaier/data-toolkit:latest-gpu') 


In [11]:
# GPU Config
conf = SparkConf().setAppName('Mnist-GPU') \
                 .set('spark.mesos.executor.docker.image', 'mesosphere/mesosphere-data-toolkit:latest-gpu') \
                 .set('spark.mesos.gpus.max', num_executors) \
                 .set('spark.mesos.executor.gpus', 1)

In [12]:
# Make sure you cloned the repo with the adjusted TF 1.11 APIs in mnist_dist/mnist_spark : git clone --single-branch -b leewyang_update_examples https://github.com/yahoo/tensorflowonspark
sc = SparkContext(conf=conf).getOrCreate()
sc.addPyFile('tensorflowonspark/examples/mnist/tf/mnist_dist.py')

In [13]:
import mnist_dist

In [14]:
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S')

In [15]:
# Verify training images
# Make sure you unzipped mnist.zip into mnist and ran the mnist_data_setup job via: eval spark-submit ${SPARK_OPTS} --verbose $(pwd)/tensorflowonspark/examples/mnist/mnist_data_setup.py --output mnist/csv2 --format csv2
train_images_files = "mnist/tfr/train"
print(subprocess.check_output(shlex.split('hdfs dfs -ls -R {}'.format(train_images_files))))

b'-rw-r--r--   3 nobody supergroup          0 2018-11-05 22:25 mnist/tfr/train/_SUCCESS\n-rw-r--r--   3 nobody supergroup    4865957 2018-11-05 22:25 mnist/tfr/train/part-r-00000\n-rw-r--r--   3 nobody supergroup    5853920 2018-11-05 22:25 mnist/tfr/train/part-r-00001\n-rw-r--r--   3 nobody supergroup    5844694 2018-11-05 22:25 mnist/tfr/train/part-r-00002\n-rw-r--r--   3 nobody supergroup    5851485 2018-11-05 22:25 mnist/tfr/train/part-r-00003\n-rw-r--r--   3 nobody supergroup    5844411 2018-11-05 22:25 mnist/tfr/train/part-r-00004\n-rw-r--r--   3 nobody supergroup    5824177 2018-11-05 22:25 mnist/tfr/train/part-r-00005\n-rw-r--r--   3 nobody supergroup    5843674 2018-11-05 22:25 mnist/tfr/train/part-r-00006\n-rw-r--r--   3 nobody supergroup    5835612 2018-11-05 22:25 mnist/tfr/train/part-r-00007\n-rw-r--r--   3 nobody supergroup    5833012 2018-11-05 22:25 mnist/tfr/train/part-r-00008\n-rw-r--r--   3 nobody supergroup    5444489 2018-11-05 22:25 mnist/tfr/train/part-r-00009\n'

In [16]:
# Parse arguments for training
args = parser.parse_args(['--mode', 'train', '--epochs', '3',
                          '--batch_size', '100',
                          '--images_labels', train_images_files,
                          '--format', 'tfr',
                          '--steps', '10000',
                          '--model', 'mnist_model'])
print(args)

Namespace(batch_size=100, cluster_size=5, driver_ps_nodes=False, epochs=3, format='tfr', images_labels='mnist/tfr/train', mode='train', model='mnist_model', num_ps=0, output='predictions', rdma=False, readers=10, shuffle_size=1000, steps=10000, tensorboard=False)


In [17]:
# Start the cluster for training
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, args.num_ps, False, TFCluster.InputMode.TENSORFLOW, driver_ps_nodes=args.driver_ps_nodes)

10:39:54 INFO:Reserving TFSparkNodes 
10:39:54 INFO:cluster_template: {'ps': range(0, 0), 'worker': range(0, 5)}
10:39:54 INFO:listening for reservations at ('9.0.7.2', 35579)
10:39:54 INFO:Starting TensorFlow on executors
10:39:55 INFO:Waiting for TFSparkNodes to start
10:39:55 INFO:waiting for 5 reservations
10:39:56 INFO:waiting for 5 reservations
10:39:57 INFO:waiting for 5 reservations
10:39:58 INFO:waiting for 5 reservations
10:39:59 INFO:waiting for 4 reservations
10:40:00 INFO:all reservations completed
10:40:00 INFO:All TFSparkNodes started
10:40:00 INFO:{'executor_id': 2, 'host': '10.0.4.228', 'job_name': 'worker', 'task_index': 2, 'port': 35825, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-c01_wacm/listener-99upz2mo', 'authkey': b'\xbc\xce`n\xdboH\x91\xbd/<\xfa[\xfb\x15r'}
10:40:00 INFO:{'executor_id': 4, 'host': '10.0.7.225', 'job_name': 'worker', 'task_index': 4, 'port': 46340, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-afgx4tuj/listener-_al9izeg', 'authkey': b'\xe7W\x

In [18]:
cluster.shutdown()

10:41:24 INFO:Stopping TensorFlow nodes
10:41:24 INFO:Shutting down cluster


In [19]:
# See if mnist_model was successfully created
print(subprocess.check_output(shlex.split('hdfs dfs -ls mnist_model')))

b'Found 10 items\n-rw-r--r--   3 nobody supergroup        128 2018-11-05 22:40 mnist_model/checkpoint\n-rw-r--r--   3 nobody supergroup         40 2018-11-05 22:40 mnist_model/events.out.tfevents.1541457603.ip-10-0-5-225.us-west-2.compute.internal\n-rw-r--r--   3 nobody supergroup     179586 2018-11-05 22:40 mnist_model/graph.pbtxt\n-rw-r--r--   3 nobody supergroup     814168 2018-11-05 22:40 mnist_model/model.ckpt-0.data-00000-of-00001\n-rw-r--r--   3 nobody supergroup        375 2018-11-05 22:40 mnist_model/model.ckpt-0.index\n-rw-r--r--   3 nobody supergroup      66439 2018-11-05 22:40 mnist_model/model.ckpt-0.meta\n-rw-r--r--   3 nobody supergroup     814168 2018-11-05 22:40 mnist_model/model.ckpt-354.data-00000-of-00001\n-rw-r--r--   3 nobody supergroup        375 2018-11-05 22:40 mnist_model/model.ckpt-354.index\n-rw-r--r--   3 nobody supergroup      66439 2018-11-05 22:40 mnist_model/model.ckpt-354.meta\ndrwxr-xr-x   - nobody supergroup          0 2018-11-05 22:40 mnist_model/tr

In [20]:
# Parse arguments for inference
args = parser.parse_args(['--mode', 'inference', '--epochs', '3',
                          '--batch_size', '100',
                          '--images_labels', train_images_files,
                          '--format', 'tfr',
                          '--steps', '10000',
                          '--output', 'predictions',
                          '--model', 'mnist_model'])
print(args)

Namespace(batch_size=100, cluster_size=5, driver_ps_nodes=False, epochs=3, format='tfr', images_labels='mnist/tfr/train', mode='inference', model='mnist_model', num_ps=0, output='predictions', rdma=False, readers=10, shuffle_size=1000, steps=10000, tensorboard=False)


In [21]:
# Start the cluster for inference
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, args.num_ps, False, TFCluster.InputMode.TENSORFLOW, driver_ps_nodes=args.driver_ps_nodes)

10:41:48 INFO:Reserving TFSparkNodes 
10:41:48 INFO:cluster_template: {'ps': range(0, 0), 'worker': range(0, 5)}
10:41:48 INFO:listening for reservations at ('9.0.7.2', 35357)
10:41:48 INFO:Starting TensorFlow on executors
10:41:48 INFO:Waiting for TFSparkNodes to start
10:41:48 INFO:waiting for 5 reservations
10:41:49 INFO:waiting for 5 reservations
10:41:50 INFO:all reservations completed
10:41:50 INFO:All TFSparkNodes started
10:41:50 INFO:{'executor_id': 0, 'host': '10.0.4.228', 'job_name': 'worker', 'task_index': 0, 'port': 40124, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-0gzba7qn/listener-52gcid_c', 'authkey': b'-Tic\xeb\xefH\xa2\x97\x9c\x1bTs\xbd5\xe8'}
10:41:50 INFO:{'executor_id': 4, 'host': '10.0.7.225', 'job_name': 'worker', 'task_index': 4, 'port': 39938, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-_45qn21g/listener-soc8dl60', 'authkey': b'\x08\xe2)\xc6\x13:Gp\x9b1\xd2\x8c|\x17\xf1m'}
10:41:50 INFO:{'executor_id': 2, 'host': '10.0.4.116', 'job_name': 'worker', 'task_i

In [22]:
predictions = sc.textFile("predictions")

In [23]:
predictions.take(10)

['0 0', '6 6', '4 4', '4 4', '4 4', '9 9', '2 2', '3 3', '8 8', '8 8']

In [24]:
cluster.shutdown()

10:44:04 INFO:Stopping TensorFlow nodes
10:44:04 INFO:Shutting down cluster


In [25]:
sc.stop()