In [13]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
from datetime import datetime
import argparse
import subprocess
import shlex
from imp import reload
from tensorflowonspark import TFCluster
from pyspark import SparkContext
from pyspark.conf import SparkConf

In [2]:
# Start from a clean slate: drop earlier HDFS outputs, then the local checkout and archive.
for hdfs_path in ('mnist', 'mnist_model', 'predictions'):
    subprocess.check_output('hdfs dfs -rm -R -f -skipTrash {}'.format(hdfs_path), shell=True)
subprocess.check_output('rm -Rf mnist tensorflowonspark', shell=True)
# Kept as the trailing expression so the cell still displays the command output.
subprocess.check_output('rm -f mnist.zip', shell=True)

b''

In [3]:
# Check out the TensorFlowOnSpark branch whose mnist_dist/mnist_spark examples
# were adjusted for the TF 1.11 APIs.
subprocess.check_output(
    'git clone --single-branch -b leewyang_update_examples '
    'https://github.com/yahoo/tensorflowonspark',
    shell=True)

b''

In [5]:
# Fetch the MNIST example archive and unpack it in the working directory.
subprocess.check_output(
    'curl -fsSL -O https://downloads.mesosphere.com/data-science/assets/mnist.zip'
    ' && unzip mnist.zip',
    shell=True)

b'Archive:  mnist.zip\n   creating: mnist/\n  inflating: mnist/train-images-idx3-ubyte.gz  \n extracting: mnist/train-labels-idx1-ubyte.gz  \n  inflating: mnist/t10k-images-idx3-ubyte.gz  \n extracting: mnist/t10k-labels-idx1-ubyte.gz  \n'

In [6]:
# Run the example's data-setup Spark job to write the MNIST data to mnist/csv2 in csv2 format.
subprocess.check_output(
    'eval spark-submit ${SPARK_OPTS} --verbose '
    '$(pwd)/tensorflowonspark/examples/mnist/mnist_data_setup.py '
    '--output mnist/csv2 --format csv2',
    shell=True)



In [7]:
# Cluster topology; used below as the defaults for --num_ps and --cluster_size.
num_ps = 0
num_executors = 5


def _str2bool(value):
    """Convert a command-line boolean spelling to a real ``bool``.

    Without this, argparse stores the raw string, so ``--rdma False`` would
    yield the truthy string ``'False'``.

    Args:
        value: a ``bool`` (returned unchanged) or a string such as
            ``true``/``false``, ``yes``/``no``, ``1``/``0`` (case-insensitive).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100)
parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
# Boolean flags accept either the bare flag (-> True) or an explicit value (e.g. `--rdma false`).
parser.add_argument("--driver_ps_nodes", help="""run tensorflow PS node on driver locally.
    You will need to set cluster_size = num_executors + num_ps""",
                    nargs="?", const=True, type=_str2bool, default=False)
parser.add_argument("--epochs", help="number of epochs", type=int, default=1)
parser.add_argument("--format", help="example format: (csv2|tfr)", choices=["csv2", "tfr"], default="tfr")
parser.add_argument("--images_labels", help="HDFS path to MNIST image_label files in parallelized format")
parser.add_argument("--mode", help="train|inference", default="train")
parser.add_argument("--model", help="HDFS path to save/load model during train/test", default="mnist_model")
# type=int so a value given on the command line is a number, matching the integer default.
parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=num_ps)
parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("--rdma", help="use rdma connection", nargs="?", const=True, type=_str2bool, default=False)
parser.add_argument("--readers", help="number of reader/enqueue threads per worker", type=int, default=10)
parser.add_argument("--shuffle_size", help="size of shuffle buffer", type=int, default=1000)
parser.add_argument("--steps", help="maximum number of steps", type=int, default=500)
parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")

_StoreTrueAction(option_strings=['--tensorboard'], dest='tensorboard', nargs=0, const=True, default=False, type=None, choices=None, help='launch tensorboard process', metavar=None)

In [None]:
# # CPU Config (disabled alternative to the GPU config cell)
# NOTE(review): labelled CPU, but the image tag is `latest-gpu` and the image
# repository differs from the one in the GPU config — confirm before enabling.
# conf = SparkConf().setAppName('Mnist-CPU') \
#                   .set('spark.mesos.executor.docker.image', 'fabianbaier/data-toolkit:latest-gpu')


In [8]:
# GPU Config: Mesos executor image plus GPU settings
# (max GPUs = num_executors, executor GPUs = 1).
conf = (
    SparkConf()
    .setAppName('Mnist-GPU')
    .set('spark.mesos.executor.docker.image', 'mesosphere/mesosphere-data-toolkit:latest-gpu')
    .set('spark.mesos.gpus.max', num_executors)
    .set('spark.mesos.executor.gpus', 1)
)

In [9]:
# Make sure you cloned the repo with the adjusted TF 1.11 APIs in mnist_dist/mnist_spark : git clone --single-branch -b leewyang_update_examples https://github.com/yahoo/tensorflowonspark
# Call getOrCreate as a classmethod: it reuses an already-active SparkContext and
# only builds a new one (with `conf`) when none exists. The previous form,
# SparkContext(conf=conf).getOrCreate(), always ran the constructor first, which
# raises if a context is already running (e.g. when this cell is re-executed).
sc = SparkContext.getOrCreate(conf=conf)
# Ship the example's mnist_dist module so it is importable on the driver and executors.
sc.addPyFile('tensorflowonspark/examples/mnist/tf/mnist_dist.py')

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.6/site-packages/sparkmonitor/kernelextension.py", line 117, in run
    self.onrecv(msg)
  File "/opt/conda/lib/python3.6/site-packages/sparkmonitor/kernelextension.py", line 134, in onrecv
    "msg": msg
  File "/opt/conda/lib/python3.6/site-packages/sparkmonitor/kernelextension.py", line 214, in sendToFrontEnd
    monitor.send(msg)
  File "/opt/conda/lib/python3.6/site-packages/sparkmonitor/kernelextension.py", line 51, in send
    self.comm.send(msg)
AttributeError: 'ScalaMonitor' object has no attribute 'comm'



In [10]:
import mnist_dist

In [11]:
# Reload the logging module before configuring it — presumably so basicConfig
# takes effect even if logging was already set up in this kernel (confirm).
reload(logging)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
)

In [14]:
# Verify training images.
# Prerequisites: mnist.zip was unzipped into mnist and the mnist_data_setup job was run via:
#   eval spark-submit ${SPARK_OPTS} --verbose $(pwd)/tensorflowonspark/examples/mnist/mnist_data_setup.py --output mnist/csv2 --format csv2
train_images_files = "mnist/csv2/train"
print(
    subprocess.check_output(
        shlex.split('hdfs dfs -ls -R {}'.format(train_images_files))
    )
)

b'-rw-r--r--   3 nobody supergroup          0 2018-11-04 05:05 mnist/csv2/train/_SUCCESS\n-rw-r--r--   3 nobody supergroup    9348476 2018-11-04 05:05 mnist/csv2/train/part-00000\n-rw-r--r--   3 nobody supergroup   11244092 2018-11-04 05:05 mnist/csv2/train/part-00001\n-rw-r--r--   3 nobody supergroup   11227072 2018-11-04 05:05 mnist/csv2/train/part-00002\n-rw-r--r--   3 nobody supergroup   11238388 2018-11-04 05:05 mnist/csv2/train/part-00003\n-rw-r--r--   3 nobody supergroup   11225055 2018-11-04 05:05 mnist/csv2/train/part-00004\n-rw-r--r--   3 nobody supergroup   11186122 2018-11-04 05:05 mnist/csv2/train/part-00005\n-rw-r--r--   3 nobody supergroup   11226573 2018-11-04 05:05 mnist/csv2/train/part-00006\n-rw-r--r--   3 nobody supergroup   11213312 2018-11-04 05:05 mnist/csv2/train/part-00007\n-rw-r--r--   3 nobody supergroup   11206429 2018-11-04 05:05 mnist/csv2/train/part-00008\n-rw-r--r--   3 nobody supergroup   10460475 2018-11-04 05:05 mnist/csv2/train/part-00009\n'


In [15]:
# Build the training configuration: one flag/value pair per line.
args = parser.parse_args([
    '--mode', 'train',
    '--epochs', '1',
    '--batch_size', '100',
    '--images_labels', train_images_files,
    '--format', 'csv2',
    '--steps', '10000',
    '--model', 'mnist_model',
])
print(args)

Namespace(batch_size=100, cluster_size=5, driver_ps_nodes=False, epochs=1, format='csv2', images_labels='mnist/csv2/train', mode='train', model='mnist_model', num_ps=0, output='predictions', rdma=False, readers=10, shuffle_size=1000, steps=10000, tensorboard=False)


In [16]:
# Start the cluster for training.
# The tensorboard argument now comes from the parsed --tensorboard flag; it was
# previously hard-coded to False, so the flag was silently ignored. With the
# default arguments (flag not given) the behaviour is unchanged.
cluster = TFCluster.run(
    sc,
    mnist_dist.map_fun,
    args,
    args.cluster_size,
    args.num_ps,
    tensorboard=args.tensorboard,
    input_mode=TFCluster.InputMode.TENSORFLOW,
    driver_ps_nodes=args.driver_ps_nodes)

05:07:01 INFO:Reserving TFSparkNodes 
05:07:01 INFO:cluster_template: {'ps': range(0, 0), 'worker': range(0, 5)}
05:07:01 INFO:listening for reservations at ('9.0.12.2', 35255)
05:07:01 INFO:Starting TensorFlow on executors
05:07:01 INFO:Waiting for TFSparkNodes to start
05:07:01 INFO:waiting for 5 reservations
05:07:02 INFO:waiting for 5 reservations
05:07:03 INFO:waiting for 5 reservations
05:07:04 INFO:waiting for 5 reservations
05:07:05 INFO:waiting for 1 reservations
05:07:06 INFO:all reservations completed
05:07:06 INFO:All TFSparkNodes started
05:07:06 INFO:{'executor_id': 4, 'host': '10.0.6.118', 'job_name': 'worker', 'task_index': 4, 'port': 43123, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-591qn8pd/listener-z59lcb98', 'authkey': b'\xe3\xcb\xc1-Q\x80M!\xa7\xbe\xa3C\x95C\x0er'}
05:07:06 INFO:{'executor_id': 3, 'host': '10.0.5.40', 'job_name': 'worker', 'task_index': 3, 'port': 32770, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-1ncu13a7/listener-ro2yft2p', 'authkey': b'\x84

In [17]:
# Stop the TensorFlow nodes and shut down the training cluster started above.
cluster.shutdown()

05:08:41 INFO:Stopping TensorFlow nodes
05:08:41 INFO:Shutting down cluster


In [18]:
# Confirm training wrote its output: list the mnist_model directory on HDFS.
print(
    subprocess.check_output(
        shlex.split('hdfs dfs -ls mnist_model')
    )
)

b'Found 10 items\n-rw-r--r--   3 nobody supergroup        128 2018-11-04 05:07 mnist_model/checkpoint\n-rw-r--r--   3 nobody supergroup         40 2018-11-04 05:07 mnist_model/events.out.tfevents.1541308031.ip-10-0-6-30.us-west-2.compute.internal\n-rw-r--r--   3 nobody supergroup     611832 2018-11-04 05:07 mnist_model/graph.pbtxt\n-rw-r--r--   3 nobody supergroup     814168 2018-11-04 05:07 mnist_model/model.ckpt-0.data-00000-of-00001\n-rw-r--r--   3 nobody supergroup        375 2018-11-04 05:07 mnist_model/model.ckpt-0.index\n-rw-r--r--   3 nobody supergroup     178697 2018-11-04 05:07 mnist_model/model.ckpt-0.meta\n-rw-r--r--   3 nobody supergroup     814168 2018-11-04 05:07 mnist_model/model.ckpt-120.data-00000-of-00001\n-rw-r--r--   3 nobody supergroup        375 2018-11-04 05:07 mnist_model/model.ckpt-120.index\n-rw-r--r--   3 nobody supergroup     178697 2018-11-04 05:07 mnist_model/model.ckpt-120.meta\ndrwxr-xr-x   - nobody supergroup          0 2018-11-04 05:07 mnist_model/tra

In [19]:
# Build the inference configuration: one flag/value pair per line.
args = parser.parse_args([
    '--mode', 'inference',
    '--epochs', '1',
    '--batch_size', '100',
    '--images_labels', train_images_files,
    '--format', 'csv2',
    '--steps', '10000',
    '--output', 'predictions',
    '--model', 'mnist_model',
])
print(args)

Namespace(batch_size=100, cluster_size=5, driver_ps_nodes=False, epochs=1, format='csv2', images_labels='mnist/csv2/train', mode='inference', model='mnist_model', num_ps=0, output='predictions', rdma=False, readers=10, shuffle_size=1000, steps=10000, tensorboard=False)


In [20]:
# Start the cluster for inference.
# The tensorboard argument now comes from the parsed --tensorboard flag; it was
# previously hard-coded to False, so the flag was silently ignored. With the
# default arguments (flag not given) the behaviour is unchanged.
cluster = TFCluster.run(
    sc,
    mnist_dist.map_fun,
    args,
    args.cluster_size,
    args.num_ps,
    tensorboard=args.tensorboard,
    input_mode=TFCluster.InputMode.TENSORFLOW,
    driver_ps_nodes=args.driver_ps_nodes)

05:08:59 INFO:Reserving TFSparkNodes 
05:08:59 INFO:cluster_template: {'ps': range(0, 0), 'worker': range(0, 5)}
05:08:59 INFO:listening for reservations at ('9.0.12.2', 39571)
05:08:59 INFO:Starting TensorFlow on executors
05:08:59 INFO:Waiting for TFSparkNodes to start
05:08:59 INFO:waiting for 5 reservations
05:09:00 INFO:waiting for 5 reservations
05:09:01 INFO:all reservations completed
05:09:01 INFO:All TFSparkNodes started
05:09:01 INFO:{'executor_id': 0, 'host': '10.0.5.174', 'job_name': 'worker', 'task_index': 0, 'port': 41609, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-4bqua8ry/listener-91eahqme', 'authkey': b'\xb5g\xef+\x9e_E\xf4\x9ap\xc9M\xc5\x02\xe8\xc3'}
05:09:01 INFO:{'executor_id': 3, 'host': '10.0.6.118', 'job_name': 'worker', 'task_index': 3, 'port': 35141, 'tb_pid': 0, 'tb_port': 0, 'addr': '/tmp/pymp-77vqqli_/listener-p771rtqc', 'authkey': b'\xd8=\xf4G\x92\xfdD\xf3\x99\x17\xc1\xe1Y \xba\x1c'}
05:09:01 INFO:{'executor_id': 1, 'host': '10.0.5.175', 'job_name': 'wor

In [None]:
# Load the "predictions" path (the --output value used for inference) as a text RDD.
# NOTE(review): cluster.shutdown() for the inference run is only called in a later
# cell; presumably the output may still be incomplete at this point — confirm.
predictions = sc.textFile("predictions")

In [None]:
# Show the first 10 records of the predictions RDD.
predictions.take(10)

In [None]:
# Shut down the cluster that was started for inference.
cluster.shutdown()

In [None]:
# Stop the SparkContext at the end of the notebook.
sc.stop()