In [1]:
import tensorflow
from pysparkling import H2OContext

In [2]:
## Demo-Specific configuration
import os

# Number of partitions to split data into (~simulate a number of nodes)
NODES = 3

# Directory that holds train.csv.gz and test.csv.gz.
# Set the MNIST_DATASET_DIR environment variable to point at your own copy
# instead of editing the notebook; the fallback is the original local path.
DATASET_DIR = os.environ.get("MNIST_DATASET_DIR", "/users/nidhimehta/h2o-3/bigdata/laptop/mnist")

In [3]:
## Initialize TensorFlow session and test it
def map_fun(i):
  """Evaluate a constant string tensor in a throw-away graph and return it.

  The argument is the RDD element supplied by map(); it is not used.
  TensorFlow is imported inside the function, i.e. at call time.
  """
  import tensorflow as tf
  graph = tf.Graph()
  with graph.as_default():
    greeting = tf.constant('Sparkling, TensorFlow!', name="hello_constant")
    with tf.Session() as session:
      result = session.run(greeting)
  return result

# One element per partition, so the check runs once in each of the NODES partitions
sc.parallelize(range(NODES), NODES).map(map_fun).collect()

['Sparkling, TensorFlow!', 'Sparkling, TensorFlow!', 'Sparkling, TensorFlow!']

In [4]:
## Read MNIST data into H2O
import h2o
h2o.__version__
# `sc` is not defined in this notebook - presumably the SparkContext injected
# by the PySpark kernel (TODO confirm).
hc = H2OContext(sc).start()
# Import the gzipped CSV files found under DATASET_DIR as H2O frames
train_frame = h2o.import_file("/".join([DATASET_DIR, "train.csv.gz"]))
test_frame = h2o.import_file("/".join([DATASET_DIR, "test.csv.gz"]))

0,1
H2O cluster uptime:,6 seconds 229 milliseconds
H2O cluster version:,3.8.2.2
H2O cluster name:,sparkling-water-nidhimehta_1121600612
H2O cluster total nodes:,3
H2O cluster total free memory:,2.88 GB
H2O cluster total cores:,24
H2O cluster allowed cores:,24
H2O cluster healthy:,True
H2O Connection ip:,192.168.1.100
H2O Connection port:,54329



Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%


In [5]:
## Turn H2O DataFrame into a Spark DataFrame
# Both frames get the same treatment: convert, then split into NODES partitions
train_df, test_df = (
    hc.as_spark_frame(h2o_frame).repartition(NODES)
    for h2o_frame in (train_frame, test_frame)
)
#train_df.printSchema()

In [6]:
# Configure a TensorFlow Deep Learning model

# - it loads local training data into numpy array (from Spark -> Python)
# - train TF Deep Learning model with 1 hidden layer (50 neurons)
# - output accuracy on training data
def create_nn(data_train, data_test, iterations, batch_size):
    """Train a 784-50-10 network on local data and return its parameters.

    Args:
        data_train: object exposing ``next_batch(n)`` that returns
            ``(features, one_hot_labels)``; supplies the training mini-batches.
        data_test: object with the same ``next_batch`` method; one batch of
            1000 rows is drawn from it to report accuracy.
        iterations: number of gradient-descent steps to run.
        batch_size: number of rows per training mini-batch.

    Returns:
        An iterator over a one-element list holding the tuple
        ``(W1, W2, b1, b2)`` of trained values, which is the form
        ``mapPartitions`` consumes.
    """
    import tensorflow as tf

    # Build everything in a private graph (as map_fun does) so that repeated
    # calls inside one Python worker do not keep adding nodes to the
    # process-wide default graph.
    graph = tf.Graph()
    with graph.as_default():
        # Symbolic computation
        x = tf.placeholder(tf.float32, [None, 784])
        W1 = tf.Variable(tf.random_normal([784, 50], stddev=0.1))
        W2 = tf.Variable(tf.random_normal([50, 10], stddev=0.1))
        b1 = tf.Variable(tf.random_normal([50], stddev=0.1))
        b2 = tf.Variable(tf.random_normal([10], stddev=0.1))
        hidden = tf.nn.relu(tf.matmul(x, W1) + b1)
        y = tf.nn.softmax(tf.matmul(hidden, W2) + b2)
        y_ = tf.placeholder(tf.float32, [None, 10])
        # Clip the probabilities before the log: a softmax output that
        # underflows to 0 would otherwise turn the loss into inf/NaN.
        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
        train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

        # Model evaluation ops
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # Initialize TF
        init = tf.initialize_all_variables()

        # The context manager closes the session even if training raises.
        with tf.Session(graph=graph) as sess:
            sess.run(init)
            print("Training TensorFlow Deep Learning model")
            for _ in range(iterations):
                batch_xs, batch_ys = data_train.next_batch(batch_size)
                sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

            # Fetch all four parameter tensors in a single run call
            model = [tuple(sess.run([W1, W2, b1, b2]))]

            batch_xs, batch_ys = data_test.next_batch(1000)
            print("Training Accuracy:", sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys}))

    return iter(model)
    
    # Export the model
    #from tensorflow_serving.session_bundle import exporter
    #export_path = "/tmp/xxx/"
    #saver = tf.train.Saver(sharded=True)
    #model_exporter = exporter.Exporter(saver)
    #signature = exporter.classification_signature(input_tensor=x, scores_tensor=y)
    #model_exporter.init(sess.graph.as_graph_def(), default_graph_signature=signature)
    #model_exporter.export(export_path, tf.constant(FLAGS.export_version), sess)
    
## Internal Helpers

# Load a partition's rows into a numpy array and draw random
# mini-batches of a requested size from it
import numpy as np

def expand1hot(response, levels):
    """One-hot encode a vector of class indices.

    Args:
        response: 1-D numpy array of class indices in ``[0, levels)``; the
            values may be stored as floats and are cast to integers.
        levels: number of classes, i.e. number of columns in the result.

    Returns:
        A float32 array of shape ``(len(response), levels)`` with a single
        1.0 per row at the column given by ``response``.
    """
    nrows = response.shape[0]
    result = np.zeros((nrows, levels), dtype=np.float32)
    # Cast to the platform index type: the previous int8 cast wrapped around
    # for class indices above 127 and therefore set the wrong column.
    result[np.arange(nrows), response.astype(np.intp)] = 1.0
    return result

class RowData:
    """In-memory copy of one partition's rows that serves random mini-batches.

    Each row is read as 784 feature values (columns 0..783) followed by the
    class label in column 784.
    """

    def __init__(self, it):
        """Materialize the row iterator ``it`` into a 2-D float32 array."""
        self._part_array = np.array([[a for a in x] for x in it], dtype=np.float32)
        # Definition of input features
        self._x = range(784)
        # Index of response
        self._y = 784

    def next_batch(self, n):
        """Return ``n`` randomly chosen rows as ``(features, one_hot_labels)``.

        Rows are drawn without replacement while the partition holds at least
        ``n`` rows. When it holds fewer, rows are drawn with replacement so
        the request can still be served instead of raising ``ValueError``.

        Returns:
            A tuple ``(train, response)``: the features divided by 255 and
            the labels one-hot encoded over 10 classes.
        """
        dim = self._part_array.shape[0] # number of rows
        sample = np.random.choice(dim, n, replace=(n > dim))
        data = self._part_array[sample, :]
        # Data coming from H2O, pixel values are 0..255
        # FIXME: this should be done via RDD or H2O API directly !
        train = data[:, self._x]/255
        response = expand1hot(data[:, self._y], 10)
        return (train, response)



In [7]:
## Run TensorFlow on each node

# Number of iterations
ITERATIONS = 100
# Batch size
BATCH_SIZE = 100
# Use Mnist dataset provided by TensorFlow - for debugging only
USE_TF_MNIST=False

def train_nn(iterations, batch_size, use_tf_mnist=False):
    """Build the per-partition training function passed to mapPartitions.

    Args:
        iterations: number of training steps forwarded to create_nn.
        batch_size: mini-batch size forwarded to create_nn.
        use_tf_mnist: when true, ignore the partition's rows and train on the
            MNIST data set shipped with the TensorFlow tutorials instead.

    Returns:
        A function taking a partition's row iterator and returning whatever
        create_nn returns for that data.
    """
    def _train_on_partition(rows):
        # The same data object is used for training and for the accuracy report
        if use_tf_mnist:
            from tensorflow.examples.tutorials.mnist import input_data
            mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
            train_data = mnist.train
            test_data = mnist.train
        else:
            train_data = RowData(rows)
            test_data = train_data

        return create_nn(train_data, test_data, iterations, batch_size)

    return _train_on_partition

# Train one model per partition and gather the results on the driver
modelsPerNode = train_df.mapPartitions(train_nn(ITERATIONS, BATCH_SIZE, USE_TF_MNIST)).collect()

In [8]:
# Average the weights and biases across all node-local models
AVERAGE = True

# Start from the first collected model; each entry is (W1, W2, b1, b2)
W1, W2, b1, b2 = modelsPerNode[0]

if AVERAGE:
    # Add every remaining model that was actually collected, then divide by
    # that same count. The division lives inside this branch so that turning
    # AVERAGE off keeps the first model unscaled.
    for node_W1, node_W2, node_b1, node_b2 in modelsPerNode[1:]:
        W1 = W1 + node_W1
        W2 = W2 + node_W2
        b1 = b1 + node_b1
        b2 = b2 + node_b2

    n_models = len(modelsPerNode)
    W1 = W1 / n_models
    W2 = W2 / n_models
    b1 = b1 / n_models
    b2 = b2 / n_models

#print(W1)
#print(W2)
print(b1)
print(b2)

[-0.07291194 -0.02839035 -0.07471841 -0.04402993 -0.02235463 -0.02620575
  0.1163722  -0.10408342  0.03329315  0.02185328 -0.01488095 -0.04401837
  0.01081673 -0.08093681  0.00062102  0.18718861  0.10789666 -0.03121319
  0.01584251  0.02999378  0.00920764  0.01968472 -0.22378016 -0.00778377
 -0.03973616  0.03197502 -0.0680975   0.04697173  0.03519468 -0.00212386
  0.02742671 -0.14775626 -0.02035625  0.08105865  0.00861828 -0.00487186
 -0.08079082  0.02739912  0.00801532 -0.10542602  0.12667958 -0.03169039
  0.03707669 -0.14269291 -0.08280306 -0.00424655 -0.03680487 -0.03225352
  0.09846754 -0.09231455]
[-0.31967929 -0.20698647 -0.16512461  0.15594698  0.11046103  0.28740093
 -0.05201261 -0.25301719  0.40938249  0.17546685]


In [9]:
# Flip the weight matrices and turn each bias vector into a single-column
# matrix - presumably the layout H2O expects for initial_weights and
# initial_biases (TODO confirm).
# NOTE: running this cell a second time transposes everything back, so
# re-run the averaging cell first if this cell needs to be repeated.
W1, W2 = np.transpose(W1), np.transpose(W2)
b1 = np.matrix(b1).T
b2 = np.matrix(b2).T

In [10]:
# Initialize an H2O model with the averaged TensorFlow weights/biases
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

## Create an H2O Deep Learning model from the TensorFlow model
dlmodel = H2ODeepLearningEstimator(
    # same network layout as TF - one hidden layer
    hidden=[50],
    # no training done in H2O - just copy over the model from TF
    epochs=0,
    # keep all input features (unless we also drop const cols in TF)
    ignore_const_cols=False,
    # Initialize the H2O model with the TensorFlow model state
    # (requires H2O 3.8.2.1 or later)
    initial_weights=[h2o.H2OFrame(W1.tolist()), h2o.H2OFrame(W2.tolist())],
    initial_biases=[h2o.H2OFrame(b1.tolist()), h2o.H2OFrame(b2.tolist())],
)

# Column 784 is the response; convert it to a factor before building the model
train_frame[784] = train_frame[784].asfactor()
dlmodel.train(x=list(range(784)), y=784, training_frame=train_frame)


Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%

deeplearning Model Build Progress: [                                                  ] 00%


In [11]:
## Check the model performance - Will only be good if we actually copied the TF model into the H2O model above
## (the estimator was created with epochs=0, so H2O itself did no training).
## The metrics are computed on train_frame, the same frame that was passed to dlmodel.train().
dlmodel.model_performance(train_frame)


ModelMetricsMultinomial: deeplearning
** Reported on test data. **

MSE: 0.234191312262
R^2: 0.971945558294
LogLoss: 0.88625787961

Confusion Matrix: vertical: actual; across: predicted



0,1,2,3,4,5,6,7,8,9,10,11
0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,Error,Rate
4886.0,3.0,137.0,72.0,32.0,442.0,163.0,37.0,3.0,148.0,0.1750802,"1,037 / 5,923"
2.0,2233.0,1115.0,805.0,5.0,1576.0,2.0,992.0,1.0,11.0,0.6687926,"4,509 / 6,742"
18.0,9.0,5289.0,130.0,126.0,100.0,96.0,124.0,8.0,58.0,0.1122860,"669 / 5,958"
14.0,16.0,217.0,4868.0,16.0,774.0,20.0,98.0,3.0,105.0,0.2060023,"1,263 / 6,131"
8.0,2.0,38.0,10.0,4909.0,119.0,42.0,19.0,0.0,695.0,0.1597056,"933 / 5,842"
15.0,4.0,83.0,252.0,74.0,4785.0,55.0,25.0,2.0,126.0,0.1173215,"636 / 5,421"
30.0,9.0,128.0,4.0,379.0,326.0,4983.0,35.0,2.0,22.0,0.1579926,"935 / 5,918"
1.0,2.0,144.0,99.0,80.0,74.0,2.0,5055.0,0.0,808.0,0.1931365,"1,210 / 6,265"
10.0,125.0,387.0,383.0,203.0,1898.0,151.0,66.0,1609.0,1019.0,0.7250043,"4,242 / 5,851"



Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.73155
2,0.8636333
3,0.9170666
4,0.9535166
5,0.9754833
6,0.9907500
7,0.9953833
8,0.9980333
9,0.9993666




In [None]:
## Now we have a POJO for the TensorFlow model!
## NOTE(review): download_pojo() is called without a target path here; presumably
## it then prints the generated Java source instead of saving it - confirm in the h2o-py docs.
dlmodel.download_pojo()