In [1]:
# # Run this cell on a cluster that uses the data_team_concurrent IAM to copy the Cora files from S3 into DBFS.
# dbutils.fs.cp("s3://ehth-databricks-poc-test/ehealth/JingP/cora/cora.cites","dbfs:/tmp/cora/cora.cites")
# dbutils.fs.cp("s3://ehth-databricks-poc-test/ehealth/JingP/cora/cora.content","dbfs:/tmp/cora/cora.content")

In [2]:
# # Then switch to the Horovod cluster and copy the files from DBFS to the local file system.
# dbutils.fs.cp("dbfs:/tmp/cora/cora.cites","file:/tmp/cora/cora.cites")
# dbutils.fs.cp("dbfs:/tmp/cora/cora.content","file:/tmp/cora/cora.content")

In [3]:
%run ./utils

In [4]:
%run ./layers/graph

In [5]:
import os
import time

# Every run gets its own checkpoint folder, named after the current Unix
# timestamp; the empty last component keeps the trailing slash on the path.
checkpoint_dir = os.path.join('/dbfs/kegra_horovod/train', str(time.time()), '')
os.makedirs(checkpoint_dir)

In [6]:
from __future__ import print_function

from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

# On Databricks the kegra helpers are brought in with %run (./utils and ./layers/graph) instead of these package imports:
#from kegra.layers.graph import GraphConvolution
#from kegra.utils import *

import time

# Horovod: Import the relevant submodule
import horovod.keras as hvd
from keras import backend as K
import tensorflow as tf
import keras

In [7]:
# Experiment configuration (read by train_hvd and the data-loading cells)
DATASET = 'cora'
PATH = '/dbfs/tmp/cora/'    # directory handed to load_data
FILTER = 'localpool'        # alternative: 'chebyshev'
MAX_DEGREE = 2              # maximum polynomial degree (Chebyshev filter only)
SYM_NORM = True             # True: symmetric normalization, False: left-only
NB_EPOCH = 200              # number of training epochs
PATIENCE = 10               # early stopping patience
LEARNING_RATE = 0.01        # default learning rate for train_hvd

In [8]:
%sh ls /tmp/cora

In [9]:
def train_hvd(learning_rate=LEARNING_RATE):
  """Build and train the two-layer graph convolutional network with Horovod.

  The function can be called directly in the notebook process or handed to
  HorovodRunner, in which case every worker executes it. Each call loads the
  whole dataset itself and trains full-batch (batch size = number of rows of
  the adjacency matrix). It reads the notebook-level settings DATASET, PATH,
  FILTER, MAX_DEGREE, SYM_NORM, NB_EPOCH and checkpoint_dir.

  Args:
    learning_rate: learning rate passed to the Adam optimizer.

  Returns:
    None. Training progress is printed by Keras (verbose=2) and rank 0
    registers a ModelCheckpoint callback that writes under checkpoint_dir.

  Raises:
    Exception: if FILTER is neither 'localpool' nor 'chebyshev'.
  """
  # Horovod: initialize Horovod.
  hvd.init()

  # Get data
  X, A, y = load_data(path=PATH, dataset=DATASET)
  y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask = get_splits(y)

  # Row-normalize X: divide every row by its sum.
  X /= X.sum(1).reshape(-1, 1)

  if FILTER == 'localpool':
    # Local pooling filters (see 'renormalization trick' in Kipf & Welling, arXiv 2016)
    print('Using local pooling filters...')
    A_ = preprocess_adj(A, SYM_NORM)
    support = 1
    graph = [X, A_]
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)]

  elif FILTER == 'chebyshev':
    # Chebyshev polynomial basis filters (Defferard et al., NIPS 2016)
    print('Using Chebyshev polynomial basis filters...')
    L = normalized_laplacian(A, SYM_NORM)
    L_scaled = rescale_laplacian(L)
    T_k = chebyshev_polynomial(L_scaled, MAX_DEGREE)
    support = MAX_DEGREE + 1
    graph = [X] + T_k
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True) for _ in range(support)]

  else:
    raise Exception('Invalid filter type.')

  X_in = Input(shape=(X.shape[1],))

  print("X_in shape is {}".format(X_in.shape))

  # Define model architecture
  # NOTE: We pass arguments for graph convolutional layers as a list of tensors.
  # This is somewhat hacky, more elegant options would require rewriting the Layer base class.
  H = Dropout(0.5)(X_in)
  H = GraphConvolution(16, support, activation='relu', kernel_regularizer=l2(5e-4))([H] + G)
  H = Dropout(0.5)(H)
  Y = GraphConvolution(y.shape[1], support, activation='softmax')([H] + G)

  # Horovod: wrap the optimizer so that gradients are averaged across all
  # workers before each update; without the wrapper every process would train
  # its own independent copy of the model. The learning rate is used as passed
  # (not multiplied by hvd.size()) because each process fits the same
  # full-batch graph rather than a distinct shard of the data.
  optimizer = hvd.DistributedOptimizer(Adam(lr=learning_rate))

  # Compile model
  model = Model(inputs=[X_in] + G, outputs=Y)
  model.compile(loss='categorical_crossentropy', optimizer=optimizer)

  print ("model compile is done")

  callbacks = [
      # Horovod: broadcast initial variable states from rank 0 to all other processes.
      # This is necessary to ensure consistent initialization of all workers when
      # training is started with random weights or restored from a checkpoint.
      hvd.callbacks.BroadcastGlobalVariablesCallback(0),
  ]

  # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
  # NOTE(review): checkpoint_dir lives under /dbfs; confirm that weight files
  # can be written there directly from this cluster.
  if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.ckpt', save_weights_only = True,period=100))

  print ("hvd call back done")

  # Fit
  # Full-batch training (we mask nodes without labels for loss calculation).
  # The callbacks must be passed here, otherwise neither the initial weight
  # broadcast nor the checkpointing configured above ever runs.
  model.fit(graph, y_train, sample_weight=train_mask,
            batch_size=A.shape[0], epochs=NB_EPOCH, verbose=2, shuffle=False,
            callbacks=callbacks)
        

# Call the training function once directly from the notebook and report the
# elapsed wall-clock time in seconds.
t = time.time()
train_hvd()
print("time cost is {}".format(time.time() - t))
#print(X_in.shape)

In [10]:
from sparkdl import HorovodRunner

# Launch train_hvd through HorovodRunner with np=8 and report the elapsed
# wall-clock time in seconds.
# NOTE(review): `model` receives whatever train_hvd returns; the function
# defined in this notebook has no return statement.
t = time.time()
hr = HorovodRunner(np=8)
model = hr.run(train_hvd, learning_rate=LEARNING_RATE)
print("time cost is {}".format(time.time() - t))

In [11]:
# Load the dataset at notebook scope (train_hvd loads its own copy internally).
X, A, y = load_data(path=PATH, dataset=DATASET)

In [12]:
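# Background reading on the "TypeError: can't pickle _thread.lock / _thread.RLock objects" error (see the Keras issues linked below):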
#https://stackoverflow.com/questions/44144584/typeerror-cant-pickle-thread-lock-objects
#https://github.com/keras-team/keras/issues/8343
#https://github.com/keras-team/keras/issues/10528
#https://bugs.python.org/issue29168
#https://www.reddit.com/r/learnpython/comments/bl2vze/what_does_the_error_message_typeerror_cant_pickle/
#https://www.reddit.com/r/learnpython/comments/brbplc/cant_pickle_threadrlock_objects/
#https://www.bountysource.com/issues/66242430-typeerror-can-t-pickle-_thread-rlock-objects-while-saving-the-keras-model-using-model-save
#https://devrant.com/rants/1279739/typeerror-cant-pickle-thread-lock-objects-someone-please-guide-me-pick-dump-mode
#https://bugzilla.redhat.com/show_bug.cgi?id=1444983

In [13]:
# Testing: score the predictions on the held-out test nodes.
# NOTE(review): in the cells of this notebook, `preds`, `y_test` and `idx_test`
# are only assigned inside train_hvd, so they appear to be undefined at
# notebook scope -- confirm where they come from before relying on this cell.
test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test])
print(
    "Test set results:",
    "loss= {:.4f}".format(test_loss[0]),
    "accuracy= {:.4f}".format(test_acc[0]),
)