#  Display options:

In [None]:

#  Display options for this notebook session

import pandas as pd
from IPython.display import display, HTML
from tabulate import tabulate

#  Let pandas use up to 480 characters per line before it wraps wide output
pd.options.display.width = 480

#  Sets horizontal scroll for wide outputs
#
#  NOTE(review): the HTML payload is an empty string, so as written this call
#  cannot apply any styling; confirm whether markup was lost from this cell
display(HTML(""))

print("--")


In [None]:

#  When using UDFs, these execute on another host.
#
#  As such, these settings will need to be copied and run locally also.
#
#  None removes the limit: show every column, and never truncate cell contents
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None


#  Setup stuff: Connectivity

In [None]:

#  Create the client handle; the graph lookup further down goes through it.
#
#  NOTE(review): Client() is called without arguments, so presumably it picks
#  up its connection settings from the environment; confirm
from katana import remote

my_client = remote.Client()

#  Printing the handle is a quick check that the client object was created
print(my_client)


In [None]:

#  Names used to locate the database and the graph in the cell that connects
#  to the graph
#
#  NOTE(review): NUM_PARTITIONS is not referenced by any cell visible here;
#  presumably it matches the number of worker hosts (the sample outputs below
#  show 3); confirm
NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:

#  CONNECT TO GRAPH
#
#  Take the first graph whose name matches. When nothing matches, fail with a
#  message that names the graph and database instead of a bare unpacking error
#  (the exception type, ValueError, is the same as before).

my_graph = next(iter(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)), None)

if my_graph is None:
   raise ValueError("No graph named %r was found in database %r" % (GRAPH_NAME, DB_NAME))

print(my_graph)


In [None]:

#  Show the node count, then the edge count, of the connected graph
display(my_graph.num_nodes(), my_graph.num_edges())


#  UDFs, Part 01: Just MPI

In [None]:

#  Best practice: keep a second notebook open with the Operations Widget ready.
#
#  Why?
#     When you make a mistake below, you will want to kill the Python job (the UDF
#     you submitted below) that is still running on the worker nodes.
#
#  Source:
#     https://mpi4py.readthedocs.io/en/stable/tutorial.html


In [None]:

#  Example for testing; generates a random dataset
#

def my_func():
   """
   Generate four random integers in the range 1-100 on every worker host,
   print them, and hand back only the array produced on host 0.
   """

   #  (previously imported from katana_enterprise.distributed)
   from katana.distributed import single_host
   import numpy as np                                                     #  Unpartitioned objects are just fine; each host works on
                                                                          #  its own, local array

   l_numbers = np.random.randint(low=1, high=101, size=4)                 #  <class 'numpy.ndarray'>, 4 values, upper bound exclusive
   print(l_numbers)

   #  Returning l_numbers directly from every host fails with,
   #
   #     ValueError: Hosts returned more than one value. Please have only one host return a value while other's return a None.
   #     The returned results (index is the host # in list):
   #     [array([71, 25, 37, 40]), array([72, 19, 18, 63]), array([34, 39, 44, 59])]
   #
   #  so single_host is used to select the value coming from host 0 only

   return single_host(host=0, result=l_numbers)


#  Submit the UDF to the worker hosts and show what came back
my_return = my_graph.run(lambda i_graph: my_func())

print("")
display("My return: %s" % str(my_return))


#  Sample output,
#
#     Host 0 output:
#     [80 68 81 23]
#     
#     Host 1 output:
#     [61 17 47 34]
#     
#     Host 2 output:
#     [11 93 11 57]
#     
#     'My return: [80 68 81 23]'


In [None]:

#  Knowing which worker node your code is running on; this approach is preferred
#

def my_func():
   """
   Print the number of the worker node this code runs on, together with the
   total count of worker nodes. Returns nothing.
   """

   from katana.distributed import MPI

   l_world = MPI.COMM_WORLD                                               #  Think of this as a connection handle to this subsystem

   print("This node number: %d   Count of all Nodes: %d" % (l_world.Get_rank(), l_world.Get_size()))

   return None


#  Submit the UDF; it only prints on the workers, so there is nothing to display here
l_result = my_graph.run(lambda i_graph: my_func())

print("--")

#  Sample output,
#
#     Host 0 output:
#     This node number: 0   Count of all Nodes: 3
#     
#     Host 1 output:
#     This node number: 1   Count of all Nodes: 3
#     
#     Host 2 output:
#     This node number: 2   Count of all Nodes: 3


In [None]:

#  Example using inter worker node data passing; approach 1
#

def my_func():
   """
   Pass a random array from node 1 to node 0 and return it from node 0.

   Node 1 only sends, node 0 only receives (and prints what it received);
   any other node does nothing. Every node except 0 contributes None.
   """

   from katana.distributed import MPI, single_host
   import numpy as np

   l_world   = MPI.COMM_WORLD
   l_rank    = l_world.Get_rank()
   l_size    = l_world.Get_size()                                         #  Not used below
   l_payload = None

   if l_rank == 1:                                                        #  Just node-1, just send
      l_world.send(np.random.randint(1, 101, 4), dest = 0)
   elif l_rank == 0:                                                      #  Just node-0, just receive
      l_payload = l_world.recv(source = 1)
      print(l_payload)

   return single_host(host=0, result=l_payload)


#  Submit the UDF; the value selected by single_host on the workers is what comes back
my_return = my_graph.run(lambda i_graph: my_func())

print("")
display(my_return)


#  Sample output,
#
#     Host 0 output:
#     [57  2 16 73]
#     
#     array([57,  2, 16, 73])


In [None]:

#  Example using inter worker node data passing; approach 2
#
#  Change,
#
#     .  Do work on all nodes
#
#     .  Every node other than 0 sends its result to node 0; node 0 collects
#        them in node-number order
#

def my_func():
   """
   Generate a random array on every worker node and collect all of them on
   node 0.

   Returns (through single_host) the list built on node 0: its own array
   first, followed by the arrays received from nodes 1 .. N-1 in node-number
   order, so the list index equals the node number. All other nodes
   contribute an empty list.

   NOTE(review): assumes katana.distributed.MPI behaves like mpi4py's MPI
   module, i.e. isend() returns a request object exposing wait(); confirm
   """

   from katana.distributed import single_host
   from katana.distributed import MPI
      #
   import numpy as np


   l_comm         = MPI.COMM_WORLD
   l_nodenumber   = (l_comm.Get_rank())
   l_nodecount    = (l_comm.Get_size())


   def f_dowork():

      #
      #  Do whatever here to generate what you want to return to the calling
      #  function/client
      #

      return np.random.randint(1, 101, 4)


   #  Once this works, this function should not need to change
   #
   def f_datasend(i_datasend):
      if (l_nodenumber > 0):                                              #  Node 0 keeps its own result; it never sends to itself
         l_request = l_comm.isend(i_datasend, dest = 0)
         l_request.wait()                                                 #  A non-blocking send has to be completed; do not return
         print("SENT")                                                    #  from the UDF while it may still be in flight


   #  Once this works, this function should not need to change
   #
   def f_datarecv(i_result):

      l_return = []
         #
      if (l_nodenumber == 0):
         l_return.append(i_result)

         #  One blocking receive per sending node, whatever the node count is.
         #  Each receive names its source, so arrival order does not matter.
         #
         for l_source in range(1, l_nodecount):
            l_return.append(l_comm.recv(source = l_source))
            print("RECV")

      return l_return


            #####################################################


   l_result    = f_dowork()
      #
   f_datasend(l_result)
      #
   l_returnall = f_datarecv(l_result)


   return single_host(host=0, result=l_returnall)


#  Submit the UDF; the list assembled on node 0 is what comes back
my_return = my_graph.run(lambda i_graph: my_func())

print("")
display(my_return)


#  Error,
#
#      CancelledError: 

#  Output, when it works,
#
#     Host 0 output:
#     RECV
#     RECV
#     
#     Host 1 output:
#     SENT
#     
#     Host 2 output:
#     SENT
#     
#     [array([13, 77, 17, 87]), array([ 3, 86, 14, 37]), array([70,  4, 17, 93])]


In [None]:

#  Hacking version; trying async


def my_func():
   """
   Generate a random array on every worker node and collect all of them on
   node 0, using tagged point-to-point messages.

   Every node other than 0 sends its array to node 0 with tag = its own node
   number; node 0 receives from each node with the matching (source, tag)
   pair. A receive whose tag differs from the sender's tag is never matched
   and blocks forever, so both sides derive the tag from the same node number.

   Returns (through single_host) the list built on node 0: its own array
   first, then the arrays from nodes 1 .. N-1 in node-number order. All other
   nodes contribute an empty list.
   """

   from katana.distributed import single_host
   from katana.distributed import MPI
      #
   import numpy as np


   l_comm         = MPI.COMM_WORLD
   l_nodenumber   = (l_comm.Get_rank())
   l_nodecount    = (l_comm.Get_size())


   def f_dowork():

      #
      #  Do whatever here to generate what you want to return to the calling
      #  function/client
      #

      return np.random.randint(1, 101, 4)


   #  Once this works, this function should not need to change
   #
   def f_datasend(i_datasend):
      if (l_nodenumber > 0):                                              #  Node 0 keeps its own result; it never sends to itself
         l_comm.send(i_datasend, dest = 0, tag = l_nodenumber)            #  Send the real payload, tagged with the sender's number
         print("SENT")


   #  Once this works, this function should not need to change
   #
   def f_datarecv(i_result):

      l_return = []
         #
      if (l_nodenumber == 0):
         l_return.append(i_result)

         for l_source in range(1, l_nodecount):
            l_datarecv = l_comm.recv(source = l_source, tag = l_source)   #  Tag must equal the one used by the sender
            print("RECV")
            l_return.append(l_datarecv)

      return l_return


            #####################################################


   l_result    = f_dowork()
      #
   f_datasend(l_result)
      #
   l_returnall = f_datarecv(l_result)


   return single_host(host=0, result=l_returnall)


#  Submit the UDF; the list assembled on node 0 is what comes back
my_return = my_graph.run(lambda i_graph: my_func())

print("")
display(my_return)





In [None]:

#  One-sided communication example: rank 0 exposes a window of N floats,
#  writes 42 into it, and every other rank reads the window back.

import numpy as np
from mpi4py import MPI
from mpi4py.util import dtlib

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

datatype = MPI.FLOAT
np_dtype = dtlib.to_numpy_dtype(datatype)
itemsize = datatype.Get_size()

#  Only rank 0 backs the window with memory; the other ranks attach 0 bytes
N = 10
win_size = N * itemsize if rank == 0 else 0
win = MPI.Win.Allocate(win_size, comm=comm)

buf = np.empty(N, dtype=np_dtype)
if rank == 0:
    buf.fill(42)
    win.Lock(rank=0)
    win.Put(buf, target_rank=0)
    win.Unlock(rank=0)
    comm.Barrier()                      #  readers may start only after the Put is complete
else:
    comm.Barrier()
    win.Lock(rank=0)
    win.Get(buf, target_rank=0)
    win.Unlock(rank=0)
    assert np.all(buf == 42)
    print(str(buf))

#  Release the window (collective: every rank calls it) so that re-running
#  this cell does not keep allocating new windows
win.Free()

print("--")


In [None]:

#  Scatter example: the root hands one row of 100 integers to every rank

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

#  Only the root provides a send buffer; row i holds the value i throughout
#  and is the chunk delivered to rank i
sendbuf = None
if rank == 0:
    sendbuf = np.empty((size, 100), dtype='i')
    sendbuf[:, :] = np.arange(size).reshape(size, 1)

recvbuf = np.empty(100, dtype='i')
comm.Scatter(sendbuf, recvbuf, root=0)

#  Each rank must have received a row filled with its own rank number
assert np.allclose(recvbuf, rank)

print("--")


In [None]:

#  Gather example: every rank contributes 100 integers equal to its rank
#  number; the root collects them as one row per rank and prints the rows

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

sendbuf = np.full(100, rank, dtype='i')

#  Only the root needs a receive buffer
recvbuf = np.empty([size, 100], dtype='i') if rank == 0 else None

comm.Gather(sendbuf, recvbuf, root=0)

if rank == 0:
    for i in range(size):
        #  assert np.allclose(recvbuf[i,:], i)
        print(recvbuf[i], i)
        

In [None]:

#  Another example; building a DataFrame which gets returned
#

def my_func(i_graph, i_properties):
   """
   Build a pandas DataFrame from node properties of the graph, add an
   upper-cased copy of the "airport_name" column, and return the frame
   produced on host 0.

   i_graph        Graph object handed to the UDF; its node properties are
                  read through i_graph.nodes.get_property(name).to_numpy()
   i_properties   List of node property names; each becomes one column

   Returns (through single_host) the DataFrame with one column per requested
   property plus "airport_name_upper". Values that are None stay None in the
   new column; if "airport_name" was not requested the new column is all None.
   """

   from katana.distributed import single_host                            #  Same module the other UDFs in this notebook import from
      #
   import numpy  as np
   import pandas as pd


   ##################################################################

   #  Nested function, load a Python DataFrame from the KG graph

   def l_load_df(i_graph, i_properties):

      #  One (n, 1) column vector per property, stacked side by side
      #
      l_columns = [
         i_graph.nodes.get_property(l_property).to_numpy().reshape(-1, 1)
         for l_property in i_properties
      ]

      return pd.DataFrame(np.hstack(l_columns), columns=i_properties)

   ##################################################################

   #  Nested function, add a new, derived column to the DataFrame (in place)

   def l_enrich_df(i_dataframe, i_source_property, i_new_property):

      if i_source_property not in i_dataframe.columns:                    #  Nothing to derive from; the new column is all None
         i_dataframe[i_new_property] = None
         return

      #  Work on the one source column instead of applying a function to
      #  every row of the frame
      #
      i_dataframe[i_new_property] = [
         None if l_value is None else str(l_value).upper()
         for l_value in i_dataframe[i_source_property]
      ]

      return

   ##################################################################


   l_dataframe = l_load_df(i_graph, i_properties)
      #
   l_enrich_df(l_dataframe, "airport_name", "airport_name_upper")


   #  Every host builds a frame; only the one from host 0 may be returned
   #
   return single_host(host=0, result=l_dataframe)


#  Node properties to load; each one becomes a DataFrame column
l_properties = ["id", "airport_name", "restaurant_name"]

l_result = my_graph.run(lambda i_graph: my_func(i_graph, l_properties))

print(l_result)


#  Sample output,
#
#         id    airport_name              restaurant_name airport_name_upper
#     0  SJC        San Jose                         None           SAN JOSE
#     1  DEN          Denver                         None             DENVER
#     2  ORD  Chicago O-Hare                         None     CHICAGO O-HARE
#     3  PAP            None  Pappadeauxs Seafood Kitchen               None
