#  Display options:

In [None]:

#  Setting display options for this notebook session

import pandas as pd
   #
pd.set_option("display.width", 480)                                       #  Let pandas use up to 480 characters per output line

#  Sets horizontal scroll for wide outputs
#
#  NOTE(review): HTML("") renders an empty fragment, so as written nothing is
#  injected into the page here -- confirm whether a style payload was lost.
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate

print("--")


In [None]:

#  When using UDFs, these execute on another host.
#
#  As such, these methods will need to be copied and run locally also.
#
#  Remove pandas' display limits so wide results print in full. Full option names
#  are used on purpose; abbreviated keys are resolved by pattern matching and can
#  become ambiguous when pandas adds new options.
#
pd.set_option("display.max_columns", None)                                #  Never elide columns
pd.set_option("display.max_colwidth", None)                               #  Never truncate long cell values


#  Setup stuff: Connectivity

In [None]:

from katana import remote

#  Create the remote client with no explicit arguments. Later cells reach the
#  database/graph and the trace logs through this one handle.
#
my_client = remote.Client()

print(my_client)


In [None]:

#  Names used to locate the graph in the cells below.
#
#  NOTE(review): NUM_PARTITIONS is not referenced by any cell visible in this
#  part of the notebook -- confirm whether it is still needed.
#
NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"                                                 #  Database looked up via my_client.get_database()
GRAPH_NAME      = "my_graph"                                              #  Graph name searched for inside that database

print("--")


In [None]:

#  CONNECT TO GRAPH
#
#  The lookup by name hands back a collection of matches; we use the first one.
#  When nothing matches we stop with a readable message instead of the opaque
#  "not enough values to unpack" error that tuple-unpacking would raise.
#

my_matches = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)
my_graph   = next(iter(my_matches), None)                                 #  First match, or None when there is none
   #
if (my_graph is None):
   raise ValueError("No graph named '%s' was found in database '%s'" % (GRAPH_NAME, DB_NAME))

print(my_graph)


In [None]:

#  Show the node count, then the edge count, of the connected graph.
#
display(my_graph.num_nodes(), my_graph.num_edges())


#  UDFs, Part 01: Just MPI

In [None]:

#  Best practice; Have a second NoteBook open with the Operations Widget ready-
#
#  Why ?
#     When you make mistakes below, you will want to kill the Python job that is running
#     on the worker nodes, the UDFs you submit below-
#
#  Source,
#     https://mpi4py.readthedocs.io/en/stable/tutorial.html


In [None]:

#  Example for testing; generates a random dataset
#

def my_func():
   """
   Generate a small random dataset and return only host 0's copy.

   Every host that executes this function draws and prints its own four
   integers. The value is then wrapped with single_host() so that only
   host 0 contributes a result to the caller.

   Returns:
      The value produced by single_host(host=0, result=<numpy.ndarray of
      four random integers in the range 1-100>).
   """

   #  from katana_enterprise.distributed import single_host
   from katana.distributed import single_host
      #
   import numpy as np                                                     #  Unpartitioned objects are just fine; we are working from only
                                                                          #  one node

   l_numbers = np.random.randint(low=1, high=101, size=4)                 #  <class 'numpy.ndarray'>, 4 values, each 1-100
   print(l_numbers)

   #  A plain "return l_numbers" does not work here; every host would return a
   #  value and the call fails with,
   #
   #     ValueError: Hosts returned more than one value. Please have only one host return a value while other's return a None.
   #     The returned results (index is the host # in list):
   #     [array([71, 25, 37, 40]), array([72, 19, 18, 63]), array([34, 39, 44, 59])]
   #
   #  Multiple worker nodes (hosts), can only return from 1
   #
   return single_host(host=0, result=l_numbers)


#  Submit the UDF through the graph. The lambda is handed the graph (g), which
#  this example does not need, so it simply calls my_func().
#
my_return = my_graph.run(lambda g: my_func())
   #
print("")
display("My return: %s" % (str(my_return)))


#  Sample output,
#
#     Host 0 output:
#     [80 68 81 23]
#     
#     Host 1 output:
#     [61 17 47 34]
#     
#     Host 2 output:
#     [11 93 11 57]
#     
#     'My return: [80 68 81 23]'


In [None]:

#  Knowing which worker node your code is running on; this approach is preferred
#

def my_func():
   """
   Print which worker node this call is running on, and how many there are.

   Reads the rank and size of MPI.COMM_WORLD and prints both. Nothing is
   returned to the caller.

   Returns:
      None
   """

   from katana.distributed import MPI

   l_world = MPI.COMM_WORLD                                               #  Think of this as a connection handle to this subsystem
      #
   l_rank, l_size = l_world.Get_rank(), l_world.Get_size()                #  This node's number, and the count of all nodes
      #
   print("This node number: %d   Count of all Nodes: %d" % (l_rank, l_size))

   return None


#  Submit the UDF through the graph; the lambda ignores the graph (g) it is given.
#  The interesting part of this cell is the per-host printout, not l_result.
#
l_result = my_graph.run(lambda g: my_func())
   #
print("--")

#  Sample output,
#
#     Host 0 output:
#     This node number: 0   Count of all Nodes: 3
#     
#     Host 1 output:
#     This node number: 1   Count of all Nodes: 3
#     
#     Host 2 output:
#     This node number: 2   Count of all Nodes: 3


In [None]:

#  Example using single inter worker node data passing (what MPI labels, "Point
#  to Point Communication") ; approach 1
#

def my_func():
   """
   Pass one array from node 1 to node 0 using point to point send()/recv().

   Node 1 generates four random integers (1-100) and sends them to node 0.
   Node 0 receives from node 1, prints what it got, and keeps it as the
   result. Any other node neither sends nor receives.

   Returns:
      The value produced by single_host(host=0, result=...), where the
      result is the received array on node 0 and None on every other node.
   """

   from katana.distributed import single_host
   from katana.distributed import MPI
      #
   import numpy as np


   l_world     = MPI.COMM_WORLD
   l_rank      = l_world.Get_rank()
   l_size      = l_world.Get_size()                                       #  Not used below; kept for parity with the other examples
      #
   l_payload   = None                                                     #  Stays None on every node except node-0


   if (l_rank == 1):                                                      #  Just node-1, just send
      l_world.send(np.random.randint(1, 101, 4), dest = 0)
   elif (l_rank == 0):                                                    #  Just node-0, just receive
      l_payload = l_world.recv(source = 1)
      print(l_payload)

   return single_host(host=0, result=l_payload)


#  Submit the UDF through the graph; the lambda ignores the graph (g) it is given.
#  my_return holds what node-0 received from node-1.
#
my_return = my_graph.run(lambda g: my_func())
   #
print("")
display(my_return)


#  Sample output,
#
#     Host 0 output:
#     [57  2 16 73]
#     
#     array([57,  2, 16, 73])


In [None]:

#  Example using multiple inter worker node data passing; approach 2
#
#  Change,
#
#     .  Do work on all nodes
#
#     **  This approach will hang or similar as written-
#

#  def my_func():
#      
#     from katana.distributed import single_host
#     from katana.distributed import MPI
#        #
#     import numpy as np
#  
#  
#     l_comm         = MPI.COMM_WORLD
#     l_nodenumber   = (l_comm.Get_rank())
#     l_nodecount    = (l_comm.Get_size())
#  
#  
#     def f_dowork():                                                        
#  
#        #
#        #  Do whatever here to generate what you want to return to the calling
#        #  function/client
#        #
#      
#        l_result = np.random.randint(1, 101, 4)
#           #
#        return l_result
#      
#      
#     #  Once this works, this function should not need to change
#     #
#     def f_datasend(i_datasend):
#        if (l_nodenumber > 0):                                               #  Generally; it seems to hang if a node tries to
#           l_comm.isend(i_datasend, dest = 0)                                #  send/recv to itself.
#           print("SENT")
#  
#      
#     #  Once this works, this function should not need to change
#     #
#     def f_datarecv(i_result):
#      
#        l_return = []
#           #
#        if (l_nodenumber == 0):
#           l_return.append(i_result)
#  
#          
#        #  if l_nodenumber == 0:                                            #  This block hangs every time.
#        #     l_datarecv = l_comm.recv(source = 1)                          #
#        #     print(l_datarecv)                                             #  Change the order to be 2, then 1, and it works
#        #     l_return.append(l_datarecv)                                   #  frequently, not always
#        #        #
#        #     l_datarecv = l_comm.recv(source = 2)
#        #     print(l_datarecv)
#        #     l_return.append(l_datarecv)
#          
#        if l_nodenumber == 0:                                               #  This block works semi-frequently , because of the 
#           l_datarecv = l_comm.recv(source = 2)                             #  order; 2 then 1
#           print("RECV")
#           l_return.append(l_datarecv)                                      #     **  Do not use the approach in this cell
#              #
#           l_datarecv = l_comm.recv(source = 1)
#           print("RECV")
#           l_return.append(l_datarecv)
#      
#      
#        return l_return
#      
#      
#              #####################################################
#          
#  
#     l_result    = f_dowork()
#        #
#     f_datasend(l_result)
#        #
#     l_returnall = f_datarecv(l_result)
#  
#  
#     return single_host(host=0, result=l_returnall)
#  
#  
#  my_return = my_graph.run(lambda g: my_func())
#     #
#  print("")
#  display(my_return)


#  Error,
#
#      CancelledError: 

#  Output, when it works,
#
#     Host 0 output:
#     RECV
#     RECV
#     
#     Host 1 output:
#     SENT
#     
#     Host 2 output:
#     SENT
#     
#     [array([13, 77, 17, 87]), array([ 3, 86, 14, 37]), array([70,  4, 17, 93])]


In [None]:

#  Our use case; Working on all nodes, sending results to a single node,
#  so we may return consolidated results to the calling function.
#
#  Approach 1
#
#  MPI labels this "Collective Communication".
#
#     .  MPI expects you have (n) nodes, and want to send all data back
#        to that one node or similar. As such, [ you run one MPI method ] 
#        that itself will change behavior depending on whether it runs on the "one" node,
#        or other nodes.
#
#        We don't need to program that part. MPI does that for us.
#
#     .  For our use case, outlined above, we will use gather().
#        MPI also includes scatter(), broadcast(), and more.
#

def my_func():
   """
   Do work on every node, then gather all results onto node 0.

   Each node produces four random integers (1-100). gather() is called with
   root = 0, and its return value is wrapped with single_host() so that only
   host 0 contributes a result to the caller.

   Returns:
      The value produced by single_host(host=0, result=<gather() output>).
      Per the sample output for this cell, that is a list holding one array
      per node.
   """

   from katana.distributed import single_host
   from katana.distributed import MPI
      #
   import numpy as np


   l_world  = MPI.COMM_WORLD
   l_rank   = l_world.Get_rank()                                          #  Not used below; kept for parity with the other examples
   l_size   = l_world.Get_size()                                          #  Not used below


   #####################################################

   #  This function should not need to change
   #
   def f_collect(i_payload):
      """Gather every node's payload onto node 0."""
      return l_world.gather(i_payload, root = 0)

   #####################################################

   def f_produce():
      """
      Do whatever here to generate what you want to return to the calling
      function/client.
      """
      return np.random.randint(low=1, high=101, size=4)

   #####################################################

   return single_host(host=0, result=f_collect(f_produce()))


#  Submit the UDF through the graph; the lambda ignores the graph (g) it is given.
#  my_return holds the gathered results that node-0 handed back.
#
my_return = my_graph.run(lambda g: my_func())
   #
print("")
display(my_return)


#  Sample output,
#
#     [array([63, 38, 22,  1]), array([23, 85, 24,  3]), array([91, 82, 14, 10])]


In [None]:

#  Same as above; approach 2
#
#  Add,
#     .  Move to an end result of Pandas DataFrames
#        Why ?
#        So that our return columns are easily labeled, and the support
#        methods we inherit.
#
#        We do not shuttle DataFrames from the worker nodes using MPI.
#        What we would receive on node-0 would be an array of DataFrames.
#        
#        Instead we shuttle arrays of dictionaries, and return that. This
#        return array does need its geometry changed, based on how MPI
#        assembles things, but ..
#
#     .  Here we get our (eventually DataFrame) data from KAPI.
#

import numpy as np


def my_func(i_graph, i_label, i_properties):
   """
   Collect selected node properties on every node and gather them onto node 0.

   For each node yielded by i_graph.nodes.masters() whose labels contain
   i_label, one dictionary is built holding the requested properties plus a
   "src_node" entry carrying this worker's MPI rank.

   Args:
      i_graph:       Graph handle passed in by my_graph.run().
      i_label:       Node label to keep, e.g. "Airport".
      i_properties:  Names of the node properties to read for each kept node.

   Returns:
      The value produced by single_host(host=0, result=<gather() output>);
      the caller treats it as a list of lists of dictionaries.
   """

   from katana.distributed import single_host
   from katana.distributed import MPI
      #
   import numpy as np                                                               #  Not used below; kept for parity with the other examples


   l_world  = MPI.COMM_WORLD
   l_rank   = l_world.Get_rank()
   l_size   = l_world.Get_size()                                                    #  Not used below

   #####################################################

   def f_collect(i_graph, i_label, i_properties):
      """
      Do whatever here to generate what you want to return to the calling
      function/client.
      """

      l_columns = {}                                                                #  Property name --> that property's column
      for l_name in i_properties:
         l_columns[l_name] = i_graph.nodes.get_property(l_name)

      #  One dictionary per matching node. Spelling out each key by hand
      #  ( i_properties[0]: ..., i_properties[1]: ..., ) also works, but gets
      #  wordy as the number of columns grows; hence the comprehension.
      #
      #  "src_node" goes in last; since we have it, return also the source node.
      #
      return [
         {**{l_name: l_columns[l_name][l_node] for l_name in i_properties}, "src_node": l_rank}
         for l_node in i_graph.nodes.masters()
         if (i_label in i_graph.nodes.labels(l_node))
      ]

   #####################################################

   #  This function should not need to change
   #
   def f_gather(i_datasend):
      """Gather every node's records onto node 0."""
      return l_world.gather(i_datasend, root = 0)

   #####################################################

   l_local  = f_collect(i_graph, i_label, i_properties)                             #  This function does the actual work on the nodes.
      #
   l_all    = f_gather(l_local)                                                     #  This function gathers all results to a single node.
      #
   return single_host(host=0, result=l_all)                                         #  Return to the calling function in Jupyter, other.


print("--")



In [None]:

#  Actually using the cell above,
#


my_collist    = ["id", "airport_name"]                                        #  Properties to return from the graph. These should
                                                                              #  all exist in the graph "label" below.


my_datalist = my_graph.run(lambda g: my_func(g, "Airport", my_collist))       #  "Airport" is a node label in our graph.



#  MPI likes to return lists. And actually MPI gather() returns
#  a list of lists; one list for each node, each containing a list
#  of records found.
#
#  And we want DataFrames.
#
my_datalistflat = [                                                           #  Flatten the list of lists into one list of
   l_record                                                                   #  dictionaries (records)
   for l_hostrecords in my_datalist
   for l_record in l_hostrecords
]


#  We actually return one additional column; add that to the column list.
#
#  Build a new list rather than calling my_collist.insert(); insert() changes
#  the list in place and returns None, which would leave my_collist2 empty.
#
my_collist2     = ["src_node"] + my_collist


#  columns = (not index =) selects and orders the DataFrame columns; the row
#  index stays the default 0..n-1.
#
my_dataframe = pd.DataFrame.from_records(my_datalistflat, columns = my_collist2)
   #
print(tabulate(my_dataframe, headers='keys', tablefmt='psql'))


#  Sample output,
#
#     +----+------------+------+----------------+
#     |    |   src_node | id   | airport_name   |
#     |----+------------+------+----------------|
#     |  0 |          0 | MKE  | Milwaukee      |
#     |  1 |          0 | ORD  | Chicago O-Hare |
#     |  2 |          1 | SJC  | San Jose       |
#     |  3 |          2 | DEN  | Denver         |
#     +----+------------+------+----------------+



In [45]:

#  Pull the operation logs through the client, assemble one of them into a
#  trace, and print the generated report.
#
from katana.remote import trace


#  NOTE(review): l_logs is not used again in this cell -- confirm whether this
#  call is still needed.
l_logs   = trace.get_all_logs(my_client)


#  NOTE(review): the meaning of the second argument (0) is not visible here;
#  presumably it selects which log to fetch -- confirm.
l_log    = trace.get_log(my_client, 0)           #  <class 'katana.remote.trace.ProgressLog'>
   #
print(type(l_log))


l_trace  = trace.assemble_trace(l_log)           #  Build the trace from that single log
   #
#  print(l_trace)



l_report = trace.generate_report(l_trace)        #  Turn the assembled trace into the report printed below
   #
print(l_report)



{
  "data": [
    {
      "traceID": "1a8c104cd0a613f9360ce3f38a210003",
      "spans": [
        {
          "traceID": "1a8c104cd0a613f9360ce3f38a210003",
          "spanID": "6312e914bd089a95",
          "flags": 1,
          "operationName": "katana operation root",
          "references": [
            {
              "refType": "CHILD_OF",
              "traceID": "1a8c104cd0a613f9360ce3f38a210003",
              "spanID": "b826d287-1636-49e8-97c3-650fa9b821d1"
            }
          ],
          "startTime": 1671050318396640,
          "duration": 1666292,
          "tags": [],
          "logs": [
            {
              "timestamp": 1671050318396640,
              "fields": [
                {
                  "key": "message",
                  "type": "string",
                  "value": "katana operation root"
                },
                {
                  "key": "max_mem_gb",
                  "type": "float64",
                  "value": 0.473
               