#  Display options:

In [1]:

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate

print("--")


--


In [None]:

#  When using UDFs, these execute on another host.
#
#  As such, these methods will need to be copied and run locally also.
#
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)


#  Setup stuff: Connectivity

In [2]:

from katana import remote

my_client = remote.Client()

print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f4f4c314520>


In [3]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


--


In [4]:

#  CONNECT TO GRAPH

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


<_Graph my_graph, H2oBBmL3EBBp33jGQ2h63qEct3NXfGc2UNcZNZPYfVsH, 70>


In [None]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


In [None]:
#  Even though this graph was made with 3 partitions, and even though certain operations
#  require at least 3 partitions else they fail, you may still see num_partitions = None
#  which can happen on really small graphs, and produce unexpected results below.

print(my_graph.num_partitions)


In [None]:

#  my_graph.repartition(num_partitions = 1)
my_graph.repartition(num_partitions = 3)

print("--")


In [None]:

def my_func(i_graph):
    
   help(i_graph.nodes.labels)
      #
   return

l_result = my_graph.run(lambda g: my_func(g))


In [None]:

#  Our code will be getting larger; demonstrate using nested functions with Python

def my_func_HHH(i_graph, i_properties):

   def bbb():
      print("HHH")
    
   print("GGG")
   bbb()
        
   return


l_result = my_graph.run(lambda g: my_func_HHH(g, ["id", "fname", "lname"]))
   #
print(l_result)


#  Sample output,
#
#     Host 0 output:
#     GGG
#     HHH
#     
#     Host 1 output:
#     GGG
#     HHH
#     
#     Host 2 output:
#     GGG
#     HHH
#     None


In [None]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


In [None]:

# my_graph2 = my_graph.project(node_types=["Airport"], edge_types=[None])
my_graph2 = my_graph.project(node_types=["Airport"], edge_types=["FLIES_TO"])

print("--")


In [None]:

display(my_graph2.num_nodes())
display(my_graph2.num_edges())

#  OperationError: 9AgHAAGCdiiUykiKth3cm2rJW4ruWxpLTaf5ofuQkVQx-2dhBp6fa5UWarBkC7 
#  Projected views are not supported for query operation yet. 
#  (QueryOperation.cpp:30): not implemented: not implemented


In [None]:
#  View the graph; verify results

l_result = my_graph2.query("""

   MATCH (n)
   RETURN n
   
   """, contextualize=True)

l_result.view()

#  OperationError: Projected views are not supported for query operation yet. 
#  (QueryOperation.cpp:28): not implemented: not implemented


In [None]:

#  Working with a property, after a projection

def my_func(i_graph, i_property):
    
   for l_node in i_graph.nodes.get_property(i_property):
      print(l_node)
            
   return

l_result = my_graph2.run(lambda g: my_func(g, "id"))

#  Projection not used
#
#     Host 0 output:
#     DEN
#     ORD
#     SJC
#     
#     Host 1 output:
#     MKE
#     ORD
#     PAP
#     
#     Host 2 output:
#     ORD
#     SJC
#     DEN
#     MKE
#     PAP


In [None]:

#  Put the projection inside the UDF


def my_func(i_graph, i_property):
    
   l_graph2 = i_graph.project(node_types=["Airport"], edge_types=["FLIES_TO"])

   for l_node in l_graph2.nodes.get_property(i_property):
      print(l_node)
            
   return
    
    
l_result = my_graph.run(lambda g: my_func(g, "id"))

#  Projection not used
#
#  (Same output as above)


In [None]:

#  Working with edge data; approach 1
#
#     .  Just get something working
#

def my_func(i_graph, i_label):
    
   l_node_props = i_graph.nodes.    get_property("id"      )
   l_edge_props = i_graph.out_edges.get_property("DISTANCE")
    
   for l_node in i_graph.nodes.masters():
      if (i_label in i_graph.nodes.labels(l_node)):
         for l_edge in i_graph.out_edges(l_node):
          
            l_node_id       = l_node_props[l_node]
               #
            l_edge_src      = i_graph.out_edges.src(l_edge)
            l_edge_dst      = i_graph.out_edges.dst(l_edge)
               #
            l_edge_distance = l_edge_props[l_edge]
               #
            print("Given Airport: %-3s   Edge Src: %-3s / %-18s   Edge Dst: %-3s / %-18s   Distance: %-8s" % (
               l_node_id, l_node_props[l_edge_src], l_edge_src, l_node_props[l_edge_dst], l_edge_dst, l_edge_distance))
    
   return

l_result = my_graph.run(lambda g: my_func(g, "Airport"))

#  Sample output,     (fyi:  results may look different, as partitioning may be different between loads)
#
#     Host 0 output:
#     Given Airport: MKE   Edge Src: MKE / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 66      
#     
#     Host 1 output:
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: MKE / <lnid 4294967293>    Distance: 66      
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: DEN / <lnid 4294967294>    Distance: 886     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: ORD / <lnid 0>             Distance: 1829    
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: DEN / <lnid 4294967294>    Distance: 948     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: PAP / <lnid 4294967292>    Distance: None    
#     
#     Host 2 output:
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: SJC / <lnid 4294967293>    Distance: 948     
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 886     

#  Above,
#
#     .  Minimally, we show the out edge from SJC Airport to a non-Airport node.
#        This is because we did filter nodes on the Airport label, but looked at [ all ] out edges.
#
#     .  We add additional node and edge properties; move to managing that with less code.


In [None]:

#  Working with edge data; approach 2
#
#     .  Cleaner handling of properties
#     .  Filter out Restaurants
#

def my_func(i_graph, i_label, i_type, i_weight):
    
    
   l_node_props = {each: i_graph.nodes    .get_property(each) for each in [ "id"             ]}
   l_edge_props = {each: i_graph.out_edges.get_property(each) for each in [ "TYPE", i_weight ]}
    
   for l_node in i_graph.nodes.masters():
      if (i_label in i_graph.nodes.labels(l_node)):
         for l_edge in i_graph.out_edges(l_node):
            if(l_edge_props["TYPE"][l_edge] == i_type):
          
               l_node_id       = l_node_props["id"][l_node]
                  #
               l_edge_src      = i_graph.out_edges.src(l_edge)
               l_edge_dst      = i_graph.out_edges.dst(l_edge)
                  #
               l_edge_distance = l_edge_props[i_weight][l_edge]
                  #
               print("Given Airport: %-3s   Edge Src: %-3s / %-18s   Edge Dst: %-3s / %-18s   Distance: %-8s" % (
                  l_node_id, l_node_props["id"][l_edge_src], l_edge_src, l_node_props["id"][l_edge_dst], l_edge_dst, l_edge_distance))
                  
   return

l_result = my_graph.run(lambda g: my_func(g, "Airport", "FLIES_TO", "DISTANCE"))

#  Sample output,
#
#     Host 0 output:
#     Given Airport: MKE   Edge Src: MKE / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 66      
#     
#     Host 1 output:
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: MKE / <lnid 4294967293>    Distance: 66      
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: DEN / <lnid 4294967294>    Distance: 886     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: ORD / <lnid 0>             Distance: 1829    
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: DEN / <lnid 4294967294>    Distance: 948     
#     
#     Host 2 output:
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: SJC / <lnid 4294967293>    Distance: 948     
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 886     


#  UDFs, Part 01: Just MPI

In [None]:

#  Best practice; Have a second NoteBook open with the Operations Widget ready-
#
#  Why ?
#     When you make mistakes below, you will want to kill the Python job that is running
#     on the worker nodes, the UDFs you submit below-
#
#  Source,
#     https://mpi4py.readthedocs.io/en/stable/tutorial.html


In [None]:

#  Example for testing; generates a random dataset
#

def my_func():
    
   #  from katana_enterprise.distributed import single_host
   from katana.distributed import single_host
      #
   import numpy as np


   l_return = np.random.randint(1, 101, 4)                                #  Generate an array[4] of random numbers in the range of 1-100
      #                                                                   #     <class 'numpy.ndarray'>
   print(l_return)
    
    
   #  return l_return
   #
   #     ValueError: Hosts returned more than one value. Please have only one host return a value while other's return a None.
   #     The returned results (index is the host # in list):
   #     [array([71, 25, 37, 40]), array([72, 19, 18, 63]), array([34, 39, 44, 59])]


   return single_host(host=0, result=l_return)


my_return = my_graph.run(lambda g: my_func())
   #
print("")
display("My return: %s" % (str(my_return)))


#  Sample output,
#
#     Host 0 output:
#     [80 68 81 23]
#     
#     Host 1 output:
#     [61 17 47 34]
#     
#     Host 2 output:
#     [11 93 11 57]
#     
#     'My return: [80 68 81 23]'


In [None]:

#  Knowing which worker node your code is running on; this approach is preferred
#

def my_func():
    
   from katana.distributed import MPI

   l_comm         = MPI.COMM_WORLD
   l_nodenumber   = (l_comm.Get_rank())
   l_nodecount    = (l_comm.Get_size())
      #
   print("This node number: %d   Count of all Nodes: %d" % (l_nodenumber, l_nodecount))

   return


l_result = my_graph.run(lambda g: my_func())
   #
print("--")

#  Sample output,
#
#     Host 0 output:
#     This node number: 0   Count of all Nodes: 3
#     
#     Host 1 output:
#     This node number: 1   Count of all Nodes: 3
#     
#     Host 2 output:
#     This node number: 2   Count of all Nodes: 3


In [5]:

#  Example using inter worker node data passing; approach 1
#
#     This approach only works single node to single node. 
#     If you use this approach, for example, (n) nodes sending to node-0, the function
#     will hang.
#
#     Totally expected; this technique is listed as point-to-point in the documentation.
#

def my_func():
    
   from katana.distributed import single_host
   from katana.distributed import MPI
      #
   import numpy as np


   l_comm         = MPI.COMM_WORLD
   l_nodenumber   = (l_comm.Get_rank())
   l_nodecount    = (l_comm.Get_size())
      #
   l_return       = None

    
   if (l_nodenumber == 0):
      l_datarecv = l_comm.recv(source = 1)
      print(l_datarecv)
      l_return = l_datarecv
   elif (l_nodenumber == 1):
      l_datasend = np.random.randint(1, 101, 4)
      l_comm.send(l_datasend, dest = 0)
    
   return single_host(host=0, result=l_return)


my_return = my_graph.run(lambda g: my_func())
   #
print("")
display(my_return)


#  Sample output,
#
#     Host 0 output:
#     [57  2 16 73]
#     
#     array([57,  2, 16, 73])


          0/? [?op/s]


Host 0 output:
[86 92 29 24]



array([86, 92, 29, 24])

In [10]:

#  Example using inter worker node data passing; approach 2
#
#  Change,
#
#     .  Do work on all nodes
#     .  We'll use (broadcast), since mid-processing, we might want to send
#        intermediate results to all nodes.
#     .  Concatenate all results and send those back to the calling function
#

def my_func():
    
   from katana.distributed import single_host
   from katana.distributed import MPI
      #
   import numpy as np


   l_comm         = MPI.COMM_WORLD
   l_nodenumber   = (l_comm.Get_rank())
   l_nodecount    = (l_comm.Get_size())
      #
   l_return       = None


   def f_dowork():

      #  Do whatever here to generate what you want to return to the calling
      #  function/client
      #
      l_result = np.random.randint(1, 101, 4)
         #
      print(l_result)
      return l_result
    
    
   def f_datasend(i_datasend):
      l_comm.bcast(i_datasend, root = 0)
      print("SENT")

    
   def f_datarecv(i_datarecv):
      l_return = []
         #
      if (l_nodenumber == 0):
         l_return.append(i_datarecv)
            #
         l_datarecv = l_comm.gather(data, root = 0)
         print("RECV")
            #
      return l_return
    
    

   l_result = f_dowork()
      #
   f_datasend(l_result)
      #
   l_returnall = f_datarecv(l_result)


   return single_host(host=0, result=l_returnall)



my_return = my_graph.run(lambda g: my_func())
   #
print("")
display(my_return)





          0/? [?op/s]


Host 0 output:
[ 3 62 68 68]
SENT

Host 1 output:
[15 79 35 21]
SENT

Host 2 output:
[77 11 51 83]
SENT



Host 0 errors:
Traceback (most recent call last):
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/worker/worker.py", line 86, in execute
    value = function(graph)
  File "/tmp/ipykernel_7027/2093482287.py", line 66, in <lambda>
  File "/tmp/ipykernel_7027/2093482287.py", line 59, in my_func
  File "/tmp/ipykernel_7027/2093482287.py", line 48, in f_datarecv
NameError: name 'data' is not defined


NameError: name 'data' is not defined

In [None]:



from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

sendbuf = np.zeros(100, dtype='i') + rank
recvbuf = None
if rank == 0:
    recvbuf = np.empty([size, 100], dtype='i')
comm.Gather(sendbuf, recvbuf, root=0)
if rank == 0:
    for i in range(size):
        assert np.allclose(recvbuf[i,:], i)
        
        
        

In [None]:

def my_func(i_graph, i_properties):
    
   import numpy as np
   from katana_enterprise.distributed import single_host


   l_return = np.random.randint(1, 101, 4)
   print(l_return)


   #  return single_host(host=0, result=l_return)


my_return = my_graph.run(lambda g: my_func(g, ["xxx", "yyy"]))
   #
display("--")
display(my_return)


In [None]:

l_nodes = []
print(type(l_nodes))


In [None]:

#  Another example; building a DataFrame which gets returned
#

def my_func(i_graph, i_properties):
    
    
   from katana_enterprise.distributed import single_host
      #
   import numpy  as np
   import pandas as pd
    

   ##################################################################

   #  Nested function, load a Python DataFrame from the KG graph
    
   def l_load_df(i_graph, i_properties):
      l_nodes = []
    
      for l_property in i_properties:
         l_node = i_graph.nodes.get_property(l_property).to_numpy().reshape(-1, 1)
         l_nodes.append(l_node)
       
      l_array     = np.hstack(l_nodes)
      l_dataframe = pd.DataFrame(l_array, columns=i_properties)
         #
    
      return l_dataframe
    
   ##################################################################

   #  Nested function, add a new, derived column to the DataFrame
    
   def l_enrich_df(i_dataframe, i_source_property, i_new_property):
    
      def l_to_upper(i_str):
         if i_str.get(i_source_property) is not None:
            return str(i_str[i_source_property]).upper()
         else:
            return
    
      i_dataframe[i_new_property] = i_dataframe.apply(l_to_upper, axis=1)
    
      return

   ##################################################################
    
    
   l_dataframe = l_load_df(i_graph, i_properties)
      #
   l_enrich_df(l_dataframe, "airport_name", "airport_name_upper")

        
   return single_host(host=0, result=l_dataframe)
   #  return l_dataframe


l_result = my_graph.run(lambda g: my_func(g, ["id", "airport_name", "restaurant_name"]))
   #
print(l_result)


#  Sample output,
#
#         id    airport_name              restaurant_name airport_name_upper
#     0  SJC        San Jose                         None           SAN JOSE
#     1  DEN          Denver                         None             DENVER
#     2  ORD  Chicago O-Hare                         None     CHICAGO O-HARE
#     3  PAP            None  Pappadeauxs Seafood Kitchen               None
