#  Display options:

In [None]:

#  References for this Notebook,
#
#     Lines 135-139
#     https://github.com/KatanaGraph/solutions/blob/main/fsi/src/katana_ai.py
#     Lines 31-37
#     https://github.com/KatanaGraph/test-datasets/blob/450232fe5739f327ed0795030a4b9d05731586ee/rdg_datasets/two_self_loops_ai/add_features.py
#
#     https://docs.k9h.dev/latest/recipes/udf-recipes.html?highlight=nodes%20get_property
#
#     https://github.com/KatanaGraph/katana-enterprise/blob/master/python/test/integration/remote/remote_operations_test.py
#
#     https://github.com/KatanaGraph/katana-enterprise/tree/master/lonestar/analytics/distributed/experimental



In [None]:

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate


In [None]:

#  When using UDFs, these execute on another host.
#
#  As such, these methods will need to be copied and run locally also.
#
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)


#  Setup stuff: Connectivity

In [None]:

from katana import remote
#  from katana.remote import import_data

my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:

#  CONNECT TO GRAPH

#  Look the graph up by name.  Several graphs may share a name; the first match is used,
#  exactly as before.  An explicit check replaces the bare tuple-unpack so that a missing
#  graph produces a readable error instead of "not enough values to unpack".
l_matching_graphs = list(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME))

if not l_matching_graphs:
   raise ValueError("No graph named %r was found in database %r" % (GRAPH_NAME, DB_NAME))

my_graph = l_matching_graphs[0]

print(my_graph)


In [None]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


# Working with num_partitions()

In [None]:
#  Even though this graph was made with 3 partitions, and even though certain operations
#  require at least 3 partitions else they fail, you may still see num_partitions = None
#  which can happen on really small graphs, and produce unexpected results below.

print(my_graph.num_partitions)


In [None]:

#  Repartition using the NUM_PARTITIONS constant from the configuration cell, so the
#  partition count is defined in exactly one place.
#
#  my_graph.repartition(num_partitions = 1)
my_graph.repartition(num_partitions = NUM_PARTITIONS)

print("--")


# UDFs, Part 01: Hello World, Nodes

In [None]:

#  This function will run on every worker node that serves a partition of the current graph.
#
#  If you rerun this function, you will see that the file access time is updated; same PID.
#

def my_func():
   """Touch a marker file named after the current process and print its path.

   The file lives under /tmp and embeds the PID of the process that runs
   this function, so every process executing it touches its own file.
   Returns None.
   """
   import os
   import pathlib

   l_pid  = os.getpid()
   l_path = "/tmp/01." + str(l_pid) + ".txt"

   #  Print first, then touch; touching an existing file only refreshes its timestamp
   print(l_path)
   pathlib.Path(l_path).touch()


#  run_result() will not display stdout or errors-
#
#     **  run_result() went away
#
l_result = my_graph.run(lambda g: my_func())


#  Sample output, num_partitions actually = 3
#
#     Host 0 output:  /tmp/01.22481.txt
#     Host 1 output:  /tmp/01.22958.txt
#     Host 2 output:  /tmp/01.22458.txt
#
#  Sample output, num_partitions = None
#
#     Host 0 output:
#     /tmp/01.3158.txt


#  See also,
#
#     return distributed.single_host(host=0, result=False)


In [None]:

#  How to know which worker node your code is running on
#
#  You can run an if/else on this value, obviously.
#

#  Preferred
#
def my_func():
   """Print the MPI rank of the process executing this function.

   The rank is read from MPI.COMM_WORLD; the notebook uses it as the
   logical worker-node number.  Returns None.
   """
   from katana.distributed import MPI

   l_rank = MPI.COMM_WORLD.Get_rank()
   print(l_rank)


l_result = my_graph.run(lambda g: my_func())


#  Or/also ..
#
def my_func(i_graph):
   """Print the partition id of the graph handle passed to this function.

   Alternative to reading the MPI rank: the logical node number is taken
   straight from ``i_graph.partition_id``.  The MPI import that used to sit
   here was never used, so it has been dropped.

   Args:
      i_graph: graph handle passed to the function by ``my_graph.run()``.

   Returns:
      None
   """
   l_logicalnodenumber = i_graph.partition_id
      #
   print(l_logicalnodenumber)


l_result = my_graph.run(lambda g: my_func(g))


#  Sample output,
#
#     Host 0 output:
#     0
#     Host 1 output:
#     1
#     Host 2 output:
#     2
#     
#     Host 0 output:
#     0
#     Host 1 output:
#     1
#     Host 2 output:
#     2


In [None]:
# ##################################################################

In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Surprise: the same key is printed by more than one host  (see the sample output below)
#

def my_func(i_graph, i_property):
   """Print every value of one node property, one value per line.

   Iterates over whatever ``i_graph.nodes.get_property(i_property)`` yields,
   with no filtering.  Returns None.
   """
   l_values = i_graph.nodes.get_property(i_property)

   for l_value in l_values:
      print(l_value)

l_result = my_graph.run(lambda g: my_func(g, "id"))

#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     DEN
#     ORD
#     SJC
#     
#     Host 1 output:
#     MKE
#     ORD
#     PAP
#     
#     Host 2 output:
#     ORD
#     SJC
#     DEN
#     MKE
#     PAP


In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Masters only
#

def my_func(i_graph, i_property):
   """Print one node property for the nodes returned by ``nodes.masters()``.

   The property is fetched once and then indexed by each master node.
   Returns None.
   """
   l_values = i_graph.nodes.get_property(i_property)

   #  NOTE: i_graph.nodes.mirrors() is not usable here; "Object not found, this was a KAPI 1.0 method"
   for l_master in i_graph.nodes.masters():
      l_value = l_values[l_master]
      print(l_value)

l_result = my_graph.run(lambda g: my_func(g, "id"))


#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     DEN
#     
#     Host 1 output:
#     MKE
#     PAP
#     
#     Host 2 output:
#     ORD
#     SJC


In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Specific labels only  (Airport)
#

def my_func(i_graph, i_property, i_label):
   """Print one node property for the nodes selected by a label.

   Nodes are selected by calling ``i_graph.nodes(i_label)``; no master
   filtering is applied.  Returns None.
   """
   l_values = i_graph.nodes.get_property(i_property)

   for l_labelled in i_graph.nodes(i_label):
      l_value = l_values[l_labelled]
      print(l_value)

l_result = my_graph.run(lambda g: my_func(g, "id", "Airport"))

#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     SJC
#     ORD
#     DEN
#     
#     Host 1 output:
#     ORD
#     MKE
#     
#     Host 2 output:
#     MKE
#     DEN
#     ORD
#     SJC


In [None]:
# ##################################################################

In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Filtering both; Masters and Labels   (This errors;  2022-11-26)
#

def my_func(i_graph, i_property, i_label):
   """Print one node property for ``i_graph.nodes(i_label).masters()``.

   Chains the label selection with ``.masters()``.  The notebook records
   that this raised AttributeError (no 'masters' on the label-filtered
   sequence) as of 2022-11-26; the cell is kept to demonstrate that.
   Returns None.
   """
   l_values = i_graph.nodes.get_property(i_property)

   l_selected = i_graph.nodes(i_label).masters()
   for l_master in l_selected:
      print(l_values[l_master])

l_result = my_graph.run(lambda g: my_func(g, "id", "Airport"))

#  Sample output, num_partitions = 3
#
#  Does not work
#    AttributeError: 'katana_enterprise.distributed_native.EntitySequenc' object has no attribute 'masters'



In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Adding, Masters and Labels; alternate form to the above
#

def my_func(i_graph, i_property, i_label):
   """Print one node property for master nodes that carry a given label.

   Walks ``nodes.masters()`` and keeps a node only when ``i_label`` is
   among ``nodes.labels(node)``.  Returns None.
   """
   l_values = i_graph.nodes.get_property(i_property)

   for l_master in i_graph.nodes.masters():
      #  Guard clause: skip masters that do not have the requested label
      if i_label not in i_graph.nodes.labels(l_master):
         continue

      print(l_values[l_master])

#  l_result = my_graph.run(lambda g: my_func(g, "id", "Restaurant"))
l_result = my_graph.run(lambda g: my_func(g, "id", "Airport"))

#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     DEN
#     
#     Host 1 output:
#     MKE
#     
#     Host 2 output:
#     ORD
#     SJC


In [None]:
# ##################################################################

In [None]:

#  Working with multiple properties; approach 1
#
#  Manually managing a number of keys
#

def my_func(i_graph, i_label, i_properties):
   """Print two node properties for master nodes that carry a given label.

   Approach 1: each property is fetched into its own variable.  Only the
   first two entries of ``i_properties`` are used.  Returns None.
   """
   l_first_column  = i_graph.nodes.get_property(i_properties[0])
   l_second_column = i_graph.nodes.get_property(i_properties[1])

   for l_master in i_graph.nodes.masters():
      if i_label not in i_graph.nodes.labels(l_master):
         continue

      l_line = "Given Airport : %3s, %-32s" % (l_first_column[l_master], l_second_column[l_master])
      print(l_line)

l_result = my_graph.run(lambda g: my_func(g, "Airport", ["id", "airport_name"]))

#  Sample output
#
#     Host 0 output:
#     Given Airport : DEN, Denver                          
#     
#     Host 1 output:
#     Given Airport : MKE, Milwaukee                       
#     
#     Host 2 output:
#     Given Airport : ORD, Chicago O-Hare                  
#     Given Airport : SJC, San Jose                        


In [None]:

#  Working with multiple properties; approach 2
#
#  Cleaner approach to managing multiple keys; this construct returns a dictionary
#

def my_func(i_graph, i_label, i_properties):
   """Print two node properties for master nodes that carry a given label.

   Approach 2: all requested properties are collected into one dictionary
   keyed by property name.  The first two names in ``i_properties`` are
   the ones printed.  Returns None.
   """
   #  Dictionary comprehension: property name -> property column
   l_columns = {l_name: i_graph.nodes.get_property(l_name) for l_name in i_properties}

   for l_master in i_graph.nodes.masters():
      if i_label not in i_graph.nodes.labels(l_master):
         continue

      l_first  = l_columns[i_properties[0]][l_master]
      l_second = l_columns[i_properties[1]][l_master]
      print("Given Airport : %3s, %-32s" % (l_first, l_second))

l_result = my_graph.run(lambda g: my_func(g, "Airport", ["id", "airport_name"]))

#  Sample output
#
#     Host 0 output:
#     Given Airport : DEN, Denver                          
#     
#     Host 1 output:
#     Given Airport : MKE, Milwaukee                       
#     
#     Host 2 output:
#     Given Airport : ORD, Chicago O-Hare                  
#     Given Airport : SJC, San Jose                        


In [None]:
# ##################################################################

In [None]:

#  Just expanding the comprehension topic a bit ..
#

def my_func(i_graph, i_label, i_properties):
   """Print the types, keys and lengths of the two property-holding styles.

   Fetches the first two properties individually and also builds the
   name -> column dictionary, then prints what each one is.  ``i_label``
   is accepted but not used.  The report looks up the key 'id', so
   ``i_properties`` needs to contain it.  Returns None.
   """
   l_single_column = i_graph.nodes.get_property(i_properties[0])
   l_other_column  = i_graph.nodes.get_property(i_properties[1])    #  fetched like approach 1; not reported on
   l_column_map    = {l_name: i_graph.nodes.get_property(l_name) for l_name in i_properties}

   #  What kind of objects are these ?
   print("What is l_node_prop0: " + str(type(l_single_column)) )
   print("What is l_node_props: " + str(type(l_column_map)) )
   print("--")

   #  What is inside the dictionary ?
   print("What are the keys in l_node_props: " + str(l_column_map.keys()) )
   print("What is l_node_props['id']: " + str(type(l_column_map['id'])) )
   print("--")

   #  Both forms hold the same number of entries
   print("What is the len() of l_node_prop0: " + str(len(l_single_column)) )
   print("What is the len() of l_node_props['id']: " + str(len(l_column_map['id'])) )

l_result = my_graph.run(lambda g: my_func(g, "Airport", ["id", "airport_name"]))




# Using help() on server side objects

In [None]:

def my_func(i_graph):
   """Show the built-in help() text for ``i_graph.nodes.labels``.

   Useful for inspecting objects that exist only where the function
   runs.  Returns None.
   """
   l_target = i_graph.nodes.labels
   help(l_target)

l_result = my_graph.run(lambda g: my_func(g))


# Background: Nested Python functions ..

In [None]:

#  Our code will be getting larger; demonstrate using nested functions with Python

def my_func_HHH(i_graph, i_properties):
   """Demonstrate calling a nested function.

   Prints "GGG" from the outer function and then "HHH" from the inner
   one.  Neither argument is used.  Returns None.
   """

   def l_inner():
      #  Nested helper, visible only inside my_func_HHH
      print("HHH")

   print("GGG")
   l_inner()


l_result = my_graph.run(lambda g: my_func_HHH(g, ["id", "fname", "lname"]))
   #
print(l_result)


#  Sample output,
#
#     Host 0 output:
#     GGG
#     HHH
#     
#     Host 1 output:
#     GGG
#     HHH
#     
#     Host 2 output:
#     GGG
#     HHH
#     None


# Using projections

In [None]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


In [None]:

# my_graph2 = my_graph.project(node_types=["Airport"], edge_types=[None])
my_graph2 = my_graph.project(node_types=["Airport"], edge_types=["FLIES_TO"])

print("--")


In [None]:

display(my_graph2.num_nodes())
display(my_graph2.num_edges())

#  OperationError: 9AgHAAGCdiiUykiKth3cm2rJW4ruWxpLTaf5ofuQkVQx-2dhBp6fa5UWarBkC7 
#  Projected views are not supported for query operation yet. 
#  (QueryOperation.cpp:30): not implemented: not implemented


In [None]:
#  View the graph; verify results

l_result = my_graph2.query("""

   MATCH (n)
   RETURN n
   
   """, contextualize=True)

l_result.view()

#  OperationError: Projected views are not supported for query operation yet. 
#  (QueryOperation.cpp:28): not implemented: not implemented


In [None]:

#  Working with a property, after a projection

def my_func(i_graph, i_property):
   """Print every value of one node property, one value per line.

   Used in the notebook against a projected graph; per the notes that
   follow the call, the output was the same as without the projection.
   Returns None.
   """
   l_values = i_graph.nodes.get_property(i_property)

   for l_value in l_values:
      print(l_value)

l_result = my_graph2.run(lambda g: my_func(g, "id"))

#  Projection not used
#
#     Host 0 output:
#     DEN
#     ORD
#     SJC
#     
#     Host 1 output:
#     MKE
#     ORD
#     PAP
#     
#     Host 2 output:
#     ORD
#     SJC
#     DEN
#     MKE
#     PAP


In [None]:

#  Put the projection inside the UDF


def my_func(i_graph, i_property):
   """Project the graph inside the function, then print one node property.

   The projection keeps node type "Airport" and edge type "FLIES_TO";
   the property values are read from the projected graph and printed
   one per line.  Returns None.
   """
   l_projected = i_graph.project(node_types=["Airport"], edge_types=["FLIES_TO"])
   l_values    = l_projected.nodes.get_property(i_property)

   for l_value in l_values:
      print(l_value)
    
    
l_result = my_graph.run(lambda g: my_func(g, "id"))

#  Projection not used
#
#  (Same output as above)


# UDFs, Part 02: Hello World, Edges

In [None]:

#  Working with edge data; approach 1
#
#     .  Just get something working
#

def my_func(i_graph, i_label):
   """Print every out-edge of the master nodes that carry a given label.

   For each edge the line shows the starting node's "id", the edge's
   source and destination (both as "id" and as the raw node handle), and
   the edge's "DISTANCE" property.  Edges are not filtered, so edges to
   nodes without the label are printed as well.  Returns None.
   """
   l_ids       = i_graph.nodes.get_property("id")
   l_distances = i_graph.out_edges.get_property("DISTANCE")

   l_template = "Given Airport: %-3s   Edge Src: %-3s / %-18s   Edge Dst: %-3s / %-18s   Distance: %-8s"

   for l_master in i_graph.nodes.masters():
      if i_label not in i_graph.nodes.labels(l_master):
         continue

      for l_out_edge in i_graph.out_edges(l_master):
         l_given    = l_ids[l_master]
         l_src      = i_graph.out_edges.src(l_out_edge)
         l_dst      = i_graph.out_edges.dst(l_out_edge)
         l_distance = l_distances[l_out_edge]

         print(l_template % (l_given, l_ids[l_src], l_src, l_ids[l_dst], l_dst, l_distance))

l_result = my_graph.run(lambda g: my_func(g, "Airport"))

#  Sample output,     (fyi:  results may look different, as partitioning may be different between loads)
#
#     Host 0 output:
#     Given Airport: MKE   Edge Src: MKE / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 66      
#     
#     Host 1 output:
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: MKE / <lnid 4294967293>    Distance: 66      
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: DEN / <lnid 4294967294>    Distance: 886     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: ORD / <lnid 0>             Distance: 1829    
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: DEN / <lnid 4294967294>    Distance: 948     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: PAP / <lnid 4294967292>    Distance: None    
#     
#     Host 2 output:
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: SJC / <lnid 4294967293>    Distance: 948     
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 886     

#  Above,
#
#     .  Minimally, we show the out edge from SJC Airport to a non-Airport node.
#        This is because we did filter nodes on the Airport label, but looked at [ all ] out edges.
#
#     .  We add additional node and edge properties; move to managing that with less code.


In [None]:

#  Working with edge data; approach 2
#
#     .  Cleaner handling of properties
#     .  Filter out Restaurants
#

def my_func(i_graph, i_label, i_type, i_weight):
   """Print the out-edges of one type for master nodes that carry a label.

   Same report as approach 1, with two differences: properties are held
   in name -> column dictionaries, and an edge is printed only when its
   "TYPE" property equals ``i_type``.  The value shown as the distance is
   the edge property named by ``i_weight``.  Returns None.
   """
   l_node_columns = {l_name: i_graph.nodes    .get_property(l_name) for l_name in [ "id"             ]}
   l_edge_columns = {l_name: i_graph.out_edges.get_property(l_name) for l_name in [ "TYPE", i_weight ]}

   l_ids      = l_node_columns["id"]
   l_types    = l_edge_columns["TYPE"]
   l_weights  = l_edge_columns[i_weight]
   l_template = "Given Airport: %-3s   Edge Src: %-3s / %-18s   Edge Dst: %-3s / %-18s   Distance: %-8s"

   for l_master in i_graph.nodes.masters():
      if i_label not in i_graph.nodes.labels(l_master):
         continue

      for l_out_edge in i_graph.out_edges(l_master):
         #  Skip edges of any other type
         if not (l_types[l_out_edge] == i_type):
            continue

         l_given    = l_ids[l_master]
         l_src      = i_graph.out_edges.src(l_out_edge)
         l_dst      = i_graph.out_edges.dst(l_out_edge)
         l_distance = l_weights[l_out_edge]

         print(l_template % (l_given, l_ids[l_src], l_src, l_ids[l_dst], l_dst, l_distance))

l_result = my_graph.run(lambda g: my_func(g, "Airport", "FLIES_TO", "DISTANCE"))

#  Sample output,
#
#     Host 0 output:
#     Given Airport: MKE   Edge Src: MKE / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 66      
#     
#     Host 1 output:
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: MKE / <lnid 4294967293>    Distance: 66      
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: DEN / <lnid 4294967294>    Distance: 886     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: ORD / <lnid 0>             Distance: 1829    
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: DEN / <lnid 4294967294>    Distance: 948     
#     
#     Host 2 output:
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: SJC / <lnid 4294967293>    Distance: 948     
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 886     


#  UDFs, Part 03: Writing to files and Returns

In [None]:

#  Write to a file that is local to the worker node
#
#     (Why: poor person's debugging maybe ?)
#

def my_func(i_graph, i_property, i_label, i_path="/tmp/my_debug.txt"):
   """Print one node property for labelled master nodes and log it to a file.

   Each value is printed and also written as a "Node: <value>" line to a
   file on the machine where the function runs.  The file is opened in
   "w" mode, so it is overwritten on every call.

   Args:
      i_graph:    graph handle passed to the function by ``my_graph.run()``.
      i_property: name of the node property to report.
      i_label:    only master nodes carrying this label are reported.
      i_path:     file to write; defaults to the original /tmp/my_debug.txt.

   Returns:
      None
   """
   l_node_prop = i_graph.nodes.get_property(i_property)
      #
   with open(i_path, "w", encoding="utf-8") as f:
         #
      for l_node in i_graph.nodes.masters():
         if (i_label in i_graph.nodes.labels(l_node) ):
            l_value = l_node_prop[l_node]
            print(l_value)
            #  %-formatting instead of "+" so that None or non-string values are
            #  written the same way print() shows them rather than raising TypeError
            f.write("Node: %s\n" % (l_value,))

   return

l_result = my_graph.run(lambda g: my_func(g, "id", "Airport"))

print(l_result)

#  Sample output,
#
#     root@amusing-fawn-compute-0:~# cat /tmp/my*
#     Node: MKE
#     
#     root@amusing-fawn-compute-1:~# cat /tmp/my*
#     Node: ORD
#     Node: SJC
#     
#     root@amusing-fawn-compute-2:~# cat /tmp/my*
#     Node: DEN


In [None]:

#  Actually returning data
#

def my_func(i_graph, i_property, i_label):
   """Collect one node property for labelled master nodes and return it.

   Every matching value is printed and appended to a list.  The list is
   handed back through ``single_host(host=0, ...)``.
   """
   from katana_enterprise.distributed import single_host

   l_values  = i_graph.nodes.get_property(i_property)
   l_matches = []

   for l_master in i_graph.nodes.masters():
      if i_label not in i_graph.nodes.labels(l_master):
         continue

      l_value = l_values[l_master]
      print(l_value)
      l_matches.append(l_value)

   #  A plain "return l_matches" from every host was rejected with:
   #
   #     ValueError: Hosts returned more than one value. Please have only one host return a value while other's return a None.
   #     The returned results (index is the host # in list):
   #     [['MKE'], ['ORD', 'SJC'], ['DEN']]
   #
   #  hence the single_host() wrapper.
   return single_host(host=0, result=l_matches)

l_return = my_graph.run(lambda g: my_func(g, "id", "Airport"))
   #
print(l_return)


#  Sample output,
#
#     Host 0 output:
#     MKE
#     
#     Host 1 output:
#     ORD
#     SJC
#     
#     Host 2 output:
#     DEN
#     ['MKE']


In [None]:

#  Return a list from multiple hosts ..
#


In [None]:

#  Nice example for testing; generates a random dataset
#

def my_func(i_graph, i_properties):
   """Generate four random integers, print them, and return them via single_host.

   Neither argument is used; the function only produces test data.
   """
   import numpy as np
   from katana_enterprise.distributed import single_host

   #  Four integers drawn from 1..100 (the upper bound 101 is exclusive)
   l_sample = np.random.randint(low=1, high=101, size=4)
   print(l_sample)

   return single_host(host=0, result=l_sample)


my_return = my_graph.run(lambda g: my_func(g, ["xxx", "yyy"]))
   #
display("--")
display(my_return)


#  Sample output,
#
#     Host 0 output:
#     [ 5 14 50 87]
#     Host 1 output:
#     [15 50 81 79]
#     Host 2 output:
#     [48 36 97 66]
#     
#     '--'
#     
#     array([ 5, 14, 50, 87])


In [None]:

#  Another example; building a DataFrame which gets returned
#

def my_func(i_graph, i_properties):
   """Build a DataFrame from node properties, add a derived column, return it.

   One column is created per name in ``i_properties``; an extra column
   "airport_name_upper" holds the upper-cased "airport_name" (None where
   the source value is None).  The frame is returned through
   ``single_host(host=0, ...)``.
   """
   from katana_enterprise.distributed import single_host
      #
   import numpy  as np
   import pandas as pd

   ##################################################################

   #  Nested function, load a Python DataFrame from the KG graph

   def l_load_df(i_graph, i_properties):
      #  Each property becomes an (n, 1) column so the columns can be stacked side by side
      l_columns = [
         i_graph.nodes.get_property(l_name).to_numpy().reshape(-1, 1)
         for l_name in i_properties
      ]

      return pd.DataFrame(np.hstack(l_columns), columns=i_properties)

   ##################################################################

   #  Nested function, add a new, derived column to the DataFrame  (modifies it in place)

   def l_enrich_df(i_dataframe, i_source_property, i_new_property):

      def l_to_upper(i_str):
         #  i_str is one row of the frame; a missing value stays None
         l_value = i_str.get(i_source_property)
         if l_value is None:
            return None
         return str(l_value).upper()

      i_dataframe[i_new_property] = i_dataframe.apply(l_to_upper, axis=1)

   ##################################################################

   l_dataframe = l_load_df(i_graph, i_properties)
   l_enrich_df(l_dataframe, "airport_name", "airport_name_upper")

   return single_host(host=0, result=l_dataframe)
   #  return l_dataframe
   #  return l_dataframe


l_result = my_graph.run(lambda g: my_func(g, ["id", "airport_name", "restaurant_name"]))
   #
print(l_result)


#  Sample output,
#
#         id    airport_name              restaurant_name airport_name_upper
#     0  SJC        San Jose                         None           SAN JOSE
#     1  DEN          Denver                         None             DENVER
#     2  ORD  Chicago O-Hare                         None     CHICAGO O-HARE
#     3  PAP            None  Pappadeauxs Seafood Kitchen               None


#  UDFs, Part 04: Cypher on Distributed

In [35]:

#  Running a Cypher query locally, on each worker
#
#     .  A library we use frequently, tabulate, is not installed on the worker nodes
#

def my_func(i_graph):
   """Run a match-all Cypher query against ``i_graph`` and print each row.

   Row iteration is best-effort: if it fails, a message is printed
   instead of raising.  The message now includes the error type and
   text, so a real failure is no longer indistinguishable from an empty
   result.

   Args:
      i_graph: graph handle passed to the function by ``my_graph.run()``.

   Returns:
      None
   """
   l_query  = """
      MATCH (n)
      RETURN n
      """

   l_result = i_graph.query(l_query)
      #
   #  NOTE(review): the output recorded with this cell shows the returned ResultSet
   #  has no 'itertuples' attribute; confirm the supported way to iterate its rows.
   try:
      for l_each in l_result.itertuples():
         print(l_each)
   except Exception as l_error:
      #  "except Exception" rather than a bare "except", so KeyboardInterrupt and
      #  SystemExit still propagate
      print("No rows on this node. (%s: %s)" % (type(l_error).__name__, l_error))

   return

#  return single_host(host=0, result=l_result)


l_return = my_graph.run(lambda g: my_func(g))
   #
print(l_return)




          0/? [?op/s]


Host 0 output:
No rows on this node.



Host 1 errors:
Traceback (most recent call last):
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/worker/worker.py", line 86, in execute
    value = function(graph)
  File "/tmp/ipykernel_142/4238914162.py", line 25, in <lambda>
  File "/tmp/ipykernel_142/4238914162.py", line 17, in my_func
AttributeError: 'katana_enterprise.distributed_native.ResultSet' object has no attribute 'itertuples'

Host 2 errors:
Traceback (most recent call last):
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/worker/worker.py", line 86, in execute
    value = function(graph)
  File "/tmp/ipykernel_142/4238914162.py", line 25, in <lambda>
  File "/tmp/ipykernel_142/4238914162.py", line 17, in my_func
AttributeError: 'katana_enterprise.distributed_native.ResultSet' object has no attribute 'itertuples'


AttributeError: 'katana_enterprise.distributed_native.ResultSet' object has no attribute 'itertuples'