#  Display options:

In [None]:

#  References for this Notebook,
#
#     Lines 135-139
#     https://github.com/KatanaGraph/solutions/blob/main/fsi/src/katana_ai.py
#     Lines 31-37
#     https://github.com/KatanaGraph/test-datasets/blob/450232fe5739f327ed0795030a4b9d05731586ee/rdg_datasets/two_self_loops_ai/add_features.py
#
#     https://docs.k9h.dev/latest/recipes/udf-recipes.html?highlight=nodes%20get_property
#
#     https://github.com/KatanaGraph/katana-enterprise/blob/master/python/test/integration/remote/remote_operations_test.py
#
#     https://github.com/KatanaGraph/katana-enterprise/tree/master/lonestar/analytics/distributed/experimental



In [None]:

#  Setting display options 

import pandas as pd
   #
#  "display.width" is the pandas display width, in characters.
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
#  NOTE(review): the HTML payload below is an empty string, so as written this
#  call renders nothing; presumably the original markup was lost -- confirm.
from IPython.display import display, HTML
display(HTML(""))

#  tabulate is used further down to print query results as text tables.
from tabulate import tabulate

#  Prints a marker line once the statements above have run.
print("--")


In [None]:

#  When using UDFs, these execute on another host.
#
#  As such, these methods will need to be copied and run locally also.
#
#  Lift both pandas display limits by setting them to None:
#  the maximum number of columns shown and the maximum column width.
for l_option in ("display.max_columns", "max_colwidth"):
   pd.set_option(l_option, None)


#  Setup stuff: Connectivity

In [None]:

from katana import remote
#  from katana.remote import import_data

#  Create the client with no arguments, i.e. whatever connection defaults
#  the environment provides.
my_client = remote.Client()

#  Print the client object as a quick check that it was created.
print(my_client)


In [None]:

#  NOTE(review): NUM_PARTITIONS is not referenced anywhere in the visible part
#  of this notebook -- presumably used when the graph is created; confirm.
NUM_PARTITIONS  = 3
   #
#  Database and graph names used by the "CONNECT TO GRAPH" cell below.
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:

#  CONNECT TO GRAPH

#  find_graphs_by_name() may return more than one graph; we keep the first.
#  When nothing matches we raise a ValueError that names the graph and the
#  database, instead of the opaque "not enough values to unpack" message the
#  bare tuple-unpacking would produce.
l_graphs = list(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME))

if not l_graphs:
   raise ValueError(
      "No graph named {!r} found in database {!r}".format(GRAPH_NAME, DB_NAME)
   )

my_graph = l_graphs[0]

print(my_graph)


In [None]:

#  Show the node count, then the edge count, of the connected graph.
display(my_graph.num_nodes())
display(my_graph.num_edges())


# UDFs, Part 01: Mutating Nodes 

In [None]:

#  We will use this counter to generate a unique column name below-
#
#  Later cells increment this before building a name, so the first generated
#  suffix is 0021.  Re-running this cell resets the counter, which makes the
#  later cells reuse column names they already generated.
l_cntr = 20

print("--")


In [None]:

#  Add a new column to each node, one that is derived from columns local to each node; approach 1
#
#     .  This will fail if the named new column already exists in the graph.
#     .  Just get this working, then make it more functional.
#

def my_func(i_graph, i_label, i_properties, i_new_colname):
   """Add a node property holding the upper-cased text of an existing one.

   i_graph        graph object passed in by my_graph.run()
   i_label        part of the signature, but not used by this approach
   i_properties   list of property names; only the first entry is read
   i_new_colname  name of the property to add

   Returns None.  The graph is modified through nodes.add_property().
   """
   #  Imported inside the function body -- presumably because the UDF runs on
   #  another host and must bring its own imports; confirm.
   from katana.distributed import KeyedColumnNode
   import numpy as np

   l_source = i_graph.nodes.get_property(i_properties[0])      #  The existing column
   l_keys   = l_source.keys()

   #  Object array sized like the source column; each slot is filled below.
   l_values = np.zeros(len(l_source), dtype="object")

   l_upper_values = (str(l_source[l_key]).upper() for l_key in l_keys)
   for l_slot, l_upper in enumerate(l_upper_values):
      l_values[l_slot] = l_upper

   #  The source column is handed over together with the new values.
   #  NOTE(review): presumably this is how values are matched to nodes -- confirm.
   i_graph.nodes.add_property(KeyedColumnNode(l_values, l_source, i_new_colname))

#  Bump the counter first, then build the zero-padded column name from it.
l_cntr += 1
l_newcol = "newcol_{:04d}".format(l_cntr)

#  Run the UDF on the graph; the lambda receives the graph object as g.
l_result = my_graph.run(
   lambda g: my_func(g, "Airport", ["airport_name"], l_newcol)
)

print("--")


In [None]:

#  Add a new column to each node, one that is derived from columns local to each node; approach 2
#
#     .  We now fetch the properties with a dict comprehension. Why?  When dealing with multiple
#        properties, this is handy.
#

def my_func(i_graph, i_label, i_properties, i_new_colname):
   """Add a node property with the upper-cased text of an existing property.

   Every name in i_properties is fetched up front into a dict keyed by
   property name; only the first one is used to derive the new values.

   i_graph        graph object passed in by my_graph.run()
   i_label        part of the signature, but not used by this approach
   i_properties   list of property names to fetch
   i_new_colname  name of the property to add

   Returns None.  The graph is modified through nodes.add_property().
   """
   from katana.distributed import KeyedColumnNode
   import numpy as np

   l_columns = {l_name: i_graph.nodes.get_property(l_name) for l_name in i_properties}
   l_primary = l_columns[i_properties[0]]                      #  The column we derive from
   l_keys    = l_primary.keys()

   #  Object array sized like the source column; each slot is filled below.
   l_values = np.zeros(len(l_primary), dtype="object")

   l_upper_values = (str(l_primary[l_key]).upper() for l_key in l_keys)
   for l_slot, l_upper in enumerate(l_upper_values):
      l_values[l_slot] = l_upper

   i_graph.nodes.add_property(KeyedColumnNode(l_values, l_primary, i_new_colname))

#  Bump the counter first, then build the zero-padded column name from it.
l_cntr += 1
l_newcol = "newcol_{:04d}".format(l_cntr)

#  Run the UDF on the graph; the lambda receives the graph object as g.
l_result = my_graph.run(
   lambda g: my_func(g, "Airport", ["airport_name"], l_newcol)
)

print("--")


In [None]:

#  View the graph; verify results
#
#  From this result alone, it is not easy to be certain whether the new column was added to Restaurant
#

def f_printtable1(i_colname):
   """Print restaurant_name, airport_name and one named property for all nodes.

   i_colname  name of the node property returned as the third column

   Queries the module-level my_graph and prints the result as a psql-style
   table.  Returns None.
   """
   l_rows = my_graph.query("""
      MATCH (n)
      RETURN n.restaurant_name, n.airport_name, n.{0}
      """.format(i_colname))

   l_table = tabulate(l_rows, headers="keys", tablefmt="psql")
   print(l_table)


f_printtable1(l_newcol)

#  Sample output,
#
#     +----+-----------------------------+------------------+-----------------+
#     |    | n.restaurant_name           | n.airport_name   | n.newcol_0021   |
#     |----+-----------------------------+------------------+-----------------|
#     |  0 |                             | San Jose         | SAN JOSE        |
#     |  1 |                             | Chicago O-Hare   | CHICAGO O-HARE  |
#     |  2 |                             | Milwaukee        | MILWAUKEE       |
#     |  3 | Pappadeauxs Seafood Kitchen |                  | NONE            |
#     |  4 |                             | Denver           | DENVER          |
#     +----+-----------------------------+------------------+-----------------+


In [None]:

#  View the graph; verify results
#
#  Confirming new column was added to Restaurant
#

def f_printtable2(i_colname):
   """Print every Restaurant node with all of its properties.

   i_colname  only substituted into the Cypher line that is commented out
              with //, so it does not affect the query that runs

   Queries the module-level my_graph and prints the result as a psql-style
   table.  Returns None.
   """
   l_rows = my_graph.query("""
      MATCH (n: Restaurant)
      // RETURN n.restaurant_name, n.{0}
      RETURN n
      """.format(i_colname))

   l_table = tabulate(l_rows, headers="keys", tablefmt="psql")
   print(l_table)


f_printtable2(l_newcol)

#  Sample output,
#
#     +----+-----------------+----------------+--------+-----------------+-----------------------------+----------+
#     |    |   n.internal_id | n.labels       | n.id   | n.newcol_0021   | n.restaurant_name           | n.type   |
#     |----+-----------------+----------------+--------+-----------------+-----------------------------+----------|
#     |  0 | 562949953421313 | ['Restaurant'] | PAP    | NONE            | Pappadeauxs Seafood Kitchen | node     |
#     +----+-----------------+----------------+--------+-----------------+-----------------------------+----------+


In [None]:

#  Add a new column to each node, one that is derived from columns local to each node; approach 3
#
#     .  Here we see that if the new column value is None, then the property is not actually added
#        to the node. 
#

def my_func(i_graph, i_label, i_properties, i_new_colname):
   """Add a node property, but only give it a value on selected nodes.

   A node gets the upper-cased text of the first property when it carries
   i_label and that upper-cased text compares less than "M"; every other
   node gets None.

   i_graph        graph object passed in by my_graph.run()
   i_label        label a node must carry to receive a value
   i_properties   list of property names to fetch; the first one is the source
   i_new_colname  name of the property to add

   Returns None.  The graph is modified through nodes.add_property().
   """
   from katana.distributed import KeyedColumnNode
   import numpy as np

   l_columns = {l_name: i_graph.nodes.get_property(l_name) for l_name in i_properties}
   l_primary = l_columns[i_properties[0]]                      #  The column we derive from
   l_keys    = l_primary.keys()

   #  Object array sized like the source column; each slot is filled below.
   l_values = np.zeros(len(l_primary), dtype="object")

   for l_slot, l_key in enumerate(l_keys):
      l_value = None                                           #  Default: no value for this node
      if i_label in i_graph.nodes.labels(l_key):
         l_upper = str(l_primary[l_key]).upper()
         if l_upper < "M":                                     #  Further testing what we can set to None
            l_value = l_upper
      l_values[l_slot] = l_value

   i_graph.nodes.add_property(KeyedColumnNode(l_values, l_primary, i_new_colname))

#  Bump the counter first, then build the zero-padded column name from it.
l_cntr += 1
l_newcol = "newcol_{:04d}".format(l_cntr)

#  Run the UDF on the graph; the lambda receives the graph object as g.
l_result = my_graph.run(
   lambda g: my_func(g, "Airport", ["airport_name"], l_newcol)
)

print("--")


In [None]:

#  Checking results above
#

f_printtable1(l_newcol)
f_printtable2(l_newcol)


#  The first table above lists the new column.  The query below returns the
#  Airport nodes whose airport_name is greater than "M"; the UDF assigned None
#  to those, so the new property should be absent from these rows.
#  Basically: these nodes did not receive the property.
#
#  The query text has no replacement fields, so it is used as a plain string.
#  (Calling .format() on it did nothing, and would raise as soon as the Cypher
#  gained a literal { } pair.)
#

l_query  = """
   MATCH (n: Airport)
   WHERE n.airport_name > "M"
   RETURN n
   """

l_result = my_graph.query(l_query)

print(tabulate(l_result, headers='keys', tablefmt='psql'))

#  Sample output,
#
#     +----+-----------------------------+------------------+-----------------+
#     |    | n.restaurant_name           | n.airport_name   | n.newcol_0026   |
#     |----+-----------------------------+------------------+-----------------|
#     |  0 |                             | San Jose         |                 |
#     |  1 |                             | Chicago O-Hare   | CHICAGO O-HARE  |
#     |  2 | Pappadeauxs Seafood Kitchen |                  |                 |
#     |  3 |                             | Milwaukee        |                 |
#     |  4 |                             | Denver           | DENVER          |
#     +----+-----------------------------+------------------+-----------------+
#     
#     +----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+
#     |    |   n.internal_id | n.labels       | n.id   | n.newcol_0021   | n.newcol_0022   | n.newcol_0023   | n.restaurant_name           | n.type   |
#     |----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------|
#     |  0 |               2 | ['Restaurant'] | PAP    | NONE            | NONE            | NONE            | Pappadeauxs Seafood Kitchen | node     |
#     +----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+
#     
#     +----+-----------------+-------------+-----------+------------------+--------+-----------------+-----------------+-----------------+-----------------+----------+
#     |    |   n.internal_id | n.labels    | n.LABEL   | n.airport_name   | n.id   | n.newcol_0021   | n.newcol_0022   | n.newcol_0023   | n.newcol_0024   | n.type   |
#     |----+-----------------+-------------+-----------+------------------+--------+-----------------+-----------------+-----------------+-----------------+----------|
#     |  0 |               1 | ['Airport'] | Airport   | San Jose         | SJC    | SAN JOSE        | SAN JOSE        | SAN JOSE        | SAN JOSE        | node     |
#     |  1 | 281474976710656 | ['Airport'] | Airport   | Milwaukee        | MKE    | MILWAUKEE       | MILWAUKEE       | MILWAUKEE       | MILWAUKEE       | node     |
#     +----+-----------------+-------------+-----------+------------------+--------+-----------------+-----------------+-----------------+-----------------+----------+


In [None]:

#  Add a new column to each node, one that is derived from columns local to each node; approach 4  (copy from Approach 2)
#
#     .  Move to upsert
#        Where add_property() fails when the column already exists, upsert_property() has no such restriction.
#

def my_func(i_graph, i_label, i_properties, i_new_colname):
   """Upsert a node property with the upper-cased text of an existing property.

   Builds the same values as approach 2, but writes them with
   nodes.upsert_property() instead of nodes.add_property().

   i_graph        graph object passed in by my_graph.run()
   i_label        part of the signature, but not used by this approach
   i_properties   list of property names to fetch; the first one is the source
   i_new_colname  name of the property to upsert

   Returns None.
   """
   from katana.distributed import KeyedColumnNode
   import numpy as np

   l_columns = {l_name: i_graph.nodes.get_property(l_name) for l_name in i_properties}
   l_primary = l_columns[i_properties[0]]                      #  The column we derive from
   l_keys    = l_primary.keys()

   #  Object array sized like the source column; each slot is filled below.
   l_values = np.zeros(len(l_primary), dtype="object")

   l_upper_values = (str(l_primary[l_key]).upper() for l_key in l_keys)
   for l_slot, l_upper in enumerate(l_upper_values):
      l_values[l_slot] = l_upper

   i_graph.nodes.upsert_property(KeyedColumnNode(l_values, l_primary, i_new_colname))


#  Bump the counter first, then build the zero-padded column name from it.
l_cntr += 1
l_newcol = "newcol_{:04d}".format(l_cntr)

#  Run the UDF on the graph; the lambda receives the graph object as g.
l_result = my_graph.run(
   lambda g: my_func(g, "Airport", ["airport_name"], l_newcol)
)

#  Print both verification tables for the column just written.
f_printtable1(l_newcol)
f_printtable2(l_newcol)

print("--")


In [None]:
# ##################################################################

In [None]:

#  Deleting a property from Nodes; approach 1
#
#     .  From the examples above, we could reasonably expect that we can delete a column by setting
#        its value to None.
#
#     .  But there is also, remove_property()
#        This will work if the column is found on some Nodes. 
#        This will fail if the column is found on no Nodes.
#
#  We'll do both, starting with remove_property()
#

def my_func(i_graph, i_delcol):
   """Remove the node property named i_delcol from the graph.

   i_graph   graph object passed in by my_graph.run()
   i_delcol  name of the node property to remove

   Returns None; the work is done by nodes.remove_property().
   """
   l_nodes = i_graph.nodes
   l_nodes.remove_property(i_delcol)


#  We won't increment l_cntr as we usually do.  The counter-derived name is
#  left commented out below, and the column to remove is hard-coded instead.
#
#  l_newcol =  "newcol_" + str(l_cntr).zfill(4)
l_newcol = "newcol_0022"

#  Run the single-argument remove UDF defined just above in this cell.
l_result = my_graph.run(lambda g: my_func(g, l_newcol))


print("--")


In [None]:

#  Output a raw listing of all columns, for all nodes
#
#  One row per distinct (labels, property keys) combination found on the nodes.
#
#  The query text has no replacement fields, so it is used as a plain string.
#  (An argument-less .format() did nothing, and would raise as soon as the
#  Cypher gained a literal { } pair.)
l_query  = """
   MATCH (n)
   RETURN DISTINCT LABELS(n) AS label, KEYS(n)
   """

l_result = my_graph.query(l_query)

print(tabulate(l_result, headers='keys', tablefmt='psql'))

#  Sample output,
#
#     +----+----------------+------------------------------------------------------------------------------------------------------------+
#     |    | label          | KEYS(n)                                                                                                    |
#     |----+----------------+------------------------------------------------------------------------------------------------------------|
#     |  0 | ['Airport']    | ['LABEL', 'airport_name', 'id', 'newcol_0021', 'newcol_0022', 'newcol_0023', 'newcol_0024']                |
#     |  1 | ['Airport']    | ['LABEL', 'airport_name', 'id', 'newcol_0021', 'newcol_0022', 'newcol_0023', 'newcol_0024', 'newcol_0026'] |
#     |  2 | ['Restaurant'] | ['id', 'restaurant_name', 'newcol_0021', 'newcol_0022', 'newcol_0023']                                     |
#     +----+----------------+------------------------------------------------------------------------------------------------------------+


In [None]:

#  Deleting a property from Nodes; approach 2
#
#  We'll show deleting a property just from one Node type; Airport
#


def my_func(i_graph, i_label, i_delcol):
   """Clear the property i_delcol on nodes that carry i_label.

   The property is rewritten in place with upsert_property(): nodes carrying
   i_label get None, all other nodes keep the value they already had.

   i_graph   graph object passed in by my_graph.run()
   i_label   label of the nodes whose value is set to None
   i_delcol  name of the node property to rewrite

   Returns None.
   """
   from katana.distributed import KeyedColumnNode
   import numpy as np

   l_existing = i_graph.nodes.get_property(i_delcol)
   l_keys     = l_existing.keys()

   #  Object array sized like the existing column; each slot is filled below.
   l_values = np.zeros(len(l_existing), dtype="object")

   for l_slot, l_key in enumerate(l_keys):
      l_has_label = i_label in i_graph.nodes.labels(l_key)
      l_values[l_slot] = None if l_has_label else l_existing[l_key]

   i_graph.nodes.upsert_property(KeyedColumnNode(l_values, l_existing, i_delcol))


#  The target column is hard-coded; the counter-derived name and an alternative
#  column are left here, commented out, for quick switching.
#
#  l_newcol =  "newcol_" + str(l_cntr).zfill(4)
l_newcol = "newcol_0021"
#  l_newcol = "newcol_0022"
   #
#  Run the UDF so that Airport nodes have this column set to None.
l_result = my_graph.run(lambda g: my_func(g, "Airport", l_newcol))

print("--")


# UDFs, Part 02: Mutating Edges 

In [None]:

#  Adding a new property to the edge,
#
#     .  Currently we have a DISTANCE column on most edges.
#           (There is no DISTANCE on the edge to our Restaurant.)
#
#        Copy this value into a new column titled, DISTANCE_MILES
#        Also calculate and copy this value into a new column titled, DISTANCE_KMS
#
#     .  Really we are following the very same pattern used above when adding a
#        property to Nodes.
#


def my_func(i_graph, i_srccol, i_outcol1, i_outcol2):
   """Copy an edge property into two new edge properties: miles and kilometers.

   i_graph    graph object passed in by my_graph.run()
   i_srccol   name of the existing edge property to read
   i_outcol1  name of the property that receives an unchanged copy (miles)
   i_outcol2  name of the property that receives int(value * 1.60934) (kilometers)

   Edges whose source value is None get None in both new properties.
   Returns None.  Both properties are written with out_edges.upsert_property().
   """
   from katana.distributed import KeyedColumnEdge
   import numpy as np

   l_source = i_graph.out_edges.get_property(i_srccol)
   l_keys   = l_source.keys()

   l_miles_col = np.zeros(len(l_source), dtype="object")       #  We use this for Miles
   l_kms_col   = np.zeros(len(l_source), dtype="object")       #  We use this for KMs

   for l_slot, l_key in enumerate(l_keys):
      l_miles = l_source[l_key]
      if l_miles is None:
         l_kms = None
      else:
         l_kms = int(l_miles * 1.60934)                        #  int() drops the fractional part
      l_miles_col[l_slot] = l_miles
      l_kms_col[l_slot]   = l_kms

   #  Miles first, then kilometers; each column is upserted on its own.
   i_graph.out_edges.upsert_property(KeyedColumnEdge(l_miles_col, l_source, i_outcol1))
   i_graph.out_edges.upsert_property(KeyedColumnEdge(l_kms_col, l_source, i_outcol2))

   #  The source column is deliberately kept: leaving it in place allows this
   #  code to be re-run over and over.
   #
   #  Dropping it would not look like,
   #
   #     i_graph.nodes.remove_property(i_srccol)
   #
   #  because DISTANCE is not on every edge record.
   #
   #  It would take a loop very close to the second approach used for the node
   #  column above, setting every value to None.


#  Bump the counter first; both new edge columns share the same 4-digit suffix.
l_cntr += 1
l_suffix = "{:04d}".format(l_cntr)

l_newcol1 = "DIST_MILES_" + l_suffix
l_newcol2 = "DIST_KMS_" + l_suffix

#  Run the UDF on the graph; the lambda receives the graph object as g.
l_result = my_graph.run(
   lambda g: my_func(g, "DISTANCE", l_newcol1, l_newcol2)
)

print("--")


In [None]:

#  Check results
#
#  Still want to add these to the output
#
#     r.start.id 
#     r.start.labels 
#     r.end.id
#     r.end.labels  
#

#  NOTE: the {0}..{3} replacement fields only occur on the Cypher line that is
#  commented out with //, so format() merely fills in that comment.  The RETURN
#  that actually runs yields the start node and end node of every edge.
l_query  = """

   MATCH (n) - [r] -> (m)
   // RETURN TYPE(r) AS my_type, ID(r) AS my_id, r.DISTANCE AS DISTANCE, r.NUM_HOPS AS NUM_HOPS, r.{0} AS {1}, r.{2} AS {3}, ID(startNode(r)) AS start_node_id, ID(endNode(r)) AS end_node_id 
   RETURN startNode(r) AS start_node, endNode(r) AS end_node
   
   """.format(l_newcol1, l_newcol1, l_newcol2, l_newcol2)

#  Run the query against the connected graph.
l_result = my_graph.query(l_query)

#  Print the result as a psql-style table, using the result's keys as headers.
print(tabulate(l_result, headers='keys', tablefmt='psql'))

#  Sample output,
#
#     +----+------------+-----------------+------------+------------+-------------------+-----------------+-----------------+-----------------+
#     |    | my_type    |           my_id |   DISTANCE |   NUM_HOPS |   DIST_MILES_0021 |   DIST_KMS_0021 |   start_node_id |     end_node_id |
#     |----+------------+-----------------+------------+------------+-------------------+-----------------+-----------------+-----------------|
#     |  0 | FLIES_TO   |               1 |         66 |          1 |                66 |             106 |               0 |               1 |
#     |  1 | FLIES_TO   |               0 |         66 |          1 |                66 |             106 |               1 |               0 |
#     |  2 | FLIES_TO   |               2 |        886 |          1 |               886 |            1425 |               1 | 562949953421312 |
#     |  3 | FLIES_TO   | 281474976710656 |       1829 |          1 |              1829 |            2943 | 281474976710656 |               1 |
#     |  4 | FLIES_TO   | 281474976710657 |        948 |          1 |               948 |            1525 | 281474976710656 | 562949953421312 |
#     |  5 | ATTRACTION | 281474976710658 |        nan |        nan |               nan |             nan | 281474976710656 | 281474976710657 |
#     |  6 | FLIES_TO   | 562949953421312 |        948 |          1 |               948 |            1525 | 562949953421312 | 281474976710656 |
#     |  7 | FLIES_TO   | 562949953421313 |        886 |          1 |               886 |            1425 | 562949953421312 |               1 |
#     +----+------------+-----------------+------------+------------+-------------------+-----------------+-----------------+-----------------+


In [None]:

#  Checking just Restaurant
#

#  Return every edge that ends on a Restaurant node, with all of its properties.
#
#  The query text has no replacement fields, so it is used as a plain string.
#  (The earlier .format(l_newcol1, ...) call substituted nothing, yet made this
#  cell fail with a NameError unless the edge-UDF cell had been run first.)
l_query  = """

   MATCH (n) - [r] -> (m: Restaurant)
   RETURN r
   
   """

l_result = my_graph.query(l_query)

print(tabulate(l_result, headers='keys', tablefmt='psql'))

#  Sample output
#
#     +----+-----------------+----------------+-----------------+------------+--------------+-----------------+------------------+----------+
#     |    |        r.end.id | r.end.labels   |   r.internal_id | r.label    | r.DRIVABLE   |      r.start.id | r.start.labels   | r.type   |
#     |----+-----------------+----------------+-----------------+------------+--------------+-----------------+------------------+----------|
#     |  0 | 281474976710657 | ['Restaurant'] | 281474976710658 | ATTRACTION | Yes          | 281474976710656 | ['Airport']      | edge     |
#     +----+-----------------+----------------+-----------------+------------+--------------+-----------------+------------------+----------+
