#  Display options:

In [None]:

#  References for this Notebook,
#
#     Lines 135-139
#     https://github.com/KatanaGraph/solutions/blob/main/fsi/src/katana_ai.py
#     Lines 31-37
#     https://github.com/KatanaGraph/test-datasets/blob/450232fe5739f327ed0795030a4b9d05731586ee/rdg_datasets/two_self_loops_ai/add_features.py
#
#     https://docs.k9h.dev/latest/recipes/udf-recipes.html?highlight=nodes%20get_property
#
#     https://github.com/KatanaGraph/katana-enterprise/blob/master/python/test/integration/remote/remote_operations_test.py
#
#     https://github.com/KatanaGraph/katana-enterprise/tree/master/lonestar/analytics/distributed/experimental



In [5]:

#  Setting display options 

import pandas as pd

#  Set the pandas display width to 480 characters.
#
pd.set_option("display.width", 480)

#  Intended to enable horizontal scrolling for wide outputs.
#
#  NOTE(review): the HTML payload below is an empty string, so this call
#  renders nothing. The markup that enabled scrolling was presumably lost
#  when the notebook was exported -- restore it or delete the call.
#
from IPython.display import display, HTML
display(HTML(""))

#  tabulate is used further down to print query results as text tables.
#
from tabulate import tabulate


In [None]:

#  UDFs execute on another host, not in this kernel, so display options set
#  here do not apply there. A UDF that prints a DataFrame has to repeat these
#  calls inside its own body; they are also run here for local output.
#
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)


#  Setup stuff: Connectivity

In [6]:

from katana import remote
#  from katana.remote import import_data

#  Client handle used by every later cell. It is constructed with no
#  arguments, so connection settings are not supplied by this notebook
#  (presumably they come from the environment -- TODO confirm).
#
my_client = remote.Client()

#  Printing the object is only a quick check that the client was created.
#
print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f06bb62ebe0>


In [7]:

#  NOTE(review): NUM_PARTITIONS is not referenced in the cells visible here.
#
NUM_PARTITIONS  = 3

#  Names used below to look up the database and the graph.
#
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

#  Marker so the cell shows visible output once it has run.
#
print("--")


--


In [15]:

#  CONNECT TO GRAPH
#
#  Look up the database by name, then keep the first graph returned for
#  GRAPH_NAME; any further matches are discarded by the starred target.
#  If no graph matches, the unpacking raises ValueError.

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


<_Graph my_graph, 2XV5TdCuQpgkTgwvDSbEwxZpqvpYh1chH5z5X3Uw7suh, 4>


In [9]:

#  Sanity check: show the node count and the edge count of the connected graph.
#
display(my_graph.num_nodes())
display(my_graph.num_edges())


          0/? [?op/s]

5

          0/? [?op/s]

8

# UDFs, Part 01: Mutating Nodes 

In [19]:

#  Counter used to build a unique property name ("newcol_NNNN") in the cells
#  below. Each of those cells increments it before use, because adding a
#  property whose name already exists in the graph fails.
#
#  NOTE(review): re-running this cell resets the counter to 20, so later cells
#  may then generate names that already exist in the graph.
#
l_cntr = 20

print("--")


--


In [20]:

#  Add a new column to each node, one that is derived from columns local to each node; approach 1
#
#     .  This will fail if the named new column already exists in the graph.
#     .  Just get this working, then make it more functional.
#

def my_func(i_graph, i_label, i_properties, i_new_colname):
   """
   Add a node property holding the upper-cased text of an existing property.

   Parameters:
      i_graph        graph handle passed in by my_graph.run()
      i_label        accepted but not consulted in this version
      i_properties   list of property names; only the first entry is read
      i_new_colname  name of the property to create

   Returns nothing. Every value goes through str() before upper(), so a value
   of None is stored as the text "NONE".
   """

   from katana.distributed import KeyedColumnNode
   import numpy as np

   #  The existing column we derive from; it also supplies the node keys.
   l_source = i_graph.nodes.get_property(i_properties[0])

   #  Object array with one slot per entry of the source column.
   l_upper  = np.zeros(len(l_source), dtype="object")

   l_slot = 0
   for l_node_key in l_source.keys():
      l_upper[l_slot] = str(l_source[l_node_key]).upper()
      l_slot += 1

   #  Pair the new values with the source column, then hand them to add_property().
   i_graph.nodes.add_property(KeyedColumnNode(l_upper, l_source, i_new_colname))

   return

#  Advance the counter first so this run gets a fresh property name,
#  zero-padded to four digits (for example "newcol_0021" when l_cntr is 21).
#
l_cntr   += 1
l_newcol =  "newcol_" + str(l_cntr).zfill(4)

#  Execute my_func through my_graph.run(). my_func returns nothing, and
#  l_result is not read again in this cell.
#
l_result = my_graph.run(lambda g: my_func(g, "Airport", ["airport_name"], l_newcol))


print("--")


          0/? [?op/s]

--


In [23]:

#  Add a new column to each node, one that is derived from columns local to each node; approach 2
#
#     .  We moved properties to our comprehension technique. Why ?  When dealing with multiple
#        properties, this is handy.
#

def my_func(i_graph, i_label, i_properties, i_new_colname):
   """
   Add a node property holding the upper-cased text of an existing property.

   Same result as approach 1, but every requested property is fetched into a
   dict keyed by property name, which is convenient once several properties
   are needed.

   Parameters:
      i_graph        graph handle passed in by my_graph.run()
      i_label        accepted but not consulted in this version
      i_properties   list of property names; all are fetched, the first one
                     drives the derived value
      i_new_colname  name of the property to create

   Returns nothing. A value of None is stored as the text "NONE".
   """

   from katana.distributed import KeyedColumnNode
   import numpy as np

   #  Fetch each requested column once, keyed by its property name.
   l_columns = {}
   for l_name in i_properties:
      l_columns[l_name] = i_graph.nodes.get_property(l_name)

   #  The first property supplies both the values and the node keys.
   l_primary = l_columns[i_properties[0]]
   l_upper   = np.zeros(len(l_primary), dtype="object")

   for l_slot, l_node_key in enumerate(l_primary.keys()):
      l_upper[l_slot] = str(l_primary[l_node_key]).upper()

   #  Pair the new values with the source column, then hand them to add_property().
   i_graph.nodes.add_property(KeyedColumnNode(l_upper, l_primary, i_new_colname))

   return

#  Advance the counter first so this run gets a fresh property name,
#  zero-padded to four digits.
#
l_cntr   += 1
l_newcol =  "newcol_" + str(l_cntr).zfill(4)

#  Execute the my_func defined in this cell through my_graph.run(). It returns
#  nothing, and l_result is not read again in this cell.
#
l_result = my_graph.run(lambda g: my_func(g, "Airport", ["airport_name"], l_newcol))


print("--")


          0/? [?op/s]

--


In [32]:

#  View the graph; verify results
#
#  From this RETURN list alone it is hard to tell whether the new column was also added to the Restaurant node.
#

def f_printtable1(i_colname):
   """
   Print restaurant_name, airport_name and the property named i_colname for every node.

   i_colname is substituted into the query text as a property of n. The query
   runs against the module-level my_graph and the result is printed as a
   psql-style text table.
   """

   l_template = """
      MATCH (n)
      RETURN n.restaurant_name, n.airport_name, n.{0}
      """

   l_rows = my_graph.query(l_template.format(i_colname))

   print(tabulate(l_rows, headers='keys', tablefmt='psql'))


#  Show the column most recently generated above.
f_printtable1(l_newcol)

#  Sample output,
#
#     +----+-----------------------------+------------------+-----------------+
#     |    | n.restaurant_name           | n.airport_name   | n.newcol_0021   |
#     |----+-----------------------------+------------------+-----------------|
#     |  0 |                             | San Jose         | SAN JOSE        |
#     |  1 |                             | Chicago O-Hare   | CHICAGO O-HARE  |
#     |  2 |                             | Milwaukee        | MILWAUKEE       |
#     |  3 | Pappadeauxs Seafood Kitchen |                  | NONE            |
#     |  4 |                             | Denver           | DENVER          |
#     +----+-----------------------------+------------------+-----------------+


          0/? [?op/s]

+----+-----------------------------+------------------+-----------------+
|    | n.restaurant_name           | n.airport_name   | n.newcol_0024   |
|----+-----------------------------+------------------+-----------------|
|  0 | Pappadeauxs Seafood Kitchen |                  |                 |
|  1 |                             | San Jose         | SAN JOSE        |
|  2 |                             | Chicago O-Hare   | CHICAGO O-HARE  |
|  3 |                             | Milwaukee        | MILWAUKEE       |
|  4 |                             | Denver           | DENVER          |
+----+-----------------------------+------------------+-----------------+


In [33]:

#  View the graph; verify results
#
#  Confirming new column was added to Restaurant
#

def f_printtable2(i_colname):
   """
   Print every property of the Restaurant nodes as a psql-style text table.

   i_colname is accepted so existing callers keep working, but it is not
   used: the query returns the whole node, which already includes any
   newly added property.
   """

   #  Narrower alternative to "RETURN n", kept here instead of being sent to
   #  the server as a commented-out line inside the query text:
   #
   #     RETURN n.restaurant_name, n.<i_colname>
   #
   l_query  = """
      MATCH (n: Restaurant)
      RETURN n
      """

   l_result = my_graph.query(l_query)

   print(tabulate(l_result, headers='keys', tablefmt='psql'))


f_printtable2(l_newcol)

#  Sample output,
#
#     +----+-----------------+----------------+--------+-----------------+-----------------------------+----------+
#     |    |   n.internal_id | n.labels       | n.id   | n.newcol_0021   | n.restaurant_name           | n.type   |
#     |----+-----------------+----------------+--------+-----------------+-----------------------------+----------|
#     |  0 | 562949953421313 | ['Restaurant'] | PAP    | NONE            | Pappadeauxs Seafood Kitchen | node     |
#     +----+-----------------+----------------+--------+-----------------+-----------------------------+----------+


          0/? [?op/s]

+----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+
|    |   n.internal_id | n.labels       | n.id   | n.newcol_0021   | n.newcol_0022   | n.newcol_0023   | n.restaurant_name           | n.type   |
|----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------|
|  0 |               2 | ['Restaurant'] | PAP    | NONE            | NONE            | NONE            | Pappadeauxs Seafood Kitchen | node     |
+----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+


In [35]:

#  Add a new column to each node, one that is derived from columns local to each node; approach 3
#
#     .  Here we see that if the new column value is None, then the property is not actually added
#        to the node. 
#

def my_func(i_graph, i_label, i_properties, i_new_colname):
   """
   Add a node property that is filled in only for selected nodes.

   A node receives the upper-cased text of the first property in i_properties
   when it carries the label i_label AND that upper-cased text sorts before
   "M". Every other node receives None, which demonstrates that a None value
   results in the property not being added to that node.

   Parameters:
      i_graph        graph handle passed in by my_graph.run()
      i_label        label a node must carry to be considered
      i_properties   list of property names; all are fetched, the first one
                     drives the derived value
      i_new_colname  name of the property to create

   Returns nothing.
   """

   from katana.distributed import KeyedColumnNode
   import numpy as np

   l_columns = {l_name: i_graph.nodes.get_property(l_name) for l_name in i_properties}

   #  The first property supplies both the values and the node keys.
   l_primary = l_columns[i_properties[0]]
   l_values  = np.zeros(len(l_primary), dtype="object")

   for l_slot, l_node_key in enumerate(l_primary.keys()):
      l_value = None                                         #  Default: leave the property unset
      if i_label in i_graph.nodes.labels(l_node_key):
         l_candidate = str(l_primary[l_node_key]).upper()
         if l_candidate < "M":                               #  Further testing what we can set to None
            l_value = l_candidate
      l_values[l_slot] = l_value

   #  Pair the new values with the source column, then hand them to add_property().
   i_graph.nodes.add_property(KeyedColumnNode(l_values, l_primary, i_new_colname))

   return

#  Advance the counter first so this run gets a fresh property name,
#  zero-padded to four digits.
#
l_cntr   += 1
l_newcol =  "newcol_" + str(l_cntr).zfill(4)

#  Execute the my_func defined in this cell through my_graph.run(), with
#  "Airport" as the label a node must carry to receive a value.
#
l_result = my_graph.run(lambda g: my_func(g, "Airport", ["airport_name"], l_newcol))

print("--")


          0/? [?op/s]

          0/? [?op/s]

+----+-----------------------------+------------------+-----------------+
|    | n.restaurant_name           | n.airport_name   | n.newcol_0026   |
|----+-----------------------------+------------------+-----------------|
|  0 |                             | San Jose         |                 |
|  1 |                             | Chicago O-Hare   | CHICAGO O-HARE  |
|  2 | Pappadeauxs Seafood Kitchen |                  |                 |
|  3 |                             | Milwaukee        |                 |
|  4 |                             | Denver           | DENVER          |
+----+-----------------------------+------------------+-----------------+


          0/? [?op/s]

+----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+
|    |   n.internal_id | n.labels       | n.id   | n.newcol_0021   | n.newcol_0022   | n.newcol_0023   | n.restaurant_name           | n.type   |
|----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------|
|  0 |               2 | ['Restaurant'] | PAP    | NONE            | NONE            | NONE            | Pappadeauxs Seafood Kitchen | node     |
+----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+
--


In [39]:

#  Checking results above
#

#  Print the latest generated column for all nodes, then the full Restaurant node.
#
f_printtable1(l_newcol)
f_printtable2(l_newcol)

#  Here we list the Airport nodes whose name sorts after "M". These are the
#  nodes for which approach 3 supplied None, so the newest column should be
#  missing from the returned properties.
#
#  The query has no placeholders, so it is used as written (no .format() call).
#
l_query  = """
   MATCH (n: Airport)
   WHERE n.airport_name > "M"
   RETURN n
   """

l_result = my_graph.query(l_query)

print(tabulate(l_result, headers='keys', tablefmt='psql'))






          0/? [?op/s]

+----+-----------------------------+------------------+-----------------+
|    | n.restaurant_name           | n.airport_name   | n.newcol_0026   |
|----+-----------------------------+------------------+-----------------|
|  0 |                             | San Jose         |                 |
|  1 |                             | Chicago O-Hare   | CHICAGO O-HARE  |
|  2 | Pappadeauxs Seafood Kitchen |                  |                 |
|  3 |                             | Milwaukee        |                 |
|  4 |                             | Denver           | DENVER          |
+----+-----------------------------+------------------+-----------------+


          0/? [?op/s]

+----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+
|    |   n.internal_id | n.labels       | n.id   | n.newcol_0021   | n.newcol_0022   | n.newcol_0023   | n.restaurant_name           | n.type   |
|----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------|
|  0 |               2 | ['Restaurant'] | PAP    | NONE            | NONE            | NONE            | Pappadeauxs Seafood Kitchen | node     |
+----+-----------------+----------------+--------+-----------------+-----------------+-----------------+-----------------------------+----------+


          0/? [?op/s]

+----+-----------------+-------------+-----------+------------------+--------+-----------------+-----------------+-----------------+-----------------+----------+
|    |   n.internal_id | n.labels    | n.LABEL   | n.airport_name   | n.id   | n.newcol_0021   | n.newcol_0022   | n.newcol_0023   | n.newcol_0024   | n.type   |
|----+-----------------+-------------+-----------+------------------+--------+-----------------+-----------------+-----------------+-----------------+----------|
|  0 |               1 | ['Airport'] | Airport   | San Jose         | SJC    | SAN JOSE        | SAN JOSE        | SAN JOSE        | SAN JOSE        | node     |
|  1 | 281474976710656 | ['Airport'] | Airport   | Milwaukee        | MKE    | MILWAUKEE       | MILWAUKEE       | MILWAUKEE       | MILWAUKEE       | node     |
+----+-----------------+-------------+-----------+------------------+--------+-----------------+-----------------+-----------------+-----------------+----------+


#  UDFs, Part 04: Data enrichment/derivation ..

In [None]:

#  Same as above, but add a derived column


def my_func_III(i_graph, i_properties):
   """
   Load node properties into a DataFrame, add a derived column, and return it.

   Parameters:
      i_graph       graph handle passed in by my_graph.run()
      i_properties  list of node property names; each becomes one column

   Returns single_host(host=0, result=<DataFrame>), where the DataFrame holds
   the requested columns plus "fname_upper", the upper-cased "fname" value
   (None where "fname" is None).
   """

   from katana_enterprise.distributed import single_host
   import numpy  as np
   import pandas as pd


   ##################################################################

   #  Nested function, load a Python DataFrame from the KG graph

   def l_load_df(i_graph, i_properties):
      #  One (n, 1) column vector per property, stacked side by side.
      l_columns = [
         i_graph.get_node_property(l_name).to_numpy().reshape(-1, 1)
         for l_name in i_properties
      ]

      return pd.DataFrame(np.hstack(l_columns), columns=i_properties)

   ##################################################################

   #  Nested function, add a new, derived column to the DataFrame

   def l_enrich_df(i_dataframe, i_source_property, i_new_property):

      def l_to_upper(i_str):
         #  i_str is one row; .get() gives None when the property is absent.
         l_value = i_str.get(i_source_property)
         return None if l_value is None else str(l_value).upper()

      #  Row-wise apply; the frame is modified in place.
      i_dataframe[i_new_property] = i_dataframe.apply(l_to_upper, axis=1)

      return

   ##################################################################


   l_dataframe = l_load_df(i_graph, i_properties)
   l_enrich_df(l_dataframe, "fname", "fname_upper")

   return single_host(host=0, result=l_dataframe)


#  Run the UDF through my_graph.run(), requesting the "id", "fname" and
#  "lname" properties, then print what comes back.
#
l_result = my_graph.run(lambda g: my_func_III(g, ["id", "fname", "lname"]))

print(l_result)


#  Sample output,
#
#                         id   fname    lname fname_upper
#     0  1111-1111-1111-1111  Justin     Fine      JUSTIN
#     1  2222-2222-2222-2222  Thomas     Cook      THOMAS
#     2  3333-3333-3333-3333  Sameer  Iyengar      SAMEER
#     3  4444-4444-4444-4444   Brian  Spencer       BRIAN
#     4                  101    None     None        None
#     5                  102    None     None        None
#     6                  103    None     None        None
#     7                  104    None     None        None
#     8                  105    None     None        None
#     9                  106    None     None        None


# UDFs, Part 05: Just writing to the graph ..

In [None]:

def my_func_JJJ(i_graph):
   """
   Pass a fixed three-row DataFrame to i_graph.upsert_node_property().

   The frame has the columns "id", "newcol" and "type". Returns nothing.

   The run recorded with this cell fails on every host with
   "expected N rows found 3 instead", so this is kept as a demonstration of
   what upsert_node_property() rejects.
   """

   import pandas as pd

   #  Column-oriented spelling of the same three records.
   l_dataframe = pd.DataFrame({
      "id"     : [ "7777-7777-7777-7777", "8888-8888-8888-8888", "9999-9999-9999-9999" ],
      "newcol" : [ "Bob"                , "Mary"               , "Earl"                ],
      "type"   : [ "Person"             , "Person"             , "Person"              ],
      })

   i_graph.upsert_node_property(l_dataframe)

    
#  Run the UDF through my_graph.run(). The errors recorded below show this
#  call failing on every host with a row-count mismatch.
#
my_graph.run(lambda g: my_func_JJJ(g))


#     Host 0 errors:
#           ...
#     ValueError: expected 10 rows found 3 instead (PropertyGraph.cpp:1282): invalid argument
#     
#     Host 1 errors:
#           ...
#     ValueError: expected 6 rows found 3 instead (PropertyGraph.cpp:1282): invalid argument
#     
#     Host 2 errors:
#           ...
#     ValueError: expected 0 rows found 3 instead (PropertyGraph.cpp:1282): invalid argument



In [None]:

#  Attempt 01 to match what is expected by upsert_node_property()

def my_func_KKK(i_graph):
   """
   Print every value yielded by i_graph.master_nodes(), one per line.

   Exploratory helper. The output recorded with this cell shows each host
   printing indices that start again at 0.
   """

   #  NOTE(review): fetched but never used below.
   l_node_ids = i_graph.get_node_property("id")


   for l_node in i_graph.master_nodes():
      print(l_node)


#  Run the UDF through my_graph.run().
my_graph.run(lambda g: my_func_KKK(g))


#     Host 0 output:
#     0
#     1
#     2
#     3
#     
#     Host 1 output:
#     0
#     1
#     2
#     3
#     4
#     5



In [None]:

#  This technique behaves differently from the one above because it fetches the property named "id", which is present everywhere.
#
#  **  Arthur

def my_func_MMM(i_graph, i_properties):
   """
   Build a DataFrame from node properties, copy "fname" into "new_column",
   print the frame, and try to write it back to the graph.

   Parameters:
      i_graph       graph handle passed in by my_graph.run()
      i_properties  list of node property names; must include "fname"

   Returns nothing.

   Known problems noted by the author: upsert_node_property() returns but no
   change shows up in the graph, and write() hangs.
   """

   import numpy  as np
   import pandas as pd

   #  Display options are set again here so the printed frame is not truncated.
   for l_option in ("display.max_columns", "max_colwidth"):
      pd.set_option(l_option, None)

   #  One (n, 1) column vector per property, stacked side by side.
   l_columns   = [
      i_graph.get_node_property(l_name).to_numpy().reshape(-1, 1)
      for l_name in i_properties
   ]
   l_dataframe = pd.DataFrame(np.hstack(l_columns), columns=i_properties)

   #  todo-   Want better example
   #          Also, one that only affects given rows
   #
   #  Idea not yet working:  None if fname is None else "Mr. " + fname

   l_enriched = l_dataframe.assign(new_column=lambda l_frame: l_frame.fname)

   print(l_enriched)

   #  This statement returns, but I see no changes to the graph.
   #
   i_graph.upsert_node_property(l_enriched)

   #  This statement hangs
   #
   i_graph.write()
    
    
#  Run the UDF through my_graph.run(), requesting the "id" and "fname" properties.
#
my_graph.run(lambda g: my_func_MMM(g, ["id", "fname"]))
    
    
    
    

In [None]:

#  View the graph; verify results
#
#  Return every node and open the result with view(). The query is run with
#  contextualize=True (presumably so the result can be shown as a graph --
#  TODO confirm).

l_result = my_graph.query("""

   MATCH (n)
   RETURN n
   
   """, contextualize=True)

l_result.view()


In [None]:

def my_func(i_graph, i_properties):
   """
   Draw four random integers from 1 to 100, print them, and return them.

   Neither i_graph nor i_properties is used. The sample output recorded with
   this cell shows every host printing its own numbers, while only the
   array wrapped by single_host(host=0, ...) is displayed to the caller.
   """

   import numpy as np
   from katana_enterprise.distributed import single_host

   #  The upper bound of randint() is exclusive, so 101 allows values up to 100.
   l_samples = np.random.randint(low=1, high=101, size=4)
   print(l_samples)

   return single_host(host=0, result=l_samples)



#  Run the UDF through my_graph.run(). The property names passed here are
#  placeholders; my_func ignores them.
#
my_return = my_graph.run(lambda g: my_func(g, ["xxx", "yyy"]))

#  Separator, then the value handed back to this kernel.
#
display("--")
display(my_return)


#  Sample output,
#
#     Host 0 output:
#     [ 5 14 50 87]
#     
#     Host 1 output:
#     [15 50 81 79]
#     
#     Host 2 output:
#     [48 36 97 66]
#     
#     '--'
#     
#     array([ 5, 14, 50, 87])


In [None]:

def my_func_JJJ(i_graph, i_properties):
   """
   Load node properties into a DataFrame, add a derived column, call the
   (currently disabled) save step, and return the DataFrame.

   Parameters:
      i_graph       graph handle passed in by my_graph.run()
      i_properties  list of node property names; each becomes one column

   Returns single_host(host=0, result=<DataFrame>), where the DataFrame holds
   the requested columns plus "fname_upper", the upper-cased "fname" value
   (None where "fname" is None).
   """

   from katana_enterprise.distributed import single_host
      #
   import numpy  as np
   import pandas as pd


   ##################################################################

   #  Nested function, load a Python DataFrame from the KG graph

   def l_load_df(i_graph, i_properties):
      l_nodes = []

      for l_property in i_properties:
         #  Each property becomes an (n, 1) column vector.
         l_node = i_graph.get_node_property(l_property).to_numpy().reshape(-1, 1)
         l_nodes.append(l_node)

      l_array     = np.hstack(l_nodes)
      l_dataframe = pd.DataFrame(l_array, columns=i_properties)

      return l_dataframe

   ##################################################################

   #  Nested function, add a new, derived column to the DataFrame

   def l_enrich_df(i_dataframe, i_source_property, i_new_property):

      def l_to_upper(i_str):
         #  i_str is one row of the DataFrame.
         if i_str.get(i_source_property) is not None:
            return str(i_str[i_source_property]).upper()
         else:
            return

      i_dataframe[i_new_property] = i_dataframe.apply(l_to_upper, axis=1)

      return

   ##################################################################

   #  Nested function, save the DataFrame back to the graph (disabled)

   def l_save_df(i_graph, i_dataframe):

      #  The write-back call is left commented out, as in the original cell.
      #  The explicit return gives the function a body; with only a comment
      #  here, Python rejects the whole cell with an IndentationError.
      #
      # i_graph.upsert_node_property(i_dataframe)

      return

   ##################################################################

   l_dataframe = l_load_df(i_graph, i_properties)
      #
   l_enrich_df(l_dataframe, "fname", "fname_upper")
      #
   l_save_df(i_graph, l_dataframe)


   return single_host(host=0, result=l_dataframe)


#  Run the UDF through my_graph.run(), requesting the "id", "fname" and
#  "lname" properties, then print what comes back.
#
l_result = my_graph.run(lambda g: my_func_JJJ(g, ["id", "fname", "lname"]))

print(l_result)

