In [None]:

#  References for this Notebook,
#
#     Lines 135-139
#     https://github.com/KatanaGraph/solutions/blob/main/fsi/src/katana_ai.py
#     
#     Lines 31-37
#     https://github.com/KatanaGraph/test-datasets/blob/450232fe5739f327ed0795030a4b9d05731586ee/rdg_datasets/two_self_loops_ai/add_features.py



In [None]:

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate


In [None]:

#  When using UDFs, these execute on another host.
#
#  As such, these methods will need to be copied and run locally also.
#
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)


#  Setup stuff: Connectivity

In [None]:

from katana import remote
from katana.remote import import_data

my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:

#  CONNECT TO GRAPH

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


In [None]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


# Working with num_partitions()

In [None]:
#  Even though this graph was made with 3 partitions, and even though certain operations
#  require at least 3 partitions else they fail, you may still see num_partitions = None
#  which can happen on really small graphs, and produce unexpected results below.

print(my_graph.num_partitions)


In [None]:

#  Reuse the NUM_PARTITIONS constant from the configuration cell rather than
#  repeating the literal, so the partition count is defined in one place.
my_graph.repartition(num_partitions = NUM_PARTITIONS)

print("--")


# UDFs, Part 01: Hello World, Nodes

In [None]:

#  This function will run on every worker node that serves a partition of the current graph.
#
#  If you rerun this function, you will see that the file access time is updated; same PID.

def my_func():
   """Touch a marker file named after the current process and print its path.

   The file is ``/tmp/01.<pid>.txt`` where ``<pid>`` comes from ``os.getpid()``.
   Touching an existing file only refreshes its timestamps, so re-running
   in the same process reuses the same file.

   Returns:
      None
   """
   import os
   from pathlib import Path

   #  Build the path from the same three pieces: prefix, PID, suffix
   l_marker = "".join(["/tmp/01.", str(os.getpid()), ".txt"])

   print(l_marker)
   Path(l_marker).touch()


#  run_result() will not display stdout or errors-
#
#     **  run_result() went away
#
l_result = my_graph.run(lambda g: my_func())


#  Sample output, num_partitions actually = 3
#
#     Host 0 output:  /tmp/01.22481.txt
#     Host 1 output:  /tmp/01.22958.txt
#     Host 2 output:  /tmp/01.22458.txt
#
#  Sample output, num_partitions = None
#
#     Host 0 output:
#     /tmp/01.3158.txt


#  See also,
#
#     return distributed.single_host(host=0, result=False)


In [None]:
# ##################################################################

In [None]:

#  Working with a property  (Airports and Restaurants)
#
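#  No filtering here: each host prints every value it holds for the property,
#  so the same id can show up on more than one host (see sample output below).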

def my_func(i_graph, i_property):
   """Print every value of one node property, one value per line.

   Args:
      i_graph:    graph object handed to the UDF; must expose
                  ``nodes.get_property(name)`` returning an iterable.
      i_property: name of the node property to print.

   Returns:
      None
   """
   l_values = i_graph.nodes.get_property(i_property)

   for l_value in l_values:
      print(l_value)

l_result = my_graph.run(lambda g: my_func(g, "id"))

#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     DEN
#     ORD
#     SJC
#     
#     Host 1 output:
#     MKE
#     ORD
#     PAP
#     
#     Host 2 output:
#     ORD
#     SJC
#     DEN
#     MKE
#     PAP


In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Masters only

def my_func(i_graph, i_property):
   """Print one node property, restricted to the nodes from ``nodes.masters()``.

   Per the notebook's sample output, each id is then printed by exactly one
   host. The notebook also records that ``i_graph.nodes.mirrors()`` was tried
   here and failed with "Object not found".

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the node property to print.

   Returns:
      None
   """
   l_values  = i_graph.nodes.get_property(i_property)
   l_masters = i_graph.nodes.masters()

   for l_master in l_masters:
      print(l_values[l_master])

l_result = my_graph.run(lambda g: my_func(g, "id"))


#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     DEN
#     
#     Host 1 output:
#     MKE
#     PAP
#     
#     Host 2 output:
#     ORD
#     SJC


In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Labels only

def my_func(i_graph, i_property, i_label):
   """Print one node property for the nodes selected by a label.

   Nodes are selected with ``i_graph.nodes(i_label)``; no master filtering is
   applied, so (per the notebook's sample output) the same id may be printed
   on several hosts.

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the node property to print.
      i_label:    node label used to select nodes.

   Returns:
      None
   """
   l_values = i_graph.nodes.get_property(i_property)
   l_labeled = i_graph.nodes(i_label)

   for l_labeled_node in l_labeled:
      print(l_values[l_labeled_node])

l_result = my_graph.run(lambda g: my_func(g, "id", "Airport"))

#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     SJC
#     ORD
#     DEN
#     
#     Host 1 output:
#     ORD
#     MKE
#     
#     Host 2 output:
#     MKE
#     DEN
#     ORD
#     SJC


In [None]:
# ##################################################################

In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Filtering both; Masters and Labels

def my_func(i_graph, i_property, i_label):
   """Attempt to print a node property for nodes that match a label AND are masters.

   This cell is kept as a demonstration of a call chain that does not work:
   the notebook records that ``i_graph.nodes(i_label).masters()`` raises an
   AttributeError because the object returned by ``nodes(i_label)`` has no
   ``masters`` attribute.

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the node property to print.
      i_label:    node label used to select nodes.
   """
    
   l_node_prop = i_graph.nodes.get_property(i_property)
      #
   #  Chaining .masters() onto the label-filtered sequence is the failing step
   for l_node in i_graph.nodes(i_label).masters():
      print(l_node_prop[l_node])
             
   return

l_result = my_graph.run(lambda g: my_func(g, "id", "Airport"))

#  Sample output, num_partitions = 3
#
#  Does not work
#    AttributeError: 'katana_enterprise.distributed_native.EntitySequenc' object has no attribute 'masters'



In [None]:

#  Working with a property  (Airports and Restaurants)
#
#  Adding, Masters and Labels; alternate form to the above

def my_func(i_graph, i_property, i_label):
   """Print a node property for master nodes that carry a given label.

   Iterates ``i_graph.nodes.masters()`` and tests each node's labels with
   ``i_graph.nodes.labels(node)``, instead of chaining ``.masters()`` onto
   ``nodes(i_label)``.

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the node property to print.
      i_label:    label a node must carry to be printed.

   Returns:
      None
   """
   l_values = i_graph.nodes.get_property(i_property)

   for l_master in i_graph.nodes.masters():
      #  Skip masters that do not carry the requested label
      if i_label not in i_graph.nodes.labels(l_master):
         continue
      print(l_values[l_master])

l_result = my_graph.run(lambda g: my_func(g, "id", "Airport"))

#  Sample output, num_partitions = 3
#
#     Host 0 output:
#     DEN
#     
#     Host 1 output:
#     MKE
#     
#     Host 2 output:
#     ORD
#     SJC


In [None]:
# ##################################################################

In [None]:

#  Working with multiple properties; approach 1

def my_func(i_graph, i_label, i_properties):
   """Print two node properties per labelled master node (one variable per property).

   Args:
      i_graph:      graph object handed to the UDF.
      i_label:      label a master node must carry to be printed.
      i_properties: sequence whose first two entries name the properties to
                    print, in that order.

   Returns:
      None
   """
   l_first  = i_graph.nodes.get_property(i_properties[0])
   l_second = i_graph.nodes.get_property(i_properties[1])

   for l_master in i_graph.nodes.masters():
      if i_label not in i_graph.nodes.labels(l_master):
         continue
      l_fields = (l_first[l_master], l_second[l_master])
      print("Given Airport : %3s, %-32s" % l_fields)

l_result = my_graph.run(lambda g: my_func(g, "Airport", ["id", "airport_name"]))

#  Sample output
#
#     Host 0 output:
#     Given Airport : DEN, Denver                          
#     
#     Host 1 output:
#     Given Airport : MKE, Milwaukee                       
#     
#     Host 2 output:
#     Given Airport : ORD, Chicago O-Hare                  
#     Given Airport : SJC, San Jose                        


In [None]:

#  Working with multiple properties; approach 2

def my_func(i_graph, i_label, i_properties):
   """Print two node properties per labelled master node (properties kept in a dict).

   Every name in ``i_properties`` is fetched once and stored under its own
   name; only the first two entries are printed.

   Args:
      i_graph:      graph object handed to the UDF.
      i_label:      label a master node must carry to be printed.
      i_properties: sequence of property names; entries 0 and 1 are printed.

   Returns:
      None
   """
   l_by_name = {}
   for l_name in i_properties:
      l_by_name[l_name] = i_graph.nodes.get_property(l_name)

   for l_master in i_graph.nodes.masters():
      if i_label not in i_graph.nodes.labels(l_master):
         continue
      l_fields = (l_by_name[i_properties[0]][l_master], l_by_name[i_properties[1]][l_master])
      print("Given Airport : %3s, %-32s" % l_fields)

l_result = my_graph.run(lambda g: my_func(g, "Airport", ["id", "airport_name"]))

#  Sample output
#
#     Host 0 output:
#     Given Airport : DEN, Denver                          
#     
#     Host 1 output:
#     Given Airport : MKE, Milwaukee                       
#     
#     Host 2 output:
#     Given Airport : ORD, Chicago O-Hare                  
#     Given Airport : SJC, San Jose                        


# Using help() on server side objects

In [None]:

def my_func(i_graph):
   """Print the built-in ``help()`` text for ``i_graph.nodes.labels``.

   Args:
      i_graph: graph object handed to the UDF.

   Returns:
      None
   """
   l_target = i_graph.nodes.labels
   help(l_target)

l_result = my_graph.run(lambda g: my_func(g))


# Background: Nested Python functions ..

In [None]:

#  Our code will be getting larger; demonstrate using nested functions with Python

def my_func_HHH(i_graph, i_properties):
   """Show that a UDF may define and call a nested helper function.

   Prints "GGG" from the outer function, then "HHH" from the nested helper.
   Neither argument is used by the body.

   Args:
      i_graph:      graph object handed to the UDF (unused).
      i_properties: list of property names (unused).

   Returns:
      None
   """
   def l_print_inner():
      #  Nested helper; only visible inside my_func_HHH
      print("HHH")

   print("GGG")
   l_print_inner()


l_result = my_graph.run(lambda g: my_func_HHH(g, ["id", "fname", "lname"]))
   #
print(l_result)


#  Sample output,
#
#     Host 0 output:
#     GGG
#     HHH
#     
#     Host 1 output:
#     GGG
#     HHH
#     
#     Host 2 output:
#     GGG
#     HHH
#     None


# Using projections

In [None]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


In [None]:

# my_graph2 = my_graph.project(node_types=["Airport"], edge_types=[None])
my_graph2 = my_graph.project(node_types=["Airport"], edge_types=["FLIES_TO"])

print("--")


In [None]:

display(my_graph2.num_nodes())
display(my_graph2.num_edges())

#  OperationError: 9AgHAAGCdiiUykiKth3cm2rJW4ruWxpLTaf5ofuQkVQx-2dhBp6fa5UWarBkC7 
#  Projected views are not supported for query operation yet. 
#  (QueryOperation.cpp:30): not implemented: not implemented


In [None]:
#  View the graph; verify results

l_result = my_graph2.query("""

   MATCH (n)
   RETURN n
   
   """, contextualize=True)

l_result.view()

#  OperationError: Projected views are not supported for query operation yet. 
#  (QueryOperation.cpp:28): not implemented: not implemented


In [None]:

#  Working with a property, after a projection

def my_func(i_graph, i_property):
   """Print every value of one node property of the graph the UDF receives.

   Used in the notebook to check whether a projection changes what the UDF
   sees; the recorded sample output shows non-Airport ids as well
   ("Projection not used").

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the node property to print.

   Returns:
      None
   """
   l_property_values = i_graph.nodes.get_property(i_property)

   for l_property_value in l_property_values:
      print(l_property_value)

l_result = my_graph2.run(lambda g: my_func(g, "id"))

#  Projection not used
#
#     Host 0 output:
#     DEN
#     ORD
#     SJC
#     
#     Host 1 output:
#     MKE
#     ORD
#     PAP
#     
#     Host 2 output:
#     ORD
#     SJC
#     DEN
#     MKE
#     PAP


In [None]:

#  Put the projection inside the UDF


def my_func(i_graph, i_property):
   """Project the graph inside the UDF, then print one node property.

   The projection is requested for node type "Airport" and edge type
   "FLIES_TO". The notebook records that the output is the same as without
   the projection.

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the node property to print.

   Returns:
      None
   """
   l_projected = i_graph.project(node_types=["Airport"], edge_types=["FLIES_TO"])
   l_values    = l_projected.nodes.get_property(i_property)

   for l_value in l_values:
      print(l_value)
    
    
l_result = my_graph.run(lambda g: my_func(g, "id"))

#  Projection not used
#
#  (Same output as above)


# UDFs, Part 02: Hello World, Edges

In [None]:

#  Working with edge data; approach 1
#
#     .  Just get something working
#

def my_func(i_graph, i_label):
   """Print one line per out-edge of every node carrying ``i_label``.

   Each line shows the node's "id", the edge's source and destination (both
   as "id" value and as the raw node handle) and the edge's "DISTANCE".
   Only the starting node is filtered by label; destinations are not, so
   (per the notebook's notes) edges to non-Airport nodes are listed too.

   Args:
      i_graph: graph object handed to the UDF.
      i_label: label used to select the starting nodes.

   Returns:
      None
   """
   l_ids       = i_graph.nodes.get_property("id")
   l_distances = i_graph.out_edges.get_property("DISTANCE")

   l_format = "Given Airport: %-3s   Edge Src: %-3s / %-18s   Edge Dst: %-3s / %-18s   Distance: %-8s"

   for l_start in i_graph.nodes(i_label):
      for l_out_edge in i_graph.out_edges(l_start):
         l_start_id = l_ids[l_start]
         l_src      = i_graph.out_edges.src(l_out_edge)
         l_dst      = i_graph.out_edges.dst(l_out_edge)
         l_distance = l_distances[l_out_edge]

         l_fields = (l_start_id, l_ids[l_src], l_src, l_ids[l_dst], l_dst, l_distance)
         print(l_format % l_fields)

l_result = my_graph.run(lambda g: my_func(g, "Airport"))

#  Sample output,
#
#     Host 0 output:
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: SJC / <lnid 4294967293>    Distance: 948     
#     Given Airport: DEN   Edge Src: DEN / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 886     
#     
#     Host 1 output:
#     Given Airport: MKE   Edge Src: MKE / <lnid 0>             Edge Dst: ORD / <lnid 4294967294>    Distance: 66      
#     
#     Host 2 output:
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: MKE / <lnid 4294967293>    Distance: 66      
#     Given Airport: ORD   Edge Src: ORD / <lnid 0>             Edge Dst: DEN / <lnid 4294967294>    Distance: 886     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: ORD / <lnid 0>             Distance: 1829    
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: DEN / <lnid 4294967294>    Distance: 948     
#     Given Airport: SJC   Edge Src: SJC / <lnid 1>             Edge Dst: PAP / <lnid 4294967292>    Distance: None    

#  Above,
#
#     .  Minimally, we show the out edge from SJC Airport to a non-Airport node.
#        This is because we did filter on the Airport label, but looked at all out edges.
#
#     .  Other optimizations


In [None]:

def my_func(i_graph, i_label):
   """Print one line per out-edge of every master node carrying ``i_label``.

   Same report as the label-only edge listing, but starting nodes are taken
   from ``i_graph.nodes.masters()`` and then filtered by label, following
   the masters + labels pattern used earlier in this notebook.

   Args:
      i_graph: graph object handed to the UDF.
      i_label: label a master node must carry for its out-edges to be printed.

   Returns:
      None
   """
   #  These lookups were missing, so the loop body raised NameError
   l_node_props = i_graph.nodes.    get_property("id"      )
   l_edge_props = i_graph.out_edges.get_property("DISTANCE")
       #
   #  masters() is reached through i_graph.nodes, as in the earlier cells
   for l_node in i_graph.nodes.masters():
      #  Apply the label filter that the i_label argument asks for
      if i_label not in i_graph.nodes.labels(l_node):
         continue

      for l_edge in i_graph.out_edges(l_node):
         l_node_id       = l_node_props[l_node]
            #
         l_edge_src      = i_graph.out_edges.src(l_edge)
         l_edge_dst      = i_graph.out_edges.dst(l_edge)
            #
         l_edge_distance = l_edge_props[l_edge]
            #
         print("Given Airport: %-3s   Edge Src: %-3s / %-18s   Edge Dst: %-3s / %-18s   Distance: %-8s" % (
            l_node_id, l_node_props[l_edge_src], l_edge_src, l_node_props[l_edge_dst], l_edge_dst, l_edge_distance))

   return

l_result = my_graph.run(lambda g: my_func(g, "Airport"))




In [None]:

#  Working with edge data; approach 2
#

def my_func(i_graph, i_label):
   """List the out-edges of every node that carries ``i_label``.

   For each edge one formatted line is printed: the starting node's "id",
   the edge source and destination (as "id" value and raw handle), and the
   edge's "DISTANCE" property.

   Args:
      i_graph: graph object handed to the UDF.
      i_label: label used to select the starting nodes.

   Returns:
      None
   """
   l_id_of       = i_graph.nodes.get_property("id")
   l_distance_of = i_graph.out_edges.get_property("DISTANCE")

   def l_describe(i_node, i_edge):
      #  Gather the six fields in the order the format string expects
      l_given = l_id_of[i_node]
      l_from  = i_graph.out_edges.src(i_edge)
      l_to    = i_graph.out_edges.dst(i_edge)
      l_miles = l_distance_of[i_edge]
      return "Given Airport: %-3s   Edge Src: %-3s / %-18s   Edge Dst: %-3s / %-18s   Distance: %-8s" % (
         l_given, l_id_of[l_from], l_from, l_id_of[l_to], l_to, l_miles)

   for l_node in i_graph.nodes(i_label):
      for l_edge in i_graph.out_edges(l_node):
         print(l_describe(l_node, l_edge))

l_result = my_graph.run(lambda g: my_func(g, "Airport"))










In [None]:





def my_func(i_graph, i_label, i_properties=("id", "airport_name")):
   """Print two node properties for every master node carrying ``i_label``.

   The body always read ``i_properties``, but it was not a parameter, so the
   three-argument call in this cell could not work. It is now an explicit
   argument; the default keeps two-argument calls valid and mirrors the
   property names passed by the call in this cell.

   Args:
      i_graph:      graph object handed to the UDF.
      i_label:      label a master node must carry to be printed.
      i_properties: sequence of property names; entries 0 and 1 are printed.

   Returns:
      None
   """
   l_properties = {each: i_graph.nodes.get_property(each) for each in i_properties}
      #
   for l_node in i_graph.nodes.masters():
      if (i_label in i_graph.nodes.labels(l_node) ):
         print("Given Airport : %3s, %-32s" % (l_properties[i_properties[0]][l_node], l_properties[i_properties[1]][l_node]))

   return

l_result = my_graph.run(lambda g: my_func(g, "Airport", ["id", "airport_name"]))




In [None]:








def my_func(i_graph, i_property):
   """Print one edge property for every out-edge of the "Airport" nodes.

   The edge property is now selected by ``i_property`` (previously the
   argument was ignored and "DISTANCE" was hard-coded). The unused fetch of
   the "airport_name" node property was removed.

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the edge property to print.

   Returns:
      None
   """
   l_edge_props = i_graph.out_edges.get_property(i_property)

   for l_node in i_graph.nodes("Airport"):
      for l_edge in i_graph.out_edges(l_node):
         print(l_edge_props[l_edge])

   return

l_result = my_graph.run(lambda g: my_func(g, "DISTANCE"))

#  No output from above





#  UDFs, Part 03: Multiple properties ..

In [None]:

#  Write results to a file
#  Also, return results to calling function


def my_func_EEE(i_graph, i_property):
   """Print, write to a file, and return one node property for the master nodes.

   For every node from ``i_graph.master_nodes()`` the property value is
   converted with ``str()`` (so a missing value becomes the text "None"),
   printed as "PPP: <value>", written to ``/tmp/EEE.txt`` (opened in "w"
   mode, so earlier content is replaced) and collected in a list.

   Args:
      i_graph:    graph object handed to the UDF.
      i_property: name of the node property to read.

   Returns:
      The value of ``single_host(host=0, result=<list of strings>)``.
   """
   from katana_enterprise.distributed import single_host

   l_collected = []
   l_values    = i_graph.get_node_property(i_property)

   with open("/tmp/EEE.txt", "w") as l_out:
      for l_master in i_graph.master_nodes():
         l_text = str(l_values[l_master])
         l_line = "PPP: " + l_text

         print(l_line)
         l_collected.append(l_text)
         l_out.write(l_line + "\n")

   #  Returning the plain list from every host fails, per the notebook:
   #     ValueError: Hosts returned more than one value. Please have only one
   #     host return a value while other's return a None.
   #  single_host() is used instead; host 0 is the one passed here.
   return single_host(host=0, result=l_collected)



l_result = my_graph.run(lambda g: my_func_EEE(g, "fname"))
   #
print(l_result)


#  This file is written on all worker nodes. The contents of the file will differ
#  based on what is local to said worker.
#
#  Sample output,
#
#     (base) root@katana-compute-0:/tmp# cat EE*
#     PPP: None
#     PPP: None
#     PPP: None
#     PPP: None
#     PPP: None
#     PPP: None
#     
#     (base) root@katana-compute-1:/tmp# cat EE*
#     
#     (base) root@katana-compute-2:/tmp# cat EE*
#     PPP: Justin
#     PPP: Thomas
#     PPP: Sameer
#     PPP: Brian


In [None]:

#  Working with multiple property keys, better technique
#
#     .  get_node_property()                -->  katana.local_native.NodeKeyedColumn
#     .  to_numpy()                         -->  numpy.ndarray

def my_func_GGG(i_graph, i_properties):
   """Collect several node properties into one pandas DataFrame.

   Each property is fetched with ``get_node_property()``, converted with
   ``to_numpy()`` and reshaped to a single column (``reshape(-1, 1)``); the
   columns are then stacked side by side with ``np.hstack``. The DataFrame
   is built without column names, so columns are labelled 0, 1, 2, ...

   Args:
      i_graph:      graph object handed to the UDF.
      i_properties: list of node property names, one per output column.

   Returns:
      The value of ``single_host(host=0, result=<DataFrame>)``.
   """
   from katana_enterprise.distributed import single_host
   import numpy  as np
   import pandas as pd

   #  One (n, 1) column array per requested property
   l_columns = [
      i_graph.get_node_property(l_name).to_numpy().reshape(-1, 1)
      for l_name in i_properties
      ]

   l_table = pd.DataFrame(np.hstack(l_columns))

   return single_host(host=0, result=l_table)


l_result = my_graph.run(lambda g: my_func_GGG(g, ["id", "fname", "lname"]))
   #
print(l_result)


#  l_nodes looks like,
#
#     Host 0 output:
#     [array([['1111-1111-1111-1111'],
#            ['2222-2222-2222-2222'],
#            ['3333-3333-3333-3333'],
#            ['4444-4444-4444-4444'],
#            ['101'],
#            ['102'],
#            ['103'],
#            ['104'],
#            ['105'],
#            ['106']], dtype=object), array([['Justin'],
#            ['Thomas'],
#            ['Sameer'],
#            ['Brian'],
#            [None],
#            [None],
#            [None],
#            [None],
#            [None],
#            [None]], dtype=object), array([['Fine'],
#            ['Cook'],
#            ['Iyengar'],
#            ['Spencer'],
#            [None],
#            [None],
#            [None],
#            [None],
#            [None],
#            [None]], dtype=object)]
#     
#     Host 1 output:
#     [array([['101'],
#            ['102'],
#            ['103'],
#            ['104'],
#            ['105'],
#            ['106']], dtype=object), array([[None],
#            [None],
#            [None],
#            [None],
#            [None],
#            [None]], dtype=object), array([[None],
#            [None],
#            [None],
#            [None],
#            [None],
#            [None]], dtype=object)]
#     
#     Host 2 output:
#     [array([], shape=(0, 1), dtype=object), array([], shape=(0, 1), dtype=object), array([], shape=(0, 1), dtype=object)]
#
#
#  l_array looks like,
#
#     Host 0 output:
#     [['1111-1111-1111-1111' 'Justin' 'Fine']
#      ['2222-2222-2222-2222' 'Thomas' 'Cook']
#      ['3333-3333-3333-3333' 'Sameer' 'Iyengar']
#      ['4444-4444-4444-4444' 'Brian' 'Spencer']
#      ['101' None None]
#      ['102' None None]
#      ['103' None None]
#      ['104' None None]
#      ['105' None None]
#      ['106' None None]]
#     
#     Host 1 output:
#     [['101' None None]
#      ['102' None None]
#      ['103' None None]
#      ['104' None None]
#      ['105' None None]
#      ['106' None None]]
#     
#     Host 2 output:
#     []
    
    

#  UDFs, Part 04: Data enrichment/derivation ..

In [None]:

#  Same as above, but add a derived column


def my_func_III(i_graph, i_properties):
   """Load node properties into a DataFrame and add an upper-cased "fname" column.

   Args:
      i_graph:      graph object handed to the UDF.
      i_properties: list of node property names; they become the DataFrame's
                    column names. The enrichment step reads "fname".

   Returns:
      The value of ``single_host(host=0, result=<DataFrame>)``; the
      DataFrame has the requested columns plus "fname_upper".
   """
   from katana_enterprise.distributed import single_host
   import numpy  as np
   import pandas as pd

   #  Nested helper: graph properties -> DataFrame, one column per property
   def l_frame_from_graph(i_graph, i_properties):
      l_columns = [
         i_graph.get_node_property(l_name).to_numpy().reshape(-1, 1)
         for l_name in i_properties
         ]
      return pd.DataFrame(np.hstack(l_columns), columns=i_properties)

   #  Nested helper: add i_new_property = upper-case of i_source_property, in place
   def l_add_upper_column(i_dataframe, i_source_property, i_new_property):

      def l_upper_or_none(i_row):
         l_value = i_row.get(i_source_property)
         if l_value is None:
            #  Missing source value stays missing
            return None
         return str(l_value).upper()

      i_dataframe[i_new_property] = i_dataframe.apply(l_upper_or_none, axis=1)

   l_table = l_frame_from_graph(i_graph, i_properties)
   l_add_upper_column(l_table, "fname", "fname_upper")

   return single_host(host=0, result=l_table)


l_result = my_graph.run(lambda g: my_func_III(g, ["id", "fname", "lname"]))
   #
print(l_result)


#  Sample output,
#
#                         id   fname    lname fname_upper
#     0  1111-1111-1111-1111  Justin     Fine      JUSTIN
#     1  2222-2222-2222-2222  Thomas     Cook      THOMAS
#     2  3333-3333-3333-3333  Sameer  Iyengar      SAMEER
#     3  4444-4444-4444-4444   Brian  Spencer       BRIAN
#     4                  101    None     None        None
#     5                  102    None     None        None
#     6                  103    None     None        None
#     7                  104    None     None        None
#     8                  105    None     None        None
#     9                  106    None     None        None


# UDFs, Part 05: Just writing to the graph ..

In [None]:

def my_func_JJJ(i_graph):
   """Try to upsert a fixed three-row DataFrame as node properties.

   The DataFrame has columns "id", "newcol" and "type". Per the errors
   recorded in the notebook, the call fails on every host with
   "expected N rows found 3 instead", where N differs per host.

   Args:
      i_graph: graph object handed to the UDF.

   Returns:
      None
   """
   import pandas as pd

   l_people = pd.DataFrame({
      "id"     : [ "7777-7777-7777-7777", "8888-8888-8888-8888", "9999-9999-9999-9999" ],
      "newcol" : [ "Bob"                , "Mary"               , "Earl"                ],
      "type"   : [ "Person"             , "Person"             , "Person"              ],
      })

   i_graph.upsert_node_property(l_people)

    
my_graph.run(lambda g: my_func_JJJ(g))


#     Host 0 errors:
#           ...
#     ValueError: expected 10 rows found 3 instead (PropertyGraph.cpp:1282): invalid argument
#     
#     Host 1 errors:
#           ...
#     ValueError: expected 6 rows found 3 instead (PropertyGraph.cpp:1282): invalid argument
#     
#     Host 2 errors:
#           ...
#     ValueError: expected 0 rows found 3 instead (PropertyGraph.cpp:1282): invalid argument



In [None]:

#  Attempt 01 to match what is expected by upsert_node_property()

def my_func_KKK(i_graph):
   """Print every node handle returned by ``i_graph.master_nodes()``.

   Used to see how many rows each host holds. The earlier, unused fetch of
   the "id" property was dropped because its result was never read.

   Args:
      i_graph: graph object handed to the UDF.

   Returns:
      None
   """
   for l_node in i_graph.master_nodes():
      print(l_node)

    
my_graph.run(lambda g: my_func_KKK(g))


#     Host 0 output:
#     0
#     1
#     2
#     3
#     
#     Host 1 output:
#     0
#     1
#     2
#     3
#     4
#     5



In [None]:

#  Attempt 02 to match what is expected by upsert_node_property()

def my_func_LLL(i_graph):
   """Print the master node handles, then the mirror node handles.

   Masters come from ``i_graph.master_nodes()`` and mirrors from
   ``i_graph.mirror_nodes()``. The earlier, unused fetch of the "id"
   property was dropped because its result was never read.

   Args:
      i_graph: graph object handed to the UDF.

   Returns:
      None
   """
   for l_node in i_graph.master_nodes():
      print(l_node)

   for l_node in i_graph.mirror_nodes():
      print(l_node)
    
    
my_graph.run(lambda g: my_func_LLL(g))


#     Host 0 output:
#     0
#     1
#     2
#     3
#     4
#     5
#     6
#     7
#     8
#     9
#     
#     Host 1 output:
#     0
#     1
#     2
#     3
#     4
#     5



In [None]:

#  This technique works differently than that above because we fetch the property titled, "id", which is present everywhere.
#
#  **  Arthur

def my_func_MMM(i_graph, i_properties):
   """Build a DataFrame from node properties, add "new_column", upsert and write.

   "new_column" is a copy of the "fname" column, so "fname" must be one of
   ``i_properties``. The DataFrame is printed, passed to
   ``i_graph.upsert_node_property()`` and then ``i_graph.write()`` is called.

   Observations recorded in the notebook: the upsert returns but no change
   was seen in the graph, and the write() call hangs.

   Args:
      i_graph:      graph object handed to the UDF.
      i_properties: list of node property names; they become column names.

   Returns:
      None
   """
   import numpy  as np
   import pandas as pd

   #  Show full-width frames in the printed output
   pd.set_option("display.max_columns", None)
   pd.set_option("max_colwidth", None)

   l_columns = [
      i_graph.get_node_property(l_name).to_numpy().reshape(-1, 1)
      for l_name in i_properties
      ]
   l_table = pd.DataFrame(np.hstack(l_columns), columns=i_properties)

   #  TODO: want a better example, ideally one that only affects given rows
   l_enriched = l_table.assign(new_column=lambda l_frame: l_frame.fname)

   print(l_enriched)

   i_graph.upsert_node_property(l_enriched)
   i_graph.write()
    
    
my_graph.run(lambda g: my_func_MMM(g, ["id", "fname"]))
    
    
    
    

In [None]:

#  View the graph; verify results

l_result = my_graph.query("""

   MATCH (n)
   RETURN n
   
   """, contextualize=True)

l_result.view()



#  Got this error once

OperationError: backtrace (QueryClient.cpp:517): backtrace (QueryClient.cpp:447): backtrace (GraphLifetime.cpp:251): loading graph for querying (GraphLifetime.cpp:284): backtrace (RDGLoader.h:43): backtrace (RDGLoader.cpp:446): backtrace (RDGLoader.cpp:510): loading: gs://katana-internal46/graph/CdqjEWNX4MSEbZqXTvwijPDz92AeTe7n4AxYYEQZ4x8m/katana_vers00000000000000000002_rdg-retained-retained-part3.manifest (RDGLoader.cpp:358): loading found view for M:N partitioning (RDGLoader.cpp:143): making AggregatedGraphView (BlockedGraphView.cpp:1267): backtrace (AggregatedGraphView.cpp:626): backtrace (AggregatedGraphView.cpp:547): node property schemas from all views must match (AggregatedGraphView.cpp:517): assertion failed: assertion failed

In [None]:


from katana_enterprise import distributed
from katana.distributed import Graph

import pyarrow
import numpy as np

distributed.initialize()


#  Build a small feature matrix (one row per node) and wrap it, without
#  copying, as a pyarrow fixed-size-binary array: one binary value per row.

features_numpy = np.array([[1, 1], [2, 2]], np.float32)
features_numpy = np.ascontiguousarray(features_numpy)
   #
print(features_numpy)

#   # dist prop graph
#   print("Loading RDG", flush=True)
#   original_graph = Graph("two_self_loops_ai")
#   
#   print("rdg loaded",flush=True)
#   

#  The array length handed to from_buffers() must match the number of rows
#  that the buffer really holds. A fixed 10 with only 2 rows in the buffer
#  describes memory past the end of the buffer. The reference code takes the
#  count from the graph and asserts that it equals features_numpy.shape[0]:
#
# number_of_nodes = original_graph.num_nodes()
#   assert number_of_nodes == features_numpy.shape[0]
#
number_of_nodes = features_numpy.shape[0]


#   # the total number of bytes in a feature vector of a node

pa_type = pyarrow.binary(features_numpy.dtype.itemsize * features_numpy.shape[1])
   #
print(pa_type)

#  buffers[0] is the validity bitmap slot (None here), buffers[1] is the data
arrow_buffer = pyarrow.py_buffer(features_numpy.data)
buffers = [None, arrow_buffer]
   #
print(arrow_buffer)
print(buffers)


#   # creates pyarrow wrapper over the numpy array

pyarrow_array = pyarrow.Array.from_buffers(pa_type, number_of_nodes, buffers=buffers)
   #
print(pyarrow_array)



#   
#   print("created pyarrow array",flush=True)
#   
#   # to table

#  table = pyarrow.Table.from_arrays([pyarrow_array], ["feature"])




#   # save to in-memory graph
#   original_graph.upsert_node_property(table)
#   
#   print("Begin write via commit",flush=True)
#   # commit: this is an in place operation
#   original_graph.write()



In [None]:

def my_func_JJJ(i_graph, i_properties):
   """Load node properties, add an upper-cased "fname" column, then hand off to a save step.

   The save step is a placeholder: its upsert call is commented out, so no
   data is written to the graph. Previously the placeholder had no body at
   all, which is a syntax error and prevented the whole cell from running.

   NOTE(review): an earlier cell in this notebook also defines a function
   named my_func_JJJ with a different signature; running this cell rebinds
   the name.

   Args:
      i_graph:      graph object handed to the UDF.
      i_properties: list of node property names; they become the DataFrame's
                    column names. The enrichment step reads "fname".

   Returns:
      The value of ``single_host(host=0, result=<DataFrame>)``; the
      DataFrame has the requested columns plus "fname_upper".
   """
    
   from katana_enterprise.distributed import single_host
      #
   import numpy  as np
   import pandas as pd
    

   ##################################################################

   #  Nested function, load a Python DataFrame from the KG graph
    
   def l_load_df(i_graph, i_properties):
      l_nodes = []
    
      for l_property in i_properties:
         #  reshape(-1, 1) turns each property into a single column
         l_node = i_graph.get_node_property(l_property).to_numpy().reshape(-1, 1)
         l_nodes.append(l_node)
       
      l_array     = np.hstack(l_nodes)
      l_dataframe = pd.DataFrame(l_array, columns=i_properties)
    
      return l_dataframe
    
   ##################################################################

   #  Nested function, add a new, derived column to the DataFrame (in place)
    
   def l_enrich_df(i_dataframe, i_source_property, i_new_property):
    
      def l_to_upper(i_str):
         #  i_str is one DataFrame row; a missing source value stays None
         if i_str.get(i_source_property) is not None:
            return str(i_str[i_source_property]).upper()
         else:
            return
    
      i_dataframe[i_new_property] = i_dataframe.apply(l_to_upper, axis=1)
    
      return

   ##################################################################

   #  Nested function, placeholder for saving the DataFrame to the graph
    
   def l_save_df(i_graph, i_dataframe):
    
      #  Intentionally a no-op for now; the upsert stays disabled
      #
      # i_graph.upsert_node_property(i_dataframe)

      return
    
   ##################################################################
    
   l_dataframe = l_load_df(i_graph, i_properties)
      #
   l_enrich_df(l_dataframe, "fname", "fname_upper")
      #
   l_save_df(i_graph, l_dataframe)

        
   return single_host(host=0, result=l_dataframe)


l_result = my_graph.run(lambda g: my_func_JJJ(g, ["id", "fname", "lname"]))
   #
print(l_result)

