#  Setup: Display options

In [1]:

#  Setting display options
#
#  Configures pandas / IPython output for the cells that follow.

import pandas as pd
   #
#  Raise the pandas display width to 480 so wide frames are less likely to wrap
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
#  NOTE(review): the HTML payload passed below is an empty string, so as
#  written this call has nothing to render; presumably a style block was
#  lost at some point -- confirm.
#
from IPython.display import display, HTML
display(HTML(""))

#  In the cells visible here, tabulate is only referenced from commented-out
#  print lines
from tabulate import tabulate

#  End-of-cell marker
print("--")


--


# Setup: Connectivity

In [2]:

#  Build the client object; every database and graph call below goes
#  through my_client
#
from katana import remote

my_client = remote.Client()

#  Prints the object repr, which confirms the client was constructed
print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f3ab454ff10>


In [3]:

#  Notebook-wide settings
#
NUM_PARTITIONS  = 3               #  Passed to create_graph() and to Dask from_pandas()
   #
DB_NAME         = "my_db"         #  Database created / looked up in the cells below
GRAPH_NAME      = "my_graph"      #  Graph created / looked up in the cells below

print("--")


--


In [4]:

#  DELETE ALL GRAPHS
#
#  Walks every database known to the client and deletes each graph found
#  in it, then lists whatever graphs the client still reports.

for l_database in my_client.databases():
      #
   #  Look the database handle up once and reuse it, rather than repeating
   #  the same lookup for the listing and again for every graph
   #
   l_db_handle = my_client.get_database(name=l_database.name)
      #
   for l_graph in l_db_handle.graphs_in_database():
      l_db_handle.get_graph_by_id(id=l_graph.graph_id).delete()

#  Anything printed here survived the delete loop above
#
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


--


In [None]:

#  DELETE ALL DATABASES
#
#  Deletes every database except the one named "default", then lists the
#  databases the client still reports.

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      #  NOTE(review): this marker sits inside the loop, so it prints once
      #  per deleted database rather than once per cell as in the other
      #  cells -- confirm this is intended
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE
#
#  Creates the database named by DB_NAME and prints the id of the
#  returned database object.

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [5]:

#  CREATE GRAPH
#
#  Creates GRAPH_NAME inside the database named DB_NAME, asking for
#  NUM_PARTITIONS partitions. The database is looked up by name here, so
#  presumably it must already exist (see CREATE DATABASE) -- confirm.
#
my_graph = my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


<_Graph my_graph, J4ugcL4QUXrzywMrEjaLkx6bRjAusQQgfrdkA8pFTdzm, 0>


In [None]:

#  CONNECT TO GRAPH
#
#  Rebinds my_graph to a graph in DB_NAME that is named GRAPH_NAME.
#
#  The starred unpacking keeps the first match and discards any others.
#  If the lookup yields no graphs, the unpacking itself raises ValueError.

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


# Step 01: Support Methods to Create DataFrames ..

In [6]:

#  The overall goal of this NoteBook is to show incremental import. That is, importing
#  into the graph many smaller pieces versus perhaps running one single import operation.
#
#  As such, we'll create (n) DataFrames with Nodes, (m) with Edges, and import those.
#
#     (n) and (m) are determined by how we invoke the method below.
#

import pandas as pd
   #
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas


#  Generate a Dask DataFrame given a few column variables
#
def f_create_dataframe(i_node_or_edge, i_keyranges, i_label_or_type):
   """
   Generate a small synthetic Dask DataFrame of either node or edge records.

   Arguments:
      i_node_or_edge    "node" builds node records; any other value builds
                        edge records.
      i_keyranges       List of integers.
                           Nodes:  [first_key, last_key], both inclusive.
                           Edges:  [left_first, left_last, per_left,
                                    right_first, right_last]; see below.
      i_label_or_type   Written to the LABEL column (nodes) or the TYPE
                        column (edges). For nodes, this value plus "_id",
                        lower-cased, also names the key column; e.g.
                        "Person" gives "person_id".

   Edge generation:
      .  The outer loop walks the left keys from i_keyranges[0] through
         i_keyranges[1].
      .  For each left key, the inner loop runs i_keyranges[2] times.
      .  Right keys are handed out sequentially from i_keyranges[3]
         through i_keyranges[4], starting over at i_keyranges[3] once
         the range is used up.
      .  Every (left, right) pair produces two rows, one per direction.

   Returns:
      A Dask DataFrame built by from_pandas() using the notebook-level
      NUM_PARTITIONS. An empty key range hands from_pandas() a frame that
      has the expected columns and no rows; previously that case raised
      because no frame had been built.
   """

   if (i_node_or_edge == "node"):
      #
      #  The key column name does not depend on the key, so compute it once
      #
      l_key_str = str(i_label_or_type + "_id").lower()
      l_columns = [l_key_str, "col2", "col3", "col4", "LABEL"]
         #
      l_data_array = [
         { l_key_str: l_key, "col2": "222", "col3": "333", "col4": 444, "LABEL": i_label_or_type}
         for l_key in range(i_keyranges[0], i_keyranges[1] + 1)
         ]
   else:
      l_columns    = ["start_id", "end_id", "weight", "uniq_col", "TYPE"]
      l_data_array = []
         #
      #  Start one below the range so the first increment lands on the
      #  first right key
      #
      l_right_key = i_keyranges[3] - 1
         #
      for l_left_key in range(i_keyranges[0], i_keyranges[1] + 1):
         for l_each in range(1, i_keyranges[2] + 1):
            l_right_key +=1
               #
            #  Ran past the last right key; start over at the first
            #
            if (l_right_key > i_keyranges[4]):
               l_right_key = i_keyranges[3]
                  #
            #  Same pair written in both directions
            #
            l_data_array.append({ "start_id": l_left_key,  "end_id": l_right_key, "weight": 444, "uniq_col": str(l_each), "TYPE": i_label_or_type})
            l_data_array.append({ "end_id": l_left_key,  "start_id": l_right_key, "weight": 444, "uniq_col": str(l_each), "TYPE": i_label_or_type})

   #  Build the pandas frame exactly once, after all records are collected,
   #  so it exists even when no records were produced
   #
   l_dataframe = pd.DataFrame(l_data_array, columns = l_columns)

   return from_pandas(l_dataframe, npartitions = NUM_PARTITIONS)


#  The name of the primary key column must be consistently named across
#  all nodes in the graph. Easy code, but place it in a method regardless.
#
def f_copy_id_col(i_dataframe, i_colname):
   """
   Add an "id" column holding the same values as column i_colname.

   The new column is assigned onto the frame that was passed in, and that
   same frame is returned. The source column is left in place, so this is
   a copy of the key column rather than a rename of it.
   """
   l_source_col = i_dataframe[i_colname]
      #
   #  Identity map; every value is carried across unchanged
   #
   i_dataframe["id"] = l_source_col.map(lambda l_value: l_value)

   return i_dataframe
    

print("--")
                             


--


In [None]:

#  Example demonstrating the above
#
#  Builds two small node frames and one edge frame, prints each, then
#  deletes them again so nothing from this cell is left behind.
#

#  Create Person nodes in the range 100-103
#
df_nodes1 = f_create_dataframe("node", [100, 103], "Person")
   #
#  print(tabulate(df_nodes1, headers='keys', tablefmt='psql'))
print(df_nodes1.compute())


#  Create Flight nodes in the range 400-410
#
df_nodes2 = f_create_dataframe("node", [400, 410], "Flight")
   #
#  print(tabulate(df_nodes2, headers='keys', tablefmt='psql'))
print(df_nodes2.compute())


#  From this method invocation,
#
#     df_edges  = f_create_dataframe("edge", [100, 103, 2, 400, 410], "TAKES_FLIGHT")
#
#  Create 2 TAKES_FLIGHT edges for each left-edge node in the
#  range 100-103. Whatever this loop creates, assign right-edge
#  key values sequentially from 400 to 410.
#
#  If we need more right-edge records, start over at 400.
#
#  If there are not enough left-edge records produced, not
#  all right-edge nodes may receive an edge.
#
#     If so, you may call this method again, say with these
#     arguments,
#        df_edges  = f_create_dataframe("edge", [100, 103, 1, 404, 406], "TAKES_FLIGHT")
#        df_edges  = f_create_dataframe("edge", [100, 103, 1, 407, 408], "TAKES_FLIGHT")
#        df_edges  = f_create_dataframe("edge", [100, 103, 1, 409, 410], "TAKES_FLIGHT")
#
#     or whatever
#
#  Per a standard graph import, if you create edges for which there is no node,
#  the record is discarded.
#
#  Edges are automatically created to be bi-directional; each pair is
#  written as two rows, one per direction.
#


df_edges  = f_create_dataframe("edge", [100, 103, 2, 400, 410], "TAKES_FLIGHT")
   #
# print(tabulate(df_edges, headers='keys', tablefmt='psql'))
print(df_edges.compute())


#  These frames were only for demonstration; drop them
#
del df_nodes1
del df_nodes2
del df_edges


print("--")

#  Sample output
#
#     +----+-------------+--------+--------+--------+---------+
#     |    |   person_id |   col2 |   col3 |   col4 | LABEL   |
#     |----+-------------+--------+--------+--------+---------|
#     |  0 |         100 |    222 |    333 |    444 | Person  |
#     |  1 |         101 |    222 |    333 |    444 | Person  |
#     |  2 |         102 |    222 |    333 |    444 | Person  |
#     |  3 |         103 |    222 |    333 |    444 | Person  |
#     +----+-------------+--------+--------+--------+---------+
#     +----+-------------+--------+--------+--------+---------+
#     |    |   flight_id |   col2 |   col3 |   col4 | LABEL   |
#     |----+-------------+--------+--------+--------+---------|
#     |  0 |         400 |    222 |    333 |    444 | Flight  |
#     |  1 |         401 |    222 |    333 |    444 | Flight  |
#     |  2 |         402 |    222 |    333 |    444 | Flight  |
#     |  3 |         403 |    222 |    333 |    444 | Flight  |
#     |  4 |         404 |    222 |    333 |    444 | Flight  |
#     |  5 |         405 |    222 |    333 |    444 | Flight  |
#     |  6 |         406 |    222 |    333 |    444 | Flight  |
#     |  7 |         407 |    222 |    333 |    444 | Flight  |
#     |  8 |         408 |    222 |    333 |    444 | Flight  |
#     |  9 |         409 |    222 |    333 |    444 | Flight  |
#     | 10 |         410 |    222 |    333 |    444 | Flight  |
#     +----+-------------+--------+--------+--------+---------+
#     +----+------------+----------+----------+------------+--------------+
#     |    |   start_id |   end_id |   weight |   uniq_col | TYPE         |
#     |----+------------+----------+----------+------------+--------------|
#     |  0 |        100 |      400 |      444 |          1 | TAKES_FLIGHT |
#     |  1 |        400 |      100 |      444 |          1 | TAKES_FLIGHT |
#     |  2 |        100 |      401 |      444 |          2 | TAKES_FLIGHT |
#     |  3 |        401 |      100 |      444 |          2 | TAKES_FLIGHT |
#     |  4 |        101 |      402 |      444 |          1 | TAKES_FLIGHT |
#     |  5 |        402 |      101 |      444 |          1 | TAKES_FLIGHT |
#     |  6 |        101 |      403 |      444 |          2 | TAKES_FLIGHT |
#     |  7 |        403 |      101 |      444 |          2 | TAKES_FLIGHT |
#     |  8 |        102 |      404 |      444 |          1 | TAKES_FLIGHT |
#     |  9 |        404 |      102 |      444 |          1 | TAKES_FLIGHT |
#     | 10 |        102 |      405 |      444 |          2 | TAKES_FLIGHT |
#     | 11 |        405 |      102 |      444 |          2 | TAKES_FLIGHT |
#     | 12 |        103 |      406 |      444 |          1 | TAKES_FLIGHT |
#     | 13 |        406 |      103 |      444 |          1 | TAKES_FLIGHT |
#     | 14 |        103 |      407 |      444 |          2 | TAKES_FLIGHT |
#     | 15 |        407 |      103 |      444 |          2 | TAKES_FLIGHT |
#     +----+------------+----------+----------+------------+--------------+

                              

#  Step 02:  Make Some Data, Import into Graph ..

In [7]:


#  First, make some data
#
#  Three Person frames of 100 rows each and one Flight frame, followed by
#  four edge frames. Nothing is imported in this cell.
#

df_persons1 = f_create_dataframe("node", [100, 199], "Person")
df_persons2 = f_create_dataframe("node", [300, 399], "Person")
df_persons3 = f_create_dataframe("node", [400, 499], "Person")

df_flights0 = f_create_dataframe("node", [800, 810], "Flight")                          #  800-810 is actually 11 rows, just fyi


#  Since there are so many more People than Flights, it's easier to
#  invoke the Edge creation from (Small --> Large)
#
#  Edges are automatically created to be bi-directional
#
#  In each call the left keys are the 11 Flights (800-810) and the right
#  keys are Persons; every pair is written as two rows.
#

#  5 per Flight = 55 pairs; Persons 400-410 are cycled through 5 times
df_edges1  = f_create_dataframe("edge", [800, 810, 5, 400, 410], "TAKES_FLIGHT")
#  1 per Flight = 11 pairs; only Persons 411-421 are reached
df_edges2  = f_create_dataframe("edge", [800, 810, 1, 411, 499], "TAKES_FLIGHT")
   #
#  2 per Flight = 22 pairs each; Persons 100-121 and 300-321 are reached
df_edges3  = f_create_dataframe("edge", [800, 810, 2, 100, 199], "TAKES_FLIGHT")
df_edges4  = f_create_dataframe("edge", [800, 810, 2, 300, 399], "TAKES_FLIGHT")


print("--")


--


In [8]:

#  At this point (from above), we expect to have an empty graph
#
#  Each frame is imported through its own DataFrameImporter context, which
#  is the incremental-import pattern this notebook sets out to show.
#

from katana.remote import import_data


#  Persons
#
#  f_copy_id_col() adds the shared "id" key column; only "id" and "col3"
#  are then handed to the importer.
#
for l_each in [df_persons1, df_persons2, df_persons3]:
      #
   l_each2 = f_copy_id_col(l_each, "person_id")
      #
   with import_data.DataFrameImporter(my_graph) as df_importer:   
      df_importer.nodes_dataframe(
         l_each2[["id", "col3"]],                                  #  Example of subscripting columns
         id_column             = "id",
         id_space              = "Person",  
         label                 = "Person",  
         ) 
            #
      df_importer.insert()
    
    
#  Flights
#
#  Unlike Persons, the whole frame is passed, so every column is imported.
#
for l_each in [df_flights0]:
      #
   l_each2 = f_copy_id_col(l_each, "flight_id")
      #
   with import_data.DataFrameImporter(my_graph) as df_importer:   
      df_importer.nodes_dataframe(
         l_each2,
         id_column             = "id",
         id_space              = "Flight",  
         label                 = "Flight",  
         ) 
            #
      df_importer.insert()
    
    
#  Edges; TAKES_FLIGHT
#
#  Ours are bidirectional, so some of the ordering below does not matter
#
#  NOTE(review): the recorded run reports 110 edges, which is one per
#  generated pair (55 + 11 + 22 + 22) rather than two. Presumably the
#  mirrored rows, whose start_id is a Flight key, find no node in the
#  "Person" id space and are discarded -- confirm.
#
for l_each in [df_edges1, df_edges2, df_edges3, df_edges4]:
      #
   with import_data.DataFrameImporter(my_graph) as df_importer:   
      df_importer.edges_dataframe(
         l_each, 
         source_id_space       = "Person", 
         destination_id_space  = "Flight",   
         source_column         = "start_id",
         destination_column    = "end_id",
         type                  = "TAKES_FLIGHT"
         )
            #
      df_importer.node_id_property_name("id")                        #  This line is required when you are not inserting any Nodes, only Edges
         #
      df_importer.insert()


#  Totals after all imports; the recorded run shows 311 nodes
#  (300 Persons + 11 Flights) and 110 edges
#
display(my_graph.num_nodes())
display(my_graph.num_edges())
   #
print("--")




          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]



          0/? [?op/s]

          0/? [?op/s]



          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

          0/? [?op/s]

311

          0/? [?op/s]

110

--


#  Step 03:  Checking our work ..

In [None]:

#  Draw the graph
#
#  Matches every directed (n)-[r]->(m) pattern, capped at 1000 rows, and
#  hands the result to its view() method.
#

l_result = my_graph.query("""

   MATCH ( n ) - [ r ] -> ( m )
   RETURN n, r, m
   LIMIT 1000
   
   """, contextualize=True)

l_result.view()


In [9]:

#  Checking the subscripting of columns, above
#
#  Person nodes were imported from the ["id", "col3"] subset only, while
#  Flight nodes were imported with every column; the two results below
#  show that difference.
#

#  First five Person nodes
l_result = my_graph.query("""
   MATCH ( n: Person )
   RETURN n
   LIMIT 5
   """, contextualize=True)
      #
display(l_result)

#  First five Flight nodes
l_result = my_graph.query("""
   MATCH ( n: Flight )
   RETURN n
   LIMIT 5
   """, contextualize=True)
      #
display(l_result)

#  Sample output,
#
#     	n.internal_id 	n.labels 	n.col3 	n.id 	n.type
#     0 	0 	[Person] 	333 	100 	node
#     1 	1 	[Person] 	333 	102 	node
#     2 	2 	[Person] 	333 	104 	node
#     3 	3 	[Person] 	333 	108 	node
#     4 	4 	[Person] 	333 	111 	node
#     
#     	n.internal_id 	n.labels 	n.LABEL 	n.col2 	n.col3 	n.col4 	n.flight_id 	n.id 	n.type
#     0 	94 	[Flight] 	Flight 	222 	333 	444 	805 	805 	node
#     1 	95 	[Flight] 	Flight 	222 	333 	444 	806 	806 	node
#     2 	96 	[Flight] 	Flight 	222 	333 	444 	807 	807 	node
#     3 	97 	[Flight] 	Flight 	222 	333 	444 	808 	808 	node
#     4 	98 	[Flight] 	Flight 	222 	333 	444 	810 	810 	node


          0/? [?op/s]

          0/? [?op/s]

Unnamed: 0,n.internal_id,n.labels,n.col3,n.id,n.type
0,0,[Person],333,100,node
1,1,[Person],333,102,node
2,2,[Person],333,104,node
3,3,[Person],333,108,node
4,4,[Person],333,111,node


          0/? [?op/s]

          0/? [?op/s]

Unnamed: 0,n.internal_id,n.labels,n.LABEL,n.col2,n.col3,n.col4,n.flight_id,n.id,n.type
0,94,[Flight],Flight,222,333,444,805,805,node
1,95,[Flight],Flight,222,333,444,806,806,node
2,96,[Flight],Flight,222,333,444,807,807,node
3,97,[Flight],Flight,222,333,444,808,808,node
4,98,[Flight],Flight,222,333,444,810,810,node
