In [None]:

#  Here we demonstrate the following Dask DataFrame techniques;
#
#     .  Test, How to fetch data from a graph
#


#  Part 01:  Setup, generate data

In [None]:

NUM_PARTITIONS  = 3
NUM_ROWS        = 40000

print("--")


In [None]:

import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate

print("--")


In [None]:


#  Generates (NUM_ROWS) rows of data
#

l_row = {"col1": list(), "col2": list()}
   #
for l_cntr in range(NUM_ROWS):
   l_row["col1"].append(l_cntr)
   l_row["col2"].append("Item number: " + str(l_cntr))
    
pd_data = pd.DataFrame(l_row, columns = ["col1", "col2"])
dd_data = from_pandas(pd_data, npartitions = NUM_PARTITIONS)


l_cntr = 0
   #
for l_each in dd_data.itertuples():
   l_cntr += 1
      #
   if (l_cntr <5):
      print("Row (%d) of (%d) total rows.   Col1: (%4d)   Col2: (%s)" % (l_cntr, len(dd_data), l_each.col1, l_each.col2))
        
print("--")



#  Part 02:  Create a graph

In [None]:

from katana import remote
from katana.remote import import_data

my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph" 

print("--")


In [None]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPHS

my_graph=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


#  Part 03:  Import into the graph

In [None]:

with import_data.DataFrameImporter(my_graph) as df_importer:
    
   #  Movies
   #
   df_importer.nodes_dataframe(
      dd_data[["col1", "col2"]],
      id_column  = "col1",
      id_space   = "dd_data"
      )

   df_importer.insert()


print("--")


In [None]:

display("Number of Graph Nodes: %d" % (my_graph.num_nodes()))
display("Number of Graph Edges: %s" % (my_graph.num_edges()))

#  Sample outpt,
#
#     'Number of Graph Nodes: 40000'
#     'Number of Graph Edges: 0'


#  Part 04:  Run a traversal, return results

In [None]:

#  Aggregates, by definition, can return only one row. We use a slightly different
#  technique to get the return value.
#

l_query  = """
   MATCH (n) 
   RETURN COUNT(*) AS cnt
   """.format()

l_count = my_graph.query(l_query)["cnt"][0]


print(l_count)
print(type(l_count))

#  Sample output,
#
#     40000
#     <class 'numpy.int64'>


In [None]:

#  Returning actual data, scale: 10 rows
#

l_rows = 10

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)

dd_result1 = my_graph.query(l_query)


print(tabulate(dd_result1, headers='keys', tablefmt='psql'))
print("")
print(type(dd_result1))

#  Sample output,
#
#     +----+--------+-----------------+
#     |    |   col1 | col2            |
#     |----+--------+-----------------|
#     |  0 |      1 | Item number: 1  |
#     |  1 |      6 | Item number: 6  |
#     |  2 |      8 | Item number: 8  |
#     |  3 |     14 | Item number: 14 |
#     |  4 |     17 | Item number: 17 |
#     |  5 |     21 | Item number: 21 |
#     |  6 |     22 | Item number: 22 |
#     |  7 |     26 | Item number: 26 |
#     |  8 |     28 | Item number: 28 |
#     |  9 |     30 | Item number: 30 |
#     +----+--------+-----------------+
#
#     <class 'katana.remote.ResultSet'>


In [None]:

#  Return (NUM_ROWS: 40,000) rows.
#
#     This as step 1 when we want to validate contents, or use graph data
#     for ML routines, other.
#
#     You should see the Python kernel hang, become unresponsive.
#        You'll see the upper-right GIG wheel hang. Or, just try running
#        the next cell; it's hung.


#  l_rows = 40000                                #  My original use case, Hangs
#  l_rows = 5000                                 #  Takes 10 (?) seconds, then runs/continues
   #
l_rows = 1000

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)

dd_result2 = my_graph.query(l_query)


l_cntr = 0
   #
for l_each in dd_result2.itertuples():
   l_cntr += 1  
      #
   if (l_cntr < 10):
      print("Counter: %-4d   (%s)   (%s)" % (l_cntr, l_each.col1, l_each.col2))

print("")
print("Number of rows in DataFrame: %d   Value of l_cntr: %d" % (len(dd_result2), l_cntr))
print("--")

#  Sample output,
#
#     Counter: 1      (0)   (Item number: 0)
#     Counter: 2      (5)   (Item number: 5)
#     Counter: 3      (6)   (Item number: 6)
#     Counter: 4      (8)   (Item number: 8)
#     Counter: 5      (9)   (Item number: 9)
#     Counter: 6      (11)   (Item number: 11)
#     Counter: 7      (12)   (Item number: 12)
#     Counter: 8      (13)   (Item number: 13)
#     Counter: 9      (17)   (Item number: 17)
#  
#     Number of rows in DataFrame: 1000   Value of l_cntr: 1000


In [None]:

#  Use this to see if/when next cell will run
#
print("--")


In [20]:

#  Move the above to paginated fetching ..
#

#  l_rows = 40000
#  l_rows = 5000 
l_rows = 1000

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)


#  Returns,  katana_enterprise.remote.sync_wrappers.PaginatedResultSet
#     It has no length
#     It has no itertuples()
#     It does have a head, maxmimum 40
#
dd_result3 = my_graph.query_paginated(l_query)


#  print(dd_result3.head(10))
#
#   Maximum value: 40
#
#     print(dd_result3.head(44))
#     >> ValueError: You can only view a maximum of 40 rows.


l_cntr = 0
l_row2 = {"col1": list(), "col2": list()}
   #
for l_each in dd_result3:
   l_cntr += 1
      #
   if (l_cntr < 10):
      # print("Counter: %-4d   (%s)   (%s)" % (l_cntr, l_each.col1, l_each.col2))
      l_row["col1"].append(l_each)

print("")
print(l_cntr)


print(l_row)





          0/? [?op/s]


3
{'col1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21