In [1]:

#  Here we demonstrate the following Dask DataFrame technique:
#
#     .  Test: how to fetch data from a graph
#


#  Part 01:  Setup, generate data

In [2]:

#  Number of partitions to split the generated data into
NUM_PARTITIONS = 3

#  Number of rows of test data to generate
NUM_ROWS = 40000

#  Marker, so it is visible that the cell ran to completion
print("--")


--


In [3]:

import numpy as np
import pandas as pd
   #
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate

print("--")


--


In [4]:


#  Generates (NUM_ROWS) rows of test data as a Pandas DataFrame, then
#  wraps it in a Dask DataFrame that has (NUM_PARTITIONS) partitions.
#
#     col1:  row counter, 0 .. NUM_ROWS - 1
#     col2:  the string "Item number: <col1>"
#

l_row = {
   "col1": list(range(NUM_ROWS)),
   "col2": ["Item number: " + str(l_id) for l_id in range(NUM_ROWS)],
   }

pd_data = pd.DataFrame(l_row, columns = ["col1", "col2"])
dd_data = from_pandas(pd_data, npartitions = NUM_PARTITIONS)


#  Print the first 4 rows as a sanity check.
#
#     len() on a Dask DataFrame has to be computed, so it is evaluated
#     once up front rather than once per printed row.
#
#     The loop stops right after the last row we print; there is no
#     reason to pull all (NUM_ROWS) rows through itertuples() just to
#     show 4 of them.
#
l_total = len(dd_data)
l_cntr  = 0
   #
for l_each in dd_data.itertuples():
   l_cntr += 1
      #
   if (l_cntr >= 5):
      break
   print("Row (%d) of (%d) total rows.   Col1: (%4d)   Col2: (%s)" % (l_cntr, l_total, l_each.col1, l_each.col2))

print("--")



Row (1) of (40000) total rows.   Col1: (   0)   Col2: (Item number: 0)
Row (2) of (40000) total rows.   Col1: (   1)   Col2: (Item number: 1)
Row (3) of (40000) total rows.   Col1: (   2)   Col2: (Item number: 2)
Row (4) of (40000) total rows.   Col1: (   3)   Col2: (Item number: 3)
--


#  Part 02:  Create a graph

In [5]:

from katana import remote
from katana.remote import import_data

#  Create the client handle; every database and graph call in the cells
#  below goes through (my_client)
my_client = remote.Client()

#  Printing the handle shows that the object was created
print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f5585ae5fa0>


In [6]:

#  Number of partitions the graph is created with.
#     Same name and value as the constant in the setup cell at the top
#     of the notebook; keep the two in step.
NUM_PARTITIONS = 3

#  Names of the database and the graph that the cells below create
DB_NAME = "my_db"
GRAPH_NAME = "my_graph"

print("--")


--


In [7]:

#  DELETE ALL GRAPHS
#
#     Walk every database and delete every graph found in it. The
#     database handle is looked up once per database and reused for
#     each of its graphs, rather than being fetched again per graph.
#

for l_database in my_client.databases():
   l_db_handle = my_client.get_database(name=l_database.name)
      #
   for l_graph in l_db_handle.graphs_in_database():
      l_handle = l_db_handle.get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

#  List whatever graphs remain; after the deletes above this is
#  expected to print nothing
#
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


--


In [8]:

#  DELETE ALL DATABASES
#
#     Every database is deleted, except the one named "default".
#     One "--" marker is printed per database deleted.
#

for l_database in my_client.databases():
   if (l_database.name == "default"):
      continue
      #
   l_doomed = my_client.get_database(name=l_database.name)
   l_doomed.delete_database()
   print("--")

#  List the databases that are left
#
for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


--
DB ID:  8hhMf8JLDC1qkKSgSHj7pTKCBJzR1vb5LtSRy9qMpFQW      DB Name:  default


In [9]:

#  CREATE DATABASE
#
#     Creates the database named by (DB_NAME), keeps the returned handle
#     in (my_database), and prints the id of the new database.
#

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


9NSHd8mPczDg7GJ8ie91trmytXh1MWhC3hynFqeUFK7y


In [10]:

#  CREATE GRAPH
#
#     Creates one graph, named by (GRAPH_NAME), inside the database named
#     by (DB_NAME), with (NUM_PARTITIONS) partitions. The returned handle,
#     (my_graph), is what every import and query cell below uses.
#

my_graph=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


<_Graph my_graph, ShfSLBgFBmQ25QD49kzW8ia8LhqKuGEuHfizS7CDjEw, 0>


#  Part 03:  Import into the graph

In [11]:

#  Import the generated Dask DataFrame (dd_data) into (my_graph) as node
#  data. Only nodes are registered here; no edge data is supplied.
#
with import_data.DataFrameImporter(my_graph) as df_importer:
    
   #  Nodes, from the generated test data; (col1) is the id column
   #
   df_importer.nodes_dataframe(
      dd_data[["col1", "col2"]],
      id_column  = "col1",
      id_space   = "dd_data"
      )

   #  Called after the DataFrame has been registered above
   df_importer.insert()


print("--")




          0/? [?op/s]

          0/? [?op/s]

--


In [12]:

#  Confirm the import; report how many nodes and edges the graph now has.
#  Only node data was imported above, so no edges are expected.
#
display("Number of Graph Nodes: %d" % (my_graph.num_nodes()))
display("Number of Graph Edges: %s" % (my_graph.num_edges()))

#  Sample output,
#
#     'Number of Graph Nodes: 40000'
#     'Number of Graph Edges: 0'


          0/? [?op/s]

'Number of Graph Nodes: 40000'

          0/? [?op/s]

'Number of Graph Edges: 0'

#  Part 04:  Run a traversal, return results

In [13]:

#  This query is a single aggregate, so the result has exactly one row.
#  Rather than looping over the result, we pull the scalar straight out
#  of row 0 of the (cnt) column.
#

l_query = """
   MATCH (n) 
   RETURN COUNT(*) AS cnt
   """

l_result = my_graph.query(l_query)
l_count  = l_result["cnt"][0]


print(l_count)
print(type(l_count))

#  Sample output,
#
#     40000
#     <class 'numpy.int64'>


          0/? [?op/s]

40000
<class 'numpy.int64'>


In [14]:

#  Returning actual data, scale: 10 rows
#
#     The query has a LIMIT but no ORDER BY, and the rows that come back
#     can differ from run to run; expect different values than the
#     sample output below.
#

l_rows = 10

l_template = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """
l_query = l_template.format(l_rows)

dd_result1 = my_graph.query(l_query)


#  Render the result as a text table, then show the type that query() returned
#
l_table = tabulate(dd_result1, headers='keys', tablefmt='psql')
   #
print(l_table)
print("")
print(type(dd_result1))

#  Sample output,
#
#     +----+--------+-----------------+
#     |    |   col1 | col2            |
#     |----+--------+-----------------|
#     |  0 |      1 | Item number: 1  |
#     |  1 |      6 | Item number: 6  |
#     |  2 |      8 | Item number: 8  |
#     |  3 |     14 | Item number: 14 |
#     |  4 |     17 | Item number: 17 |
#     |  5 |     21 | Item number: 21 |
#     |  6 |     22 | Item number: 22 |
#     |  7 |     26 | Item number: 26 |
#     |  8 |     28 | Item number: 28 |
#     |  9 |     30 | Item number: 30 |
#     +----+--------+-----------------+
#
#     <class 'katana.remote.ResultSet'>


          0/? [?op/s]

+----+--------+--------------------+
|    |   col1 | col2               |
|----+--------+--------------------|
|  0 |  30110 | Item number: 30110 |
|  1 |  30114 | Item number: 30114 |
|  2 |  30116 | Item number: 30116 |
|  3 |  30126 | Item number: 30126 |
|  4 |  30127 | Item number: 30127 |
|  5 |  30128 | Item number: 30128 |
|  6 |  30129 | Item number: 30129 |
|  7 |  30136 | Item number: 30136 |
|  8 |  30137 | Item number: 30137 |
|  9 |  30138 | Item number: 30138 |
+----+--------+--------------------+

<class 'katana.remote.ResultSet'>


In [15]:

#  Return (NUM_ROWS: 40,000) rows.
#
#     This is step 1 when we want to validate contents, or use graph data
#     for ML routines, and so on.
#
#     At the full row count you should see the Python kernel hang, become
#     unresponsive. You'll see the upper-right GIF wheel hang. Or, just try
#     running the next cell; it's hung.


#  l_rows = 40000                                #  My original use case, Hangs
#  l_rows = 5000                                 #  Takes 10 (?) seconds, then runs/continues
   #
l_rows = 1000

l_template = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """
l_query = l_template.format(l_rows)

dd_result2 = my_graph.query(l_query)


#  Walk every row so that (l_cntr) ends up as the number of rows actually
#  iterated; only the first 9 rows are printed. (l_cntr) stays 0 when the
#  result is empty.
#
l_cntr = 0
   #
for l_cntr, l_each in enumerate(dd_result2.itertuples(), start = 1):
   if (l_cntr >= 10):
      continue
      #
   print("Counter: %-4d   (%s)   (%s)" % (l_cntr, l_each.col1, l_each.col2))

print("")
print("Number of rows in DataFrame: %d   Value of l_cntr: %d" % (len(dd_result2), l_cntr))
print("--")

#  Sample output,
#
#     Counter: 1      (0)   (Item number: 0)
#     Counter: 2      (5)   (Item number: 5)
#     Counter: 3      (6)   (Item number: 6)
#     Counter: 4      (8)   (Item number: 8)
#     Counter: 5      (9)   (Item number: 9)
#     Counter: 6      (11)   (Item number: 11)
#     Counter: 7      (12)   (Item number: 12)
#     Counter: 8      (13)   (Item number: 13)
#     Counter: 9      (17)   (Item number: 17)
#  
#     Number of rows in DataFrame: 1000   Value of l_cntr: 1000


          0/? [?op/s]

Counter: 1      (1)   (Item number: 1)
Counter: 2      (5)   (Item number: 5)
Counter: 3      (7)   (Item number: 7)
Counter: 4      (8)   (Item number: 8)
Counter: 5      (10)   (Item number: 10)
Counter: 6      (12)   (Item number: 12)
Counter: 7      (15)   (Item number: 15)
Counter: 8      (19)   (Item number: 19)
Counter: 9      (20)   (Item number: 20)

Number of rows in DataFrame: 1000   Value of l_cntr: 1000
--


In [None]:

#  Probe cell: run this right after the large fetch above. If the "--"
#  does not appear, the kernel is still busy (or hung) on that cell.
#
print("--")


In [None]:

#  Move the above to paginated fetching ..
#

#  l_rows = 40000
#  l_rows = 5000 
l_rows = 10000

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)


dd_result4 = my_graph.query_paginated(l_query)
#
#  Returns,  katana_enterprise.remote.sync_wrappers.PaginatedResultSet
#     It has no length
#     It has no itertuples()
#     It does have a head, maximum 40
#
#  The above is true, because of [ paging ]. Iterating the result set
#  yields one page at a time, and each page does have itertuples().
#  See example below.


#  l_resultA = dd_result4.head(10)
#  print(type(l_resultA))
      #
#   Maximum value: 40
#
#     print(dd_result4.head(44))
#     >> ValueError: You can only view a maximum of 40 rows.


#  Using a query_paginated()
#
#     l_cntr1 counts pages
#     l_cntr2 counts rows, across all pages
#
l_cntr1 = 0
l_cntr2 = 0
   #
for l_page in dd_result4:
   l_cntr1 += 1
   for l_each in l_page.itertuples():
      l_cntr2 += 1
         #
      if (l_cntr2 < 10):
         print("Counter: %-4d   (%s)   (%s)" % (l_cntr2, l_each.col1, l_each.col2))

#  Rows first, pages second, to match the order of the labels in the message
print("")
print("Number of rows in total result set: %d   Number of pages: %d" % (l_cntr2, l_cntr1))

        
        
        
        