In [None]:

#  Here we demonstrate the following Dask DataFrame techniques;
#
#     .  Test, How to fetch data from a graph
#


#  Part 01:  Setup, generate data

In [1]:

NUM_PARTITIONS  = 3
NUM_ROWS        = 40000

print("--")


--


In [2]:

import numpy as np
import pandas as pd
   #
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate

print("--")


--


In [3]:


#  Generates (NUM_ROWS) rows of data
#

l_row = {"col1": list(), "col2": list()}
   #
for l_cntr in range(NUM_ROWS):
   l_row["col1"].append(l_cntr)
   l_row["col2"].append("Item number: " + str(l_cntr))
    
pd_data = pd.DataFrame(l_row, columns = ["col1", "col2"])
dd_data = from_pandas(pd_data, npartitions = NUM_PARTITIONS)


l_cntr = 0
   #
for l_each in dd_data.itertuples():
   l_cntr += 1
      #
   if (l_cntr <5):
      print("Row (%d) of (%d) total rows.   Col1: (%4d)   Col2: (%s)" % (l_cntr, len(dd_data), l_each.col1, l_each.col2))
        
print("--")



Row (1) of (40000) total rows.   Col1: (   0)   Col2: (Item number: 0)
Row (2) of (40000) total rows.   Col1: (   1)   Col2: (Item number: 1)
Row (3) of (40000) total rows.   Col1: (   2)   Col2: (Item number: 2)
Row (4) of (40000) total rows.   Col1: (   3)   Col2: (Item number: 3)
--


#  Part 02:  Create a graph

In [4]:

from katana import remote
from katana.remote import import_data

my_client = remote.Client()

print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f86a44c33d0>


In [5]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph" 

print("--")


--


In [6]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


--


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [7]:

#  CREATE GRAPHS

my_graph=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


<_Graph my_graph, 9dwieJRAjK2zdomzg33V1ueRvzmAUoi39WgMPz7Cupkv, 0>


#  Part 03:  Import into the graph

In [8]:

with import_data.DataFrameImporter(my_graph) as df_importer:
    
   #  Movies
   #
   df_importer.nodes_dataframe(
      dd_data[["col1", "col2"]],
      id_column  = "col1",
      id_space   = "dd_data"
      )

   df_importer.insert()


print("--")




          0/? [?op/s]

          0/? [?op/s]

--


In [9]:

display("Number of Graph Nodes: %d" % (my_graph.num_nodes()))
display("Number of Graph Edges: %s" % (my_graph.num_edges()))

#  Sample outpt,
#
#     'Number of Graph Nodes: 40000'
#     'Number of Graph Edges: 0'


          0/? [?op/s]

'Number of Graph Nodes: 40000'

          0/? [?op/s]

'Number of Graph Edges: 0'

#  Part 04:  Run a traversal, return results

In [10]:

#  Aggregates, by definition, can return only one row. We use a slightly different
#  technique to get the return value.
#

l_query  = """
   MATCH (n) 
   RETURN COUNT(*) AS cnt
   """.format()

l_count = my_graph.query(l_query)["cnt"][0]


print(l_count)
print(type(l_count))

#  Sample output,
#
#     40000
#     <class 'numpy.int64'>


          0/? [?op/s]

40000
<class 'numpy.int64'>


In [11]:

#  Returning actual data, scale: 10 rows
#

l_rows = 10

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)

dd_result1 = my_graph.query(l_query)


print(tabulate(dd_result1, headers='keys', tablefmt='psql'))
print("")
print(type(dd_result1))

#  Sample output,
#
#     +----+--------+-----------------+
#     |    |   col1 | col2            |
#     |----+--------+-----------------|
#     |  0 |      1 | Item number: 1  |
#     |  1 |      6 | Item number: 6  |
#     |  2 |      8 | Item number: 8  |
#     |  3 |     14 | Item number: 14 |
#     |  4 |     17 | Item number: 17 |
#     |  5 |     21 | Item number: 21 |
#     |  6 |     22 | Item number: 22 |
#     |  7 |     26 | Item number: 26 |
#     |  8 |     28 | Item number: 28 |
#     |  9 |     30 | Item number: 30 |
#     +----+--------+-----------------+
#
#     <class 'katana.remote.ResultSet'>


          0/? [?op/s]

+----+--------+-----------------+
|    |   col1 | col2            |
|----+--------+-----------------|
|  0 |      6 | Item number: 6  |
|  1 |      7 | Item number: 7  |
|  2 |     11 | Item number: 11 |
|  3 |     12 | Item number: 12 |
|  4 |     14 | Item number: 14 |
|  5 |     15 | Item number: 15 |
|  6 |     16 | Item number: 16 |
|  7 |     17 | Item number: 17 |
|  8 |     19 | Item number: 19 |
|  9 |     22 | Item number: 22 |
+----+--------+-----------------+

<class 'katana.remote.ResultSet'>


In [None]:

#  Return (NUM_ROWS: 40,000) rows.
#
#     This as step 1 when we want to validate contents, or use graph data
#     for ML routines, other.
#
#     You should see the Python kernel hang, become unresponsive.
#        You'll see the upper-right GIF wheel hang. Or, just try running
#        the next cell; it's hung.


#  l_rows = 40000                                #  My original use case, Hangs
#  l_rows = 5000                                 #  Takes 10 (?) seconds, then runs/continues
   #
l_rows = 1000

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)

dd_result2 = my_graph.query(l_query)


l_cntr = 0
   #
for l_each in dd_result2.itertuples():
   l_cntr += 1  
      #
   if (l_cntr < 10):
      print("Counter: %-4d   (%s)   (%s)" % (l_cntr, l_each.col1, l_each.col2))

print("")
print("Number of rows in DataFrame: %d   Value of l_cntr: %d" % (len(dd_result2), l_cntr))
print("--")

#  Sample output,
#
#     Counter: 1      (0)   (Item number: 0)
#     Counter: 2      (5)   (Item number: 5)
#     Counter: 3      (6)   (Item number: 6)
#     Counter: 4      (8)   (Item number: 8)
#     Counter: 5      (9)   (Item number: 9)
#     Counter: 6      (11)   (Item number: 11)
#     Counter: 7      (12)   (Item number: 12)
#     Counter: 8      (13)   (Item number: 13)
#     Counter: 9      (17)   (Item number: 17)
#  
#     Number of rows in DataFrame: 1000   Value of l_cntr: 1000


In [None]:

#  Use this to see if/when next cell will run
#
print("--")


In [39]:

#  Move the above to paginated fetching ..
#

l_rows = 40000
#  l_rows = 5000 
#  l_rows = 10000

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)


dd_result4 = my_graph.query_paginated(l_query)
#
#  Returns,  katana_enterprise.remote.sync_wrappers.PaginatedResultSet
#     It has no length
#     It has no itertuples()
#     It does have a head, maxmimum 40


#  l_resultA = dd_result3.head(10)
#  print(type(l_resultA))
      #
#   Maximum value: 40
#
#     print(dd_result3.head(44))
#     >> ValueError: You can only view a maximum of 40 rows.


#  Using a list
#
my_list = []
   #
for l_each in dd_result4:
   my_list.append(l_each)


l_cntr = 0
   #
for l_col1, l_col2 in zip(my_list[0].col1, my_list[0].col2):
   l_cntr += 1
      #
   if (l_cntr < 10):
      print("Counter: %-4d   (%s)   (%s)" % (l_cntr, l_col1, l_col2))
        
print("")
print("Number of rows in List: %d   Value of l_cntr: %d" % (len(my_list[0].col1), l_cntr))
print("")
   #
del my_list

      
#  Using a Pandas DataFrame
#
my_pd = pd.DataFrame()
   #
for l_each in dd_result4:
   aaa = l_each[1]
   print(type(aaa))
   # print(aaa[0].col1)
   # my_pd.append(l_each)



#  dict = {'Name':['Martha', 'Tim', 'Rob', 'Georgia'],
#          'Maths':[87, 91, 97, 95],
#          'Science':[83, 99, 84, 76]
#         }
#    
#  df = pd.DataFrame(dict)
#    
#  display(df)
#    
#  df2 = {'Name': 'Amy', 'Maths': 89, 'Science': 93}
#  df = df.append(df2, ignore_index = True)
#    
#  display(df)



          0/? [?op/s]

Counter: 1      (29997)   (Item number: 29997)
Counter: 2      (30003)   (Item number: 30003)
Counter: 3      (30006)   (Item number: 30006)
Counter: 4      (30007)   (Item number: 30007)
Counter: 5      (30008)   (Item number: 30008)
Counter: 6      (30009)   (Item number: 30009)
Counter: 7      (30011)   (Item number: 30011)
Counter: 8      (30014)   (Item number: 30014)
Counter: 9      (30019)   (Item number: 30019)

Number of rows in List: 40000   Value of l_cntr: 40000



KeyError: '1'