In [None]:

#  Here we demonstrate the following Dask DataFrame techniques;
#
#     .  Test, How to fetch data from a graph
#


#  Part 01:  Setup, generate data

In [None]:

NUM_PARTITIONS  = 3
NUM_ROWS        = 40000

print("--")


In [None]:

import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate

print("--")


In [4]:


#  Generates (NUM_ROWS) rows of data
#

l_row = {"col1": list(), "col2": list()}
   #
for l_cntr in range(NUM_ROWS):
   l_row["col1"].append(l_cntr)
   l_row["col2"].append("Item number: " + str(l_cntr))
    
pd_data = pd.DataFrame(l_row, columns = ["col1", "col2"])
dd_data = from_pandas(pd_data, npartitions = NUM_PARTITIONS)


l_cntr = 0
   #
for l_each in dd_data.itertuples():
   l_cntr += 1
      #
   if (l_cntr <5):
      print("Row (%d) of (%d) total rows.   Col1: (%4d)   Col2: (%s)" % (l_cntr, len(dd_data), l_each.col1, l_each.col2))
        
print("--")



Row (1) of (40000) total rows.   Col1: (   0)   Col2: (Item number: 0)
Row (2) of (40000) total rows.   Col1: (   1)   Col2: (Item number: 1)
Row (3) of (40000) total rows.   Col1: (   2)   Col2: (Item number: 2)
Row (4) of (40000) total rows.   Col1: (   3)   Col2: (Item number: 3)
--


#  Part 02:  Create a graph

In [5]:

from katana import remote
from katana.remote import import_data

my_client = remote.Client()

print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f2e987fe250>


In [6]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph" 

print("--")


--


In [7]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


--


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [9]:

#  CREATE DATABASE

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


APZZqzBeU532GbGbAM8UxbaJNYSRSSSd8zZ5DG7K1HA2


In [10]:

#  CREATE GRAPHS

my_graph=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


<_Graph my_graph, 4QYz5CMGyvafefW9qJCftMCku7S3WM6mCZSYCpMNDcFz, 0>


#  Part 03:  Import into the graph

In [11]:

with import_data.DataFrameImporter(my_graph) as df_importer:
    
   #  Movies
   #
   df_importer.nodes_dataframe(
      dd_data[["col1", "col2"]],
      id_column  = "col1",
      id_space   = "dd_data"
      )

   df_importer.insert()


print("--")




          0/? [?op/s]

          0/? [?op/s]

--


In [12]:

display("Number of Graph Nodes: %d" % (my_graph.num_nodes()))
display("Number of Graph Edges: %s" % (my_graph.num_edges()))

#  Sample outpt,
#
#     'Number of Graph Nodes: 40000'
#     'Number of Graph Edges: 0'


          0/? [?op/s]

'Number of Graph Nodes: 40000'

          0/? [?op/s]

'Number of Graph Edges: 0'

#  Part 04:  Run a traversal, return results

In [13]:

#  Look at the data, scale: 10 rows

l_rows = 10

l_query  = """
   MATCH (n) 
   RETURN n.col1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)

dd_result1 = my_graph.query(l_query)


print(tabulate(dd_result1, headers='keys', tablefmt='psql'))

#  Sample output,
#
#     +----+--------+-----------------+
#     |    |   col1 | col2            |
#     |----+--------+-----------------|
#     |  0 |      1 | Item number: 1  |
#     |  1 |      6 | Item number: 6  |
#     |  2 |      8 | Item number: 8  |
#     |  3 |     14 | Item number: 14 |
#     |  4 |     17 | Item number: 17 |
#     |  5 |     21 | Item number: 21 |
#     |  6 |     22 | Item number: 22 |
#     |  7 |     26 | Item number: 26 |
#     |  8 |     28 | Item number: 28 |
#     |  9 |     30 | Item number: 30 |
#     +----+--------+-----------------+


          0/? [?op/s]

+----+--------+-----------------+
|    |   col1 | col2            |
|----+--------+-----------------|
|  0 |      1 | Item number: 1  |
|  1 |      5 | Item number: 5  |
|  2 |     13 | Item number: 13 |
|  3 |     17 | Item number: 17 |
|  4 |     18 | Item number: 18 |
|  5 |     24 | Item number: 24 |
|  6 |     30 | Item number: 30 |
|  7 |     32 | Item number: 32 |
|  8 |     35 | Item number: 35 |
|  9 |     36 | Item number: 36 |
+----+--------+-----------------+


In [None]:

#  Return (NUM_ROWS: 40,000) rows.
#
#     This as step 1 when we want to validate contents, or use graph data
#     for ML routines, other.
#
#     You should see the Python kernel hang, become unresponsive.
#        You'll see the upper-right GIG wheel hang. Or, just try running
#        the next cell; it's hung.


l_rows = 40000                                #  My original use case, Hangs
#  l_rows = 5000                                    #  Takes 10 (?) seconds, then runs/continues

l_query  = """
   MATCH (n) 
   RETURN n.ol1 AS col1, n.col2 AS col2
   LIMIT {0}
   """.format(l_rows)

dd_result2 = my_graph.query(l_query)


l_cntr = 0
   #
for l_each in dd_result2.itertuples():
   l_cntr += 1  
      #
   if (l_cntr < 10):
      print(l_each)



          0/? [?op/s]

Pandas(Index=0, col1=None, col2='Item number: 1')
Pandas(Index=1, col1=None, col2='Item number: 5')
Pandas(Index=2, col1=None, col2='Item number: 13')
Pandas(Index=3, col1=None, col2='Item number: 17')
Pandas(Index=4, col1=None, col2='Item number: 18')
Pandas(Index=5, col1=None, col2='Item number: 24')
Pandas(Index=6, col1=None, col2='Item number: 30')
Pandas(Index=7, col1=None, col2='Item number: 32')
Pandas(Index=8, col1=None, col2='Item number: 35')


In [None]:

print("--")
