#  Setup: Display options

In [None]:

#  Display setup for the rest of the notebook:
#     .  pandas is told to use a 480-character wide text display.
#     .  display / HTML / tabulate are brought into scope for later cells.

import pandas as pd
from IPython.display import display, HTML
from tabulate import tabulate

pd.options.display.width = 480

#  Meant to set horizontal scroll for wide outputs.
#  NOTE(review): the HTML payload is an empty string here, so as written this
#  only displays an empty HTML element -- presumably a <style> block belongs
#  inside the quotes; TODO confirm.
display(HTML(""))

print("--")


# Setup: Connectivity

In [None]:

#  Open a client connection to the Katana server and report what we
#  connected to ..

from katana import remote

my_client = remote.Client()

print(my_client)
print(f"katana version: {my_client.server_version}")


In [None]:

#  Names and sizing used by the database / graph cells below ..

NUM_PARTITIONS = 3              #  passed as num_partitions / npartitions

DB_NAME    = "my_db"
GRAPH_NAME = "my_graph"

print("--")


In [63]:

#  DELETE ALL GRAPHS
#
#     Walks every database visible to the client and deletes each graph
#     found in it, then lists whatever graphs the client still reports.
#     If everything was deleted, no "GRAPH ID" lines are printed.

for l_database in my_client.databases():
   #  Resolve the database handle once per database instead of twice per
   #  graph; the lookup does not depend on the inner loop.
   l_db_handle = my_client.get_database(name=l_database.name)
   for l_graph in l_db_handle.graphs_in_database():
      l_handle = l_db_handle.get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


--


In [None]:

#  DELETE ALL DATABASES
#
#     Every database except the one named "default" is deleted; a "--" marker
#     is printed per deleted database.  The surviving databases are listed
#     afterwards.

for l_database in my_client.databases():
   if l_database.name == "default":
      continue
   my_client.get_database(name=l_database.name).delete_database()
   print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE
#
#     Creates the database named by DB_NAME and prints its id ..

my_database = my_client.create_database(name = DB_NAME)

print(my_database.database_id)


In [64]:

#  CREATE GRAPH
#
#     Creates GRAPH_NAME inside DB_NAME, split across NUM_PARTITIONS
#     partitions, and prints the resulting graph handle ..

my_graph = my_client.get_database(name = DB_NAME).create_graph(
   name           = GRAPH_NAME,
   num_partitions = NUM_PARTITIONS,
   )

print(my_graph)


<_Graph my_graph, 5Jfj9nEfjMdGF5ECRJTrwtM2MqAHsw221YbNb7KkAFwn, 0>


In [None]:

#  CONNECT TO GRAPH
#
#     Looks up GRAPH_NAME inside DB_NAME and binds the first match to
#     my_graph.  Use this instead of the CREATE GRAPH cell when the graph
#     already exists.
#
#     Raises ValueError (same exception type as the former tuple-unpacking
#     form) with an explicit message when no graph of that name is found.
#     The extra matches are no longer bound to `_`, the name IPython uses
#     for the most recent cell output.

l_matches = list(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME))

if not l_matches:
   raise ValueError(f"No graph named {GRAPH_NAME!r} found in database {DB_NAME!r}")

my_graph = l_matches[0]

print(my_graph)


# Step 01:  Process CSV files, make Dask distributed DataFrames ..

In [None]:

#  Libraries for Step 01 / Step 02: Dask DataFrames, pandas, and the
#  Katana DataFrame importer ..

import dask.dataframe as dd
import pandas

from katana.remote.import_data import DataFrameImporter

print("--")


In [None]:

#  Switches for the CSV loading cell: DEBUG_FLAG selects the debug branch,
#  TEST_ROWS is the row cap it applies ..

DEBUG_FLAG = True
# DEBUG_FLAG = False

TEST_ROWS = 100_000

print("--")


In [None]:
%%time

#  Actual reading of CSV files ..
#
#  See also,
#     https://console.cloud.google.com/storage/browser/katana-demo-datasets/csv-datasets/banco_30M
#

#  Builds the Dask DataFrames used by the import step:
#
#     l_nodes   one Dask DataFrame read from NodeFile_0
#     l_edges   dict {file index: Dask DataFrame}, one per transactions_<i> file
#
#  When DEBUG_FLAG is set, only the first TEST_ROWS rows of each file are
#  kept: head() yields a pandas DataFrame, which is then turned back into a
#  Dask DataFrame with NUM_PARTITIONS partitions.
#
#  The former bare `.compute()` calls were removed: their result was thrown
#  away, so each file was fully materialised once for nothing and evaluated
#  again by the following len().  len() alone forces the evaluation.

l_prefix     = "gs://katana-demo-datasets/csv-datasets/banco_30M/"

l_node_csvs  = (l_prefix + "NodeFile_0")

if (DEBUG_FLAG):
   l_nodesb     = dd.read_csv(l_node_csvs, sep = "|").head(n = TEST_ROWS)            #  Head makes this a Pandas DataFrame
   l_nodes      = dd.from_pandas(l_nodesb, npartitions = NUM_PARTITIONS)             #  Now a Dask DataFrame
else:
   l_nodes      = dd.read_csv(l_node_csvs, sep = "|")                                #  Dask DataFrame


#  Lazy evaluation until here for nodes; len() triggers the actual read ..
#
display("Number of nodes: %d" % ( len(l_nodes) ))


   ###


l_num_edge_files = 15           #  transactions_0 .. transactions_14

l_edgesb     = {}
l_edges      = {}
l_cntr       = 0

for l_each in range(0, l_num_edge_files):
   l_edge_csvs     = (l_prefix + "transactions_" + str(l_each))
   print(".", end="")           #  one progress dot per file

   if (DEBUG_FLAG):
      l_edgesb[l_each] = dd.read_csv(l_edge_csvs, sep = "|").head(n = TEST_ROWS)
      l_edges [l_each] = dd.from_pandas(l_edgesb[l_each], npartitions = NUM_PARTITIONS)
   else:
      l_edges [l_each] = dd.read_csv(l_edge_csvs, sep = "|")

   #  Normally lazy evaluation, until len() forces it for this file ..
   #
   l_cntr += len(l_edges[l_each])


print("")
display("Number of edges: %d" % ( l_cntr       ))

print("--")
    

# Step 02:  Actually import into the graph ..

In [65]:

#  The way we are doing this is,
#
#     .  Read all of the data into memory at one time.
#        This is not required, at all.
#
#     .  At least we are using Dask DataFrames, which are distributed.
#

#  Registers the node frame and every edge frame with one importer, then
#  inserts them in a single call.  Each edge frame is registered twice, the
#  second time with source and destination columns swapped, which makes
#  every transaction a bi-directional edge.

l_account_space = "accounts"
l_column_pairs  = (
   ("sourceAccount.id",       "destinationAccount2.id"),            #  as recorded
   ("destinationAccount2.id", "sourceAccount.id"),                  #  reversed
   )

with DataFrameImporter(my_graph) as df_importer:

   #  Here, just the nodes
   df_importer.nodes_dataframe(l_nodes, id_column = "id", id_space = l_account_space)

   for l_each in l_edges.values():
      for l_from_column, l_to_column in l_column_pairs:
         df_importer.edges_dataframe(
            l_each,
            source_id_space       = l_account_space,
            destination_id_space  = l_account_space,
            source_column         = l_from_column,
            destination_column    = l_to_column,
            )

   df_importer.insert()


print("--")




          0/? [?op/s]

          0/? [?op/s]

--


In [66]:

#  Get some counts, validations ..
#
#     When testing/debugging with a small subset of the data, you may get very few (or even zero) edges,
#     because the edges being loaded reference nodes that are not part of the loaded subset.
#

#  Node count first, then edge count -- each is fetched and shown in turn ..
l_num_nodes = my_graph.num_nodes()
display(l_num_nodes)

l_num_edges = my_graph.num_edges()
display(l_num_edges)



          0/? [?op/s]

100000

          0/? [?op/s]

668

# Step 03:  Traversals ..

In [None]:

#  Two-hop pattern: a directed edge a -> b followed by any edge between
#  b and c, capped at 1000 rows, then rendered as a graph visualization ..

l_query = """

   MATCH (a) - [e] -> (b) - [] - (c) 
   RETURN a, b, c 
   LIMIT 1000 
   """

l_result = my_graph.query(l_query, contextualize = True)

l_result.visualize()



In [None]:


#  Outgoing edges of accounts whose country is 'USA'; only the first
#  10 rows of the result are printed as a table ..

l_query = """

   MATCH (a: accounts {country: 'USA'} ) - [e] -> (b) 
   RETURN a, b
   """

l_result = my_graph.query(l_query)

l_preview = l_result.head(10)
print(tabulate(l_preview, headers='keys', tablefmt='psql'))


# Step 04:  Analytics ..

In [67]:

#  Analytics entry points, plus the run counters the PageRank (l_cntr1) and
#  Louvain (l_cntr2) cells use to build unique result property names.
#  Re-running this cell resets both counters to zero.

from katana.remote import analytics

l_cntr1 = l_cntr2 = 0

print("--")


--


In [68]:

#  Running PageRank ..
#
#     This cell needs to change for version 0.6 ..
#

#  Every run bumps the counter, so the result lands in a fresh property
#  (page_rank_0001, page_rank_0002, ..) and the cell can be re-run without
#  editing it.
#
l_cntr1 += 1
l_output_propname = f"page_rank_{l_cntr1:04d}"

analytics.pagerank(
   my_graph,
   result_property_name = l_output_propname,
   max_iterations       = 20,
   alpha                = 0.15,
   )

print("--")


          0/? [?op/s]

--


In [69]:

#  Outputting the result of PageRank ..
#
#     Top 10 nodes by the property written by the most recent PageRank run.
#     NOTE(review): in the captured output the rank column appears twice --
#     once among the columns produced for `n` and once for the explicit
#     n.<property> -- confirm whether the explicit column is still wanted.
#

l_query = f"""
    MATCH (n)
    RETURN n, n.{l_output_propname}
    ORDER BY n.{l_output_propname} DESC
    LIMIT 10
    """

l_result = my_graph.query(l_query, contextualize=False)

l_table = tabulate(l_result, headers='keys', tablefmt='psql')
print(l_table)


          0/? [?op/s]

+----+-----------------+--------------+-------------+-------------+---------------------+------------------+--------+--------------------+--------------------+------------------+------------+----------+--------------------+
|    |   n.internal_id | n.labels     |    n.amount | n.country   | n.enrollmentDate    | n.firstName      |   n.id | n.lastName         |   n.page_rank_0001 | n.state          | n.status   | n.type   |   n.page_rank_0001 |
|----+-----------------+--------------+-------------+-------------+---------------------+------------------+--------+--------------------+--------------------+------------------+------------+----------+--------------------|
|  0 |             406 | ['accounts'] | 6.27797e+09 | India       | 2022-11-15T05:16:17 | j6f              |   1237 | M4OpcQwi           |            1.84454 | Haryana          | active     | node     |            1.84454 |
|  1 |             665 | ['accounts'] | 4.17569e+09 | Brazil      | 2022-11-15T05:16:17 | V             

In [71]:

# NOTE: Running on directed graph
#
#     This cell needs to change for version 0.6 ..


#  Every run bumps the counter, so the result lands in a fresh property
#  (louvain_0001, louvain_0002, ..) and the cell can be re-run without
#  editing it.
#
l_cntr2 += 1
l_output_propname = f"louvain_{l_cntr2:04d}"

analytics.louvain_clustering(
   my_graph,
   result_property_name = l_output_propname,
   )

print("--")


          0/? [?op/s]

--


In [72]:

#  Outputting the result of Louvain ..
#
#     First 10 nodes when ordered, descending, by the property written by
#     the most recent Louvain run.
#

l_query = f"""
    MATCH (n)
    RETURN n, n.{l_output_propname}
    ORDER BY n.{l_output_propname} DESC
    LIMIT 10
    """

l_result = my_graph.query(l_query, contextualize=False)

l_table = tabulate(l_result, headers='keys', tablefmt='psql')
print(l_table)


          0/? [?op/s]

+----+-----------------+--------------+-------------+-------------+---------------------+--------+---------------------+------------------+--------------------+---------------------------+------------+----------+----------------------+------------------+
|    |   n.internal_id | n.labels     |    n.amount | n.country   | n.enrollmentDate    |   n.id | n.lastName          |   n.louvain_0002 |   n.page_rank_0001 | n.state                   | n.status   | n.type   | n.firstName          |   n.louvain_0002 |
|----+-----------------+--------------+-------------+-------------+---------------------+--------+---------------------+------------------+--------------------+---------------------------+------------+----------+----------------------+------------------|
|  0 | 281474976726623 | ['accounts'] | 1.96276e+09 | Brazil      | 2022-11-15T05:16:17 |  48580 | QsRR6dZ             |  562949953454473 |               0.15 | Sergipe                   | active     | node     | nan                  |