In [None]:

#  Notebook display configuration

import pandas as pd
from IPython.display import HTML, display
from tabulate import tabulate

#  Let pandas use up to 480 characters of width before wrapping a frame.
pd.set_option("display.width", 480)

#  Intended to set horizontal scroll for wide outputs.
#  NOTE(review): the HTML payload is an empty string, so nothing is injected
#  here -- confirm whether a style block was lost.
display(HTML(""))

print("--")


#  Setup: Connectivity

In [None]:

#  Open the client used by every later cell to talk to the service.
#  NOTE(review): Client() is built with no arguments -- presumably the
#  endpoint comes from the environment; confirm.
from katana import remote

my_client = remote.Client()

#  Printing the client is a quick check that it was constructed.
print(my_client)


In [None]:

#  Settings used when the database and graph are created and looked up.
#  NUM_PARTITIONS is passed as num_partitions when the graph is created.
NUM_PARTITIONS  = 3
#  Names of the database and of the graph inside it.
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:

#  Delete all graphs
#
#  Walks every database and deletes each graph it contains.

for l_database in my_client.databases():
   #  Resolve the database handle once per database rather than twice per graph.
   l_db_handle = my_client.get_database(name=l_database.name)
   for l_graph in l_db_handle.graphs_in_database():
      l_handle = l_db_handle.get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

#  List whatever graphs the client still reports after the deletes.
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  Delete all databases
#
#  Every database is dropped except the one named "default".

for l_database in my_client.databases():
   if l_database.name == "default":
      continue
   my_client.get_database(name=l_database.name).delete_database()
   print("--")

#  Show the databases that are left.
for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  Create database
#
#  Creates the database named by DB_NAME and prints its id.

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [None]:

#  Create a graph
#
#  Creates GRAPH_NAME inside DB_NAME, passing NUM_PARTITIONS as num_partitions.

my_graph=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


In [None]:

#  Connect to graph
#
#  Looks the graph up by name.  The unpacking keeps the first match in my_graph
#  and discards any others; it raises ValueError when nothing is returned.

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


# Step 01:  Load from CSV

In [None]:

#  pandas reads the CSV files; numpy supplies the object dtype used for the
#  column overrides.
import pandas as pd
import numpy as np

print("--")


In [None]:

#  Load a DataFrame from CSV, Nodes/Vertices
#
#  Every listed column is read as a generic object so that ids and years keep
#  the exact text found in the file instead of being inferred as numbers.


l_InputFile  = "./10_N_MovieDB/24_nodes.txt"

df_all_nodes1 = pd.read_csv(
   l_InputFile,
   delimiter = ",",
   dtype = {
      #  The rename step that follows maps "_id" to "id", which indicates the
      #  file's id column is "_id"; the override must use that name to apply.
      #  "id" is kept as well for files that already use the short name.
      "_id"       : np.dtype('O'),
      "id"        : np.dtype('O'),
      "_labels"   : np.dtype('O'),
      "born"      : np.dtype('O'),
      "name"      : np.dtype('O'),
      "released"  : np.dtype('O'),
      "tagline"   : np.dtype('O'),
      "title"     : np.dtype('O')
      })

print("Number of rows: %d" % (len(df_all_nodes1)) )
print("--")


In [None]:

#  Column renames

df_all_nodes2 = df_all_nodes1.rename(columns={"_id": "id", "_labels": "label"})

#  Store born / id / released as strings; a missing value is filled with 0.0
#  first and therefore ends up as the text "0.0".
#
#  The first character of label is dropped -- presumably the leading colon of
#  labels written like ":Person"; confirm against the input file.

df_all_nodes3 = df_all_nodes2.assign(
   born     = lambda x: x.born.fillna(0.0).astype(str),
   id       = lambda x: x.id.fillna(0.0).astype(str),
   released = lambda x: x.released.fillna(0.0).astype(str),
   label    = lambda x: x.label.astype(str).str[1:]
   )


#  Report the size of the frame that is actually displayed below.
print("Number of rows: %d" % (len(df_all_nodes3)) )
print("")
display(df_all_nodes3.head(5))

print("--")


# Step 02: Make specific DataFrames

In [None]:

#  Now, populate Persons and Movies
#
#  Rows are selected by label, and each frame keeps only the columns that
#  belong to that kind of node.

l_is_person = df_all_nodes3["label"] == "Person"
l_is_movie  = df_all_nodes3["label"] == "Movie"

df_persons = df_all_nodes3.loc[l_is_person, ["id", "label", "born", "name"]]
df_movies  = df_all_nodes3.loc[l_is_movie,  ["id", "label", "released", "tagline", "title"]]


#  Row count followed by a five-row preview, persons first.
for l_frame in (df_persons, df_movies):
   display(len(l_frame))
   display(l_frame.head(5))


print("--")


# Step 03: Same, but for edges

In [None]:

#  Load a DataFrame from CSV, Edges

l_InputFile  = "./10_N_MovieDB/25_edges.txt"

#  Read every edge column as a generic object so values keep the text found
#  in the file.
l_edge_dtypes = dict.fromkeys(
   ["_start", "_end", "_type", "rating", "roles", "summary"],
   np.dtype('O')
   )

df_all_edges1 = pd.read_csv(l_InputFile, delimiter = ",", dtype = l_edge_dtypes)

print("Number of rows: %d" % (len(df_all_edges1)) )
print("")
display(df_all_edges1.head(5))

print("--")


In [None]:

#  Rename the raw edge columns to the upper-case names used from here on.
l_edge_renames = {
   "_start"  : "START_ID",
   "_end"    : "END_ID",
   "_type"   : "TYPE",
   "rating"  : "RATING",
   "roles"   : "ROLES",
   "summary" : "SUMMARY",
   }

df_all_edges2 = df_all_edges1.rename(columns = l_edge_renames)

#  Fill missing ids / ratings with 0.0 and store all three columns as strings.
df_all_edges3 = df_all_edges2.assign(**{
   l_column: df_all_edges2[l_column].fillna(0.0).astype(str)
   for l_column in ("START_ID", "END_ID", "RATING")
   })

print("Number of rows: %d" % (len(df_all_edges3)) )
print("")
display(df_all_edges3.head(5))

print("--")


In [None]:

#  Split out the various edges

def _edges_of_type(edge_type, columns):
   """Return the rows of df_all_edges3 whose TYPE equals edge_type, limited to columns."""
   return df_all_edges3[df_all_edges3["TYPE"] == edge_type][columns]


l_endpoint_columns = ["START_ID", "END_ID", "TYPE"]

df_reviewed = _edges_of_type("REVIEWED", l_endpoint_columns + ["RATING", "SUMMARY"])
df_wrote    = _edges_of_type("WROTE",    l_endpoint_columns)
df_produced = _edges_of_type("PRODUCED", l_endpoint_columns)
df_directed = _edges_of_type("DIRECTED", l_endpoint_columns)
df_follows  = _edges_of_type("FOLLOWS",  l_endpoint_columns)

#  "roles" is a string similar to  '[ "a", "b", "c" ]'
#
#  This was automatically coming in as a list-
#
df_actedin  = _edges_of_type("ACTED_IN", l_endpoint_columns + ["ROLES"])


#  Row count followed by a two-row preview for each edge type.
for l_frame in (df_reviewed, df_wrote, df_produced, df_directed, df_follows, df_actedin):
   display(len(l_frame))
   display(l_frame.head(2))

print("--")


#  Step 04:  Import into graph

In [None]:


from katana_enterprise.remote import import_data

#  One row per edge type:  (frame, columns to send, edge type, destination id space).
#  Every edge starts at a Person.
#
#  FOLLOWS links one Person to another Person in the Movies dataset, so its
#  END_ID values must be resolved in the "Person" id space, not "Movie".
#
#  The ACTED_IN rows from the file are stored under the type "ACTEDIN".
#
l_edge_specs = [
   (df_directed, ["START_ID", "END_ID", "TYPE"],                      "DIRECTED", "Movie" ),
   (df_reviewed, ["START_ID", "END_ID", "TYPE", "RATING", "SUMMARY"], "REVIEWED", "Movie" ),
   (df_wrote,    ["START_ID", "END_ID", "TYPE"],                      "WROTE",    "Movie" ),
   (df_produced, ["START_ID", "END_ID", "TYPE"],                      "PRODUCED", "Movie" ),
   (df_follows,  ["START_ID", "END_ID", "TYPE"],                      "FOLLOWS",  "Person"),
   (df_actedin,  ["START_ID", "END_ID", "TYPE", "ROLES"],             "ACTEDIN",  "Movie" ),
   ]

with import_data.DataFrameImporter(my_graph) as df_importer:

   # Person
   #
   df_importer.nodes_dataframe(
      df_persons[["id", "label", "born", "name"]],
      id_column  = "id",
      id_space   = "Person"
      )
   #  Movie
   #
   #  NOTE(review): "released" is present in df_movies but is not sent here --
   #  confirm whether that omission is intentional.
   #
   df_importer.nodes_dataframe(
      df_movies[["id", "label", "title", "tagline"]],
      id_column  = "id",
      id_space   = "Movie"
      )

   #  Edges, registered in the order listed in l_edge_specs
   #
   for l_frame, l_columns, l_type, l_destination_space in l_edge_specs:
      df_importer.edges_dataframe(
         l_frame[l_columns],
         source_id_space      = "Person",
         destination_id_space = l_destination_space,
         source_column        = "START_ID",
         destination_column   = "END_ID",
         type                 = l_type
         )

   df_importer.insert()

print("--")
    
    

#  Step 00:  Check our work ..

In [None]:

#  Total number of nodes, then total number of directed edges.
l_node_count_query = """
   MATCH (n) 
   RETURN COUNT(n)
   """
l_edge_count_query = """
   MATCH (n) - [r] -> (m)
   RETURN COUNT(r)
   """

for l_query in (l_node_count_query, l_edge_count_query):
   l_result = my_graph.query_unpaginated(l_query)
   display(l_result)


In [None]:

#  Count nodes per label.  LABELS(a) is unwound, so a node that carries several
#  labels contributes to the total of each of them.
l_result = my_graph.query_unpaginated("""

   MATCH (a) 
   WITH DISTINCT LABELS(a) AS temp, COUNT(a) AS tempCnt
   UNWIND temp AS label
   RETURN label, SUM(tempCnt) AS cnt
   ORDER BY label
   
   """)

display(l_result)


In [None]:

#  Count directed edges per relationship type, ordered by type name.
l_result = my_graph.query_unpaginated("""

   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS temp, COUNT(r) AS tempCnt
   RETURN temp, tempCnt
   ORDER BY temp

   """)

display(l_result)


In [None]:

#  Fetch every (source node, edge, destination node) triple in the graph.
#  NOTE(review): contextualize = True and .view() presumably produce an
#  interactive rendering of the result -- confirm against the client docs.
l_result = my_graph.query_unpaginated("""

   MATCH (n) -[r]-> (m)
   RETURN n, r, m
   
   """, contextualize = True)

l_result.view()


# Step 00: Output a graph as a local file

In [None]:

#  Formatting could use a little work, but the concept is here ..
#
#  Each entry pairs a query with the file its result is written to.  Nodes
#  come first (Person, Movie), followed by one file per exported edge type.
#  FOLLOWS edges are not part of this export.

l_exports = [
   ("""
   MATCH (n: Person) 
   RETURN n
   """, "./10_N_MovieDB/40_Person_Nodes.txt"),
   ("""
   MATCH (n: Movie) 
   RETURN n
   """, "./10_N_MovieDB/41_Movie_Nodes.txt"),
   ("""
   MATCH (n) - [r: ACTEDIN] -> (m)
   RETURN r
   """, "./10_N_MovieDB/50_ACTEDIN_Edges.txt"),
   ("""
   MATCH (n) - [r: DIRECTED] -> (m)
   RETURN r
   """, "./10_N_MovieDB/51_DIRECTED_Edges.txt"),
   ("""
   MATCH (n) - [r: PRODUCED] -> (m)
   RETURN r
   """, "./10_N_MovieDB/52_PRODUCED_Edges.txt"),
   ("""
   MATCH (n) - [r: REVIEWED] -> (m)
   RETURN r
   """, "./10_N_MovieDB/53_REVIEWED_Edges.txt"),
   ("""
   MATCH (n) - [r: WROTE] -> (m)
   RETURN r
   """, "./10_N_MovieDB/54_WROTE_Edges.txt"),
   ]

#  Run each query and write its result as a pipe-delimited file without the index.
for l_query, l_output_path in l_exports:
   l_result = my_graph.query_unpaginated(l_query)
   l_result.to_csv(l_output_path, sep = "|", index = False)


print("--")

