In [None]:
#  Notebook to load CSV into graph using DataFrame method
#
#  .  Data is the Neo4J Movie graph
#  .  Incidentally includes teaching methods relative to Dask DataFrame manipulation
#
#  .  I haven't fully checked all results below; there may be issues


In [1]:

from katana import remote
from katana.remote import import_data

my_client = remote.Client()

print(my_client)


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f1163d670d0>


In [2]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")

--


In [None]:
# ##################################################################
#
#  This section; basic graph and database setup, reset for test


In [3]:
#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")

--


In [None]:
#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)

In [None]:
#  CREATE DATABASE

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)

In [4]:
#  CREATE A GRAPH

my_graph=my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


<_Graph my_graph, BWHhUifKmgMQ9PFnb5WTQv9NGmd3YkK2BRit2vG64f5S, 0>


In [None]:
#  CONNECT TO GRAPH

for l_graph in my_client.get_database(name=DB_NAME).graphs_in_database():
   if (l_graph.name == GRAPH_NAME):
      my_graph=my_client.get_database(name=DB_NAME).get_graph_by_id(id=l_graph.graph_id)
         #
      break

# my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)

In [None]:
display(my_graph.num_nodes())
display(my_graph.num_edges())

In [None]:
# ##################################################################
#
#  Load from source CSV, in this case we are using the Neo4J Movie graph


In [5]:
import dask.dataframe as dd
import numpy as np

print("--")


--


In [6]:
#  Load a DataFrame from CSV, Nodes/Vertices

l_InputFile  = "./10_NMovieDB/24_nodes.txt"

df_all_nodes1 = dd.read_csv(
   l_InputFile,
   delimiter = ",",
   dtype = {
      "id"        : int,
      "_labels"   : np.dtype('O'),
      "born"      : float, 
      "name"      : np.dtype('O'),
      "released"  : float,
      "tagline"   : np.dtype('O'),
      "title"     : np.dtype('O')
      })

print("--")


--


In [None]:
#  Number of rows, two different ways

display(len(df_all_nodes1))
display(print("{} Rows".format(df_all_nodes1.shape[0].compute())))

#  Other output

display(df_all_nodes1.head(10))
display(df_all_nodes1[["born", "name"]].head(10))

In [None]:
#  Print all rows, subsetted columns

for l_each in df_all_nodes1.iterrows():
   print(l_each[0], "   ", l_each[1]["_labels"], "   ", l_each[1]["name"])  


In [None]:
# ##################################################################
#
#   https://www.askpython.com/python/examples/subset-a-dataframe
#   https://www.codegrepper.com/code-examples/python/convert+float+to+int+python+pandas
#   https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.assign.html


In [None]:
#  We have some issues we want to change with our data
#
#  .  Some of the property names have a leading underscore. Change those.
#  .  Some values which should be integer, are float.
#  .  The label values are currently ";Person" and ";Movie". Let's remove those semicolons.


In [7]:
#  Column renames

df_all_nodes2 = df_all_nodes1.rename(columns={"_id": "id", "_labels": "label"})

print("--")

--


In [8]:
#  Change float values to integer, remove leading semicolon from label

df_all_nodes3 = df_all_nodes2.assign(
   born     = lambda x: x.born.fillna(0.0).astype(int), 
   id       = lambda x: x.id.fillna(0.0).astype(int),
   released = lambda x: x.released.fillna(0.0).astype(int),
   label    = lambda x: x.label.astype(str).str[1:]
   )

print("--")

--


In [9]:
display(len(df_all_nodes3))
display(df_all_nodes3.head(10))

171

Unnamed: 0,id,label,born,name,released,tagline,title
0,0,Movie,0,,1999,Welcome to the Real World,The Matrix
1,1,Person,1964,Keanu Reeves,0,,
2,2,Person,1967,Carrie-Anne Moss,0,,
3,3,Person,1961,Laurence Fishburne,0,,
4,4,Person,1960,Hugo Weaving,0,,
5,5,Person,1967,Lilly Wachowski,0,,
6,6,Person,1965,Lana Wachowski,0,,
7,7,Person,1952,Joel Silver,0,,
8,8,Person,1978,Emil Eifrem,0,,
9,9,Movie,0,,2003,Free your mind,The Matrix Reloaded


In [None]:
# ##################################################################

In [10]:
#  Now, populate Persons and Movies

df_persons = df_all_nodes3[df_all_nodes3["label"] == "Person"][["id", "label", "born", "name"]]

df_movies  = df_all_nodes3[df_all_nodes3["label"] == "Movie"][["id", "label", "released", "tagline", "title"]]

print("--")

--


In [None]:

display(len(df_persons))
display(df_persons.head(10))
display(len(df_movies))
display(df_movies.head(10))


In [None]:
# ##################################################################
#
#  Repeat the style of work from above, but now for edges
#
#  .  Some of the property names have a leading underscore. Change those.
#  .  Some values which should be integer, are float.


In [11]:
#  Load a DataFrame from CSV, Edges

l_InputFile  = "./10_NMovieDB/25_edges.txt"

df_all_edges1 = dd.read_csv(
   l_InputFile,
   delimiter = ",",
   dtype = {
      "_start"    : float,
      "_end"      : float, 
      "_type"     : np.dtype('O'),
      "rating"    : float,
      "roles"     : np.dtype('O'),
      "summary"   : np.dtype('O')
      })

print("--")

--


In [None]:
display(len(df_all_edges1))
display(df_all_edges1.head(10))


In [12]:
df_all_edges2 = df_all_edges1.rename(columns={"_start": "START_ID", "_end": "END_ID", "_type": "TYPE",
   "rating": "RATING", "roles": "ROLES", "summary": "SUMMARY"})

df_all_edges3 = df_all_edges2.assign(
   START_ID = lambda x: x.START_ID.fillna(0.0).astype(int), 
   END_ID   = lambda x: x.END_ID.fillna(0.0).astype(int),
   RATING   = lambda x: x.RATING.fillna(0.0).astype(int)
   )

print("--")

--


In [None]:
display(len(df_all_edges3))
display(df_all_edges3.head(10))

In [13]:
#  Split out the various edges

df_reviewed = df_all_edges3[df_all_edges3["TYPE"] == "REVIEWED"][["START_ID", "END_ID", "TYPE", "RATING", "SUMMARY"]]

df_wrote    = df_all_edges3[df_all_edges3["TYPE"] == "WROTE"   ][["START_ID", "END_ID", "TYPE"]]

df_produced = df_all_edges3[df_all_edges3["TYPE"] == "PRODUCED"][["START_ID", "END_ID", "TYPE"]]

df_directed = df_all_edges3[df_all_edges3["TYPE"] == "DIRECTED"][["START_ID", "END_ID", "TYPE"]]

df_follows  = df_all_edges3[df_all_edges3["TYPE"] == "FOLLOWS" ][["START_ID", "END_ID", "TYPE"]]

#  "roles" is a string similar to  '[ "a", "b", "c" ]'
#
#  This was automatically coming in as a list-
#  Cool

df_actedin  = df_all_edges3[df_all_edges3["TYPE"] == "ACTED_IN"][["START_ID", "END_ID", "TYPE", "ROLES"]]

print("--")

--


In [None]:

display(len(df_reviewed))
display(df_reviewed.head(2))

display(len(df_wrote))
display(df_wrote.head(2))

display(len(df_produced))
display(df_produced.head(2))

display(len(df_directed))
display(df_directed.head(2))

display(len(df_follows))
display(df_follows.head(2))

display(len(df_actedin))
display(df_actedin.head(2))


In [None]:
# ##################################################################

In [None]:
#  Didn't need this; also don't know if it had any effect

# from dask.distributed import Client
# client = Client(n_workers=4, threads_per_worker=2)

# print("--")

In [14]:
# ##################################################################
#
#  Finally now, load the vertices/nodes into a graph
#
#  Some hinkiness we need to work around ..
#
#     .  The Dask DataFrames here were loaded from CSV, and those CSV
#        files were found, in scope.
#        The KG DataFrame importer will reference that same file
#        pathname, and the file will not be in scope. Basically,
#        it was expected that these files be on S3/GS all along.
#        I hate to have that dependency because, just one more thing
#        to have to manage.
#
#     .  So, we'll copy the DataFrames to Dask arrays, then back into
#        a Dask DataFrame.
#        Why not just copy the DaskDataFrame ?  Currently there is only 
#        shallow copies of DataFrames.
#
#  See,
#     https://stackoverflow.com/questions/52119342/how-do-i-convert-a-dask-dataframe-into-a-dask-array
#     https://docs.dask.org/en/latest/generated/dask.dataframe.from_dask_array.html


da_persons    = df_persons.to_dask_array()
da_movies     = df_movies.to_dask_array()
   #
da_directed   = df_directed.to_dask_array()
da_reviewed   = df_reviewed.to_dask_array()
da_wrote      = df_wrote.to_dask_array()
da_produced   = df_produced.to_dask_array()
da_follows    = df_follows.to_dask_array()
da_actedin    = df_actedin.to_dask_array()


df_persons2   = dd.io.from_dask_array(da_persons,  columns=["id", "label", "born", "name"]).compute()
df_movies2    = dd.io.from_dask_array(da_movies,   columns=["id", "label", "released", "tagline", "title"]).compute()
   #
df_directed2  = dd.io.from_dask_array(da_directed, columns=["START_ID", "END_ID", "TYPE"]).compute()
df_reviewed2  = dd.io.from_dask_array(da_reviewed, columns=["START_ID", "END_ID", "TYPE", "RATING", "SUMMARY"]).compute()
df_wrote2     = dd.io.from_dask_array(da_wrote, columns=["START_ID", "END_ID", "TYPE"]).compute()
df_produced2  = dd.io.from_dask_array(da_produced, columns=["START_ID", "END_ID", "TYPE"]).compute()
df_follows2   = dd.io.from_dask_array(da_follows, columns=["START_ID", "END_ID", "TYPE"]).compute()
df_actedin2   = dd.io.from_dask_array(da_actedin, columns=["START_ID", "END_ID", "TYPE", "ROLES"]).compute()

print("--")



--


In [15]:
from katana_enterprise.remote import import_data

print("--")


--


In [16]:

with import_data.DataFrameImporter(my_graph) as df_importer:
    
   # Person
   #
   df_importer.nodes_dataframe(
      df_persons2[["id", "label", "born", "name"]],
      id_column  = "id",
      id_space   = "Person"
      )
   #  Movie
   #
   df_importer.nodes_dataframe(
      df_movies2[["id", "label", "title", "tagline"]],
      id_column  = "id",
      id_space   = "Movie"
      )  
    
   #  DIRECTED
   #
   df_importer.edges_dataframe(
      df_directed2[["START_ID", "END_ID", "TYPE"]],
      source_id_space      = "Person",
      destination_id_space = "Movie",
      source_column        = "START_ID",
      destination_column   = "END_ID",
      type                 = "DIRECTED"
      )
   #  REVIEWED
   #
   df_importer.edges_dataframe(
      df_reviewed2[["START_ID", "END_ID", "TYPE", "RATING", "SUMMARY"]],
      source_id_space      = "Person",
      destination_id_space = "Movie",
      source_column        = "START_ID",
      destination_column   = "END_ID",
      type                 = "REVIEWED"
      )
   #  WROTE
   #
   df_importer.edges_dataframe(
      df_wrote2[["START_ID", "END_ID", "TYPE"]],
      source_id_space      = "Person",
      destination_id_space = "Movie",
      source_column        = "START_ID",
      destination_column   = "END_ID",
      type                 = "WROTE"
      )
   #  PRODUCED
   #
   df_importer.edges_dataframe(
      df_produced2[["START_ID", "END_ID", "TYPE"]],
      source_id_space      = "Person",
      destination_id_space = "Movie",
      source_column        = "START_ID",
      destination_column   = "END_ID",
      type                 = "PRODUCED"
      )
   #  FOLLOWS
   #
   df_importer.edges_dataframe(
      df_follows2[["START_ID", "END_ID", "TYPE"]],
      source_id_space      = "Person",
      destination_id_space = "Movie",
      source_column        = "START_ID",
      destination_column   = "END_ID",
      type                 = "FOLLOWS"
      )
   #  ACTEDIN
   #
   df_importer.edges_dataframe(
      df_actedin2[["START_ID", "END_ID", "TYPE", "ROLES"]],
      source_id_space      = "Person",
      destination_id_space = "Movie",
      source_column        = "START_ID",
      destination_column   = "END_ID",
      type                 = "ACTEDIN"
      )

   df_importer.execute()
    
    



          0/? [?op/s]


Host 0 errors:
Traceback (most recent call last):
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/worker/worker.py", line 86, in execute
    value = function(graph)
  File "/opt/conda/lib/python3.8/site-packages/katana_enterprise/remote/dataframe_importer/__init__.py", line 299, in remote_write_intermediate_parquet_files
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/remote/dataframe_importer/__init__.py", line 136, in _write_intermediate_parquet_files
    node_header = "\n".join(_write_dfs_to_parquet(node_df_infos, "nodes"))
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/remote/dataframe_importer/__init__.py", line 130, in _write_dfs_to_parquet
    dask.dataframe.to_parquet(
  File "/opt/miniconda/lib/python3.8/site-packages/dask/dataframe/io/parquet/core.py", line 963, in to_parquet
    out = out.compute(**compute_kwargs)
  File "/opt/miniconda/lib/python3.8/site-packages/dask/base.py", line 315, in compute
    (result

ValueError: Failed to convert partition to expected pyarrow schema:
    `ArrowTypeError("Expected bytes, got a 'int' object", 'Conversion failed for column id with type object')`

Expected partition schema:
    id: string
    label: string
    born: string
    name: string

Received partition schema:
    id: int64
    label: string
    born: int64
    name: string

This error *may* be resolved by passing in schema information for
the mismatched column(s) using the `schema` keyword in `to_parquet`.

In [None]:

l_result1 = my_graph.query("""

   MATCH (a) 
   WITH DISTINCT LABELS(a) AS temp, COUNT(a) AS tempCnt
   UNWIND temp AS label
   RETURN label, SUM(tempCnt) AS cnt
   ORDER BY label
   
   """)

display(print(l_result1))


In [None]:

l_result1 = my_graph.query("""

   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS temp, COUNT(r) AS tempCnt
   RETURN temp, tempCnt
   ORDER BY temp

   """)

display(print(l_result1))


In [17]:

result = my_graph.query("""

   MATCH (x) -[r]-> (a)
   RETURN x, r AS rel, a
   
   """,
   contextualize=True)

result.view()


          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…