In [None]:

#  Notebook to load the DeepCDR graph ..
#



#  Setup: Display options ..

In [None]:

%xmode Minimal

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate


print("--")


#  Setup: Connectivity ..

In [None]:

from katana import remote

my_client = remote.Client()

print(my_client)
print(my_client.server_version)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:
# ##################################################################

In [None]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name = l_database.name).graphs_in_database():
      l_handle = my_client.get_database(name = l_database.name).get_graph_by_id(id = l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ONE SPECIFIC GRAPH ONLY

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name = l_database.name).graphs_in_database():
      if ((l_database.name == DB_NAME) and (l_graph.name == GRAPH_NAME)):   
         l_handle = my_client.get_database(name = l_database.name).get_graph_by_id(id = l_graph.graph_id)
         l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name = l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE

my_database = my_client.create_database(name = DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPH
#
my_graph = my_client.get_database(name = DB_NAME).create_graph(name = GRAPH_NAME, num_partitions = NUM_PARTITIONS)

print(my_graph)


In [None]:

#  CONNECT TO GRAPH

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


In [None]:
# ##################################################################

In [None]:

#  Now that we share servers, get a better list of databases and graphs ..
#

for l_each_d in my_client.databases():
   print("Database Name: %-28s   Id: %s" % (l_each_d.name, l_each_d.database_id))
      #
   for l_each_g in my_client.get_database(name=l_each_d.name).graphs_in_database(): 
      print("   Graph Name: %-28s      Version: %-8d   Id: %s" % (l_each_g.name, l_each_g.version, l_each_g.graph_id))


print(""  )
print("--")


In [None]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


In [None]:

my_graph.query("CALL graph.schema() RETURN *")


#  Step 01: Process CSV files ..

In [None]:

#  Read the CSVs as Step 01 towards populating our graph
#

l_folder   = "./10_Data/"


#  Helper function, see what's going on 
#
def f_outputsomething(arg1, arg2):
   print("Dataframe: %-32s   Number of records: %d" % (arg2, len(arg1)))   
      #
   print(tabulate(arg1.head(2), headers='keys', tablefmt='psql'))
   print()
    

#  Nodes
#
df_N_cell_line                   = pd.read_csv(l_folder + "21_N_cell_lines.csv"                  , header = "infer", delimiter = ",")
f_outputsomething(df_N_cell_line, "Node, Cell Line")
df_N_gene                        = pd.read_csv(l_folder + "22_N_genes.csv"                       , header = "infer", delimiter = ",")
f_outputsomething(df_N_gene,      "Node, Gene")
df_N_gdsc                        = pd.read_csv(l_folder + "23_N_gdsc.csv"                        , header = "infer", delimiter = ",")
f_outputsomething(df_N_gdsc,      "Node, GDSC")
df_N_drug                        = pd.read_csv(l_folder + "24_N_drugs.csv"                       , header = "infer", delimiter = ",")
f_outputsomething(df_N_drug,      "Node, Drug")


#  Edges
#

df_E_gdsc_cell_line              = pd.read_csv(l_folder + "31_E_gdsc_cell_line.csv"              ,header = "infer", delimiter = ",")
f_outputsomething(df_E_gdsc_cell_line,             "Edge, GDSC Cell Line")
df_E_cell_line_gene_expression   = pd.read_csv(l_folder + "32_E_cell_line_gene_expression.csv"   ,header = "infer", delimiter = ",")
f_outputsomething(df_E_cell_line_gene_expression,  "Edge, Cell Line Expression")
df_E_cell_line_gene_methylation  = pd.read_csv(l_folder + "33_E_cell_line_gene_methylation.csv"  ,header = "infer", delimiter = ",")
f_outputsomething(df_E_cell_line_gene_methylation, "Edge, Cell Line Gene Methylation")
df_E_gdsc_drug                   = pd.read_csv(l_folder + "34_E_gdsc_drug.csv"                   ,header = "infer", delimiter = ",")
f_outputsomething(df_E_gdsc_drug,                  "Edge, GDSC Drug")
df_E_cell_line_gene_mutation     = pd.read_csv(l_folder + "35_E_cell_line_gene_mutation.csv"     ,header = "infer", delimiter = ",")
f_outputsomething(df_E_cell_line_gene_mutation,    "Edge, Cell Line Gene Mutation")

        
print("")
print("--")


In [31]:

#  At this point we have several Pandas DataFrames; import them into the graph-
#
#     Why use Pandas DataFrames versus Dask ?
#     
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


#  Just nodes
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_N_cell_line,
      id_column             = "id",
      id_space              = "CellLine",  
      label                 = "CellLine",  
      ) 
   df_importer.nodes_dataframe(
      df_N_gene,
      id_column             = "id",
      id_space              = "Gene",  
      label                 = "Gene",  
      ) 
   df_importer.nodes_dataframe(
      df_N_gdsc,
      id_column             = "id",
      id_space              = "GDSC",  
      label                 = "GDSC",  
      ) 
   df_importer.nodes_dataframe(
      df_N_drug,
      id_column             = "id",
      id_space              = "Drug",  
      label                 = "Drug",  
      ) 
   df_importer.insert()
    

#  Just edges
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_PatientVisitToEntityEdge_N, 
      source_id_space       = "PatientVisit", 
      destination_id_space  = "UmlsEntity",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "VISIT_CONTAINS"
      )
   df_importer.edges_dataframe(
      df_PatientVisitToEntityEdge_S, 
      source_id_space       = "UmlsEntity", 
      destination_id_space  = "PatientVisit",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "VISIT_CONTAINS"
      )
   df_importer.edges_dataframe(
      df_EntityToVocabularyEdge_N, 
      source_id_space       = "UmlsEntity", 
      destination_id_space  = "UmlsVocabulary",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "ALSO_CODED_AS"
      )
   df_importer.edges_dataframe(
      df_EntityToVocabularyEdge_S, 
      source_id_space       = "UmlsVocabulary", 
      destination_id_space  = "UmlsEntity",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "ALSO_CODED_AS"
      )

   df_importer.edges_dataframe(
      df_PatientVisitToMedicalSpecialtyEdge_N, 
      source_id_space       = "PatientVisit", 
      destination_id_space  = "MedicalSpecialty",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "IS_OF_SPECIALTY"
      )
   df_importer.edges_dataframe(
      df_PatientVisitToMedicalSpecialtyEdge_S, 
      source_id_space       = "MedicalSpecialty", 
      destination_id_space  = "PatientVisit",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "IS_OF_SPECIALTY"
      )

   df_importer.edges_dataframe(
      df_PatientVisitToKeywordEdge_N, 
      source_id_space       = "PatientVisit", 
      destination_id_space  = "Keyword",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "IS_OF_KEYWORD"
      )
   df_importer.edges_dataframe(
      df_PatientVisitToKeywordEdge_S, 
      source_id_space       = "Keyword", 
      destination_id_space  = "PatientVisit",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "IS_OF_KEYWORD"
      )
    
   df_importer.node_id_property_name("id")
   df_importer.insert()


      ###


print("")
print("")
   #
print("Number of nodes in frame: %d   Numbers of edges in frame: %d" % ( l_nodecnt,            l_edgecnt            ))
print("Number of nodes in graph: %d   Numbers of edges in graph: %d" % ( my_graph.num_nodes(), my_graph.num_edges() ))
   #
print("")
print("--")


#  Sample ouptut,
#
#     Number of nodes in frame: 75703   Numbers of edges in frame: 1145176
#     
#     Number of nodes in graph: 75703   Numbers of edges in graph: 1145176


NameError: name 'df_PatientVisitNode' is not defined