In [None]:

#  Notebook to load the DeepCDR graph ..
#



#  Setup: Display options ..

In [None]:

%xmode Minimal

#  Setting display options 

import pandas as pd
import numpy  as np
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate


print("--")


#  Setup: Connectivity ..

In [None]:

#  Create the remote client used by every later cell (my_client), then print
#  the client object and the server version it reports.
#  NOTE(review): Client() is called with no arguments, so the connection
#  target presumably comes from the environment -- confirm.
#
from katana import remote

my_client = remote.Client()

print(my_client)
print(my_client.server_version)


In [None]:

#  Names and sizing shared by the create / delete / connect cells below
#
NUM_PARTITIONS  = 3             #  passed as num_partitions when the graph is created
   #
DB_NAME         = "my_db"       #  database created and looked up by name
GRAPH_NAME      = "my_graph"    #  graph created and looked up by name

print("--")


In [None]:
# ##################################################################

In [None]:

#  DELETE ALL GRAPHS
#
#  Destructive: deletes every graph in every database the client lists.
#
#  The database handle is fetched once per database rather than twice per
#  graph, and the graph list is materialized up front so nothing is deleted
#  from a collection that is still being iterated.

for l_database in my_client.databases():
   l_db_handle = my_client.get_database(name = l_database.name)
      #
   for l_graph in list(l_db_handle.graphs_in_database()):
      l_handle = l_db_handle.get_graph_by_id(id = l_graph.graph_id)
      l_handle.delete()

#  List whatever graphs remain, as a visual check
#
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ONE SPECIFIC GRAPH ONLY
#
#  Deletes every graph named GRAPH_NAME inside the database named DB_NAME.
#  Nothing happens (and nothing is raised) when no such database or graph
#  exists.
#
#  Databases are filtered by name before their graphs are listed, so other
#  databases are never queried, and the database handle is fetched once.

for l_database in my_client.databases():
   if (l_database.name != DB_NAME):
      continue
      #
   l_db_handle = my_client.get_database(name = l_database.name)
      #
   for l_graph in list(l_db_handle.graphs_in_database()):
      if (l_graph.name == GRAPH_NAME):
         l_handle = l_db_handle.get_graph_by_id(id = l_graph.graph_id)
         l_handle.delete()

#  List whatever graphs remain, as a visual check
#
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES
#
#  Drops every database except the one named "default"; one "--" line is
#  printed per database dropped.

for l_database in my_client.databases():
   if (l_database.name == "default"):
      continue
      #
   l_doomed = my_client.get_database(name = l_database.name)
   l_doomed.delete_database()
   print("--")

#  Show the databases that are left
#
for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE
#
#  Creates a database named DB_NAME, keeps the returned handle in
#  my_database, and prints the new database's id.
#  NOTE(review): presumably this fails when DB_NAME already exists -- run a
#  delete cell first, or confirm the API behavior.

my_database = my_client.create_database(name = DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPH
#
#  Creates a graph named GRAPH_NAME with NUM_PARTITIONS partitions inside
#  the database named DB_NAME, and keeps the handle in my_graph for the
#  import and query cells.
#
my_graph = my_client.get_database(name = DB_NAME).create_graph(name = GRAPH_NAME, num_partitions = NUM_PARTITIONS)

print(my_graph)


In [None]:

#  CONNECT TO GRAPH
#
#  Looks up graphs named GRAPH_NAME in the database named DB_NAME and binds
#  the first match to my_graph.
#
#  The matches are collected into a list instead of star-unpacking into "_":
#     .  "_" is IPython's last-output variable, and rebinding it here would
#        shadow it for the rest of the session
#     .  an empty result now raises a ValueError that names the missing
#        graph, rather than an opaque unpacking ValueError

l_matches = list(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME))
   #
if (not l_matches):
   raise ValueError("No graph named %r found in database %r" % (GRAPH_NAME, DB_NAME))

my_graph = l_matches[0]

print(my_graph)


In [None]:
# ##################################################################

In [None]:

#  Servers are shared now, so print a fuller inventory: one line per
#  database, followed by one indented line per graph in that database.
#

for l_each_d in my_client.databases():
   l_db_row = (l_each_d.name, l_each_d.database_id)
   print("Database Name: %-28s   Id: %s" % l_db_row)
      #
   l_db_handle = my_client.get_database(name=l_each_d.name)
   for l_each_g in l_db_handle.graphs_in_database():
      l_graph_row = (l_each_g.name, l_each_g.version, l_each_g.graph_id)
      print("   Graph Name: %-28s      Version: %-8d   Id: %s" % l_graph_row)


print(""  )
print("--")


#  Step 01: Process CSV files ..

In [None]:

#  Read the CSVs as Step 01 towards populating our graph
#


l_folder   = "./10_Data/"
   #

#  NOTE(review): this cell was left half-edited (an unterminated l_nodes
#  literal followed by a bare list of file names), so it could not run and
#  none of the df_N_* / df_E_* frames used by the import cell were defined.
#  It is rebuilt here from that file list. Assumptions to confirm:
#
#     .  The .txt.gz files are comma-delimited with a header row, like the
#        single read_csv() call that survived in the draft
#     .  Node files carry an "id" column; edge files carry START_ID and
#        END_ID (the column names the import cell asks for)
#     .  The 25 cell_line_gene_mutation parts (41..65) share one layout and
#        belong in a single DataFrame
#     .  f_outputsomething() is defined in an earlier cell; its last two
#        arguments are (True, False) for nodes as in the draft, and
#        (True, True) for edges by analogy with the count cell
#


def f_read_frame(i_file, i_id_columns):
   """
   Read one delimited file from l_folder into a Pandas DataFrame.

   i_file         File name relative to l_folder. Compression is left to
                  Pandas, which infers gzip from a ".gz" extension.
   i_id_columns   Column names to read as str, so identifiers are kept as
                  text rather than being parsed as numbers.

   Returns the DataFrame produced by pd.read_csv().
   """
   return pd.read_csv(
      l_folder + i_file,
      header    = "infer",
      delimiter = ",",
      dtype     = { l_column: str for l_column in i_id_columns },
      )


#  One entry per node file; order matters, it is unpacked by position below
#
l_nodes    = [
   { "l_file": "21_drug.txt.gz",      "l_label": "Node, Drug"      },
   { "l_file": "22_gdsc.txt.gz",      "l_label": "Node, GDSC"      },
   { "l_file": "23_cell_line.txt.gz", "l_label": "Node, Cell Line" },
   { "l_file": "24_gene.txt.gz",      "l_label": "Node, Gene"      },
   ]

#  One entry per edge type; an edge type may be split over several files
#
l_edges    = [
   { "l_files": [ "31_gdsc_drug.txt.gz"                  ], "l_label": "Edge, GDSC Drug"                  },
   { "l_files": [ "32_gdsc_cell_line.txt.gz"             ], "l_label": "Edge, GDSC Cell Line"             },
   { "l_files": [ "33_cell_line_gene_expression.txt.gz"  ], "l_label": "Edge, Cell Line Gene Expression"  },
   { "l_files": [ "34_cell_line_gene_methylation.txt.gz" ], "l_label": "Edge, Cell Line Gene Methylation" },
   { "l_files": [ "%d_cell_line_gene_mutation.txt.gz" % l_part for l_part in range(41, 66) ],
                                                            "l_label": "Edge, Cell Line Gene Mutation"    },
   ]


l_node_frames = [ f_read_frame(l_each["l_file"], ["id"]) for l_each in l_nodes ]
   #
df_N_drug, df_N_gdsc, df_N_cell_line, df_N_gene = l_node_frames

#  Multi-part edge types are stitched into one frame with a fresh index
#
l_edge_frames = [
   pd.concat(
      [ f_read_frame(l_file, ["START_ID", "END_ID"]) for l_file in l_each["l_files"] ],
      ignore_index = True,
      )
   for l_each in l_edges
   ]
   #
(
   df_E_gdsc_drug,
   df_E_gdsc_cell_line,
   df_E_cell_line_gene_expression,
   df_E_cell_line_gene_methylation,
   df_E_cell_line_gene_mutation,
) = l_edge_frames


for l_each, l_frame in zip(l_nodes, l_node_frames):
   f_outputsomething(l_frame, l_each["l_label"], True, False)

for l_each, l_frame in zip(l_edges, l_edge_frames):
   f_outputsomething(l_frame, l_each["l_label"], True, True )

print("")
print("--")
    
    
    

In [None]:

#  At this point we have several Pandas DataFrames; import them into the graph.
#
#     Why use Pandas DataFrames versus Dask ?
#     
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


#  Nodes first: every node DataFrame is keyed on its "id" column, and each
#  one uses the same string for both its id space and its label.
#
l_node_sets = [
   ( df_N_cell_line, "CELL_LINE" ),
   ( df_N_gene,      "GENE"      ),
   ( df_N_gdsc,      "GDSC"      ),
   ( df_N_drug,      "DRUG"      ),
   ]

with import_data.DataFrameImporter(my_graph) as df_importer:
   for l_frame, l_space in l_node_sets:
      df_importer.nodes_dataframe(
         l_frame,
         id_column             = "id",
         id_space              = l_space,
         label                 = l_space,
         )
      #
   df_importer.insert()
    

#  Edges next: each edge DataFrame is registered twice, once START_ID ->
#  END_ID and once reversed, with the same edge type in both directions.
#
#     ( DataFrame, id space of START_ID, id space of END_ID, edge type )
#
l_edge_sets = [
   ( df_E_gdsc_cell_line,             "GDSC",      "CELL_LINE", "HAS_CELL_LINE"               ),
   ( df_E_cell_line_gene_expression,  "CELL_LINE", "GENE",      "HAS_EXPRESSION_OBSERVATION"  ),
   ( df_E_cell_line_gene_methylation, "CELL_LINE", "GENE",      "HAS_METHYLATION_OBSERVATION" ),
   ( df_E_gdsc_drug,                  "GDSC",      "DRUG",      "FOR_DRUG"                    ),
   ( df_E_cell_line_gene_mutation,    "CELL_LINE", "GENE",      "HAS_MUTATION_OBSERVATION"    ),
   ]

with import_data.DataFrameImporter(my_graph) as df_importer:
   for l_frame, l_start_space, l_end_space, l_type in l_edge_sets:
         #
         #  Forward direction
         #
      df_importer.edges_dataframe(
         l_frame,
         source_id_space       = l_start_space,
         destination_id_space  = l_end_space,
         source_column         = "START_ID",
         destination_column    = "END_ID",
         type                  = l_type
         )
         #
         #  Reverse direction, same frame with the columns swapped
         #
      df_importer.edges_dataframe(
         l_frame,
         source_id_space       = l_end_space,
         destination_id_space  = l_start_space,
         source_column         = "END_ID",
         destination_column    = "START_ID",
         type                  = l_type
         )

   df_importer.node_id_property_name("id")
   df_importer.insert()

print("")
print("--")


In [None]:

#  Check all of the above by running counts: the DataFrame summaries are
#  printed next to the per-label / per-type counts the graph reports.
#

#  Node count per label, as seen by the graph
#
l_query  = """
   MATCH (n) 
   WITH DISTINCT LABELS(n) AS temp, COUNT(n) AS tempCnt
   UNWIND temp AS label
   RETURN label, SUM(tempCnt) AS cnt
   ORDER BY label
   """
      #
l_result1 = my_graph.query_unpaginated(l_query)

#  Edge count per relationship type, as seen by the graph
#
l_query  = """
   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS label, COUNT(r) AS cnt
   RETURN label, cnt
   ORDER BY label
   """
      #
l_result2 = my_graph.query_unpaginated(l_query)

l_node_checks = [
   ( df_N_cell_line,                  "Node, Cell Line"                  ),
   ( df_N_drug,                       "Node, Drug"                       ),
   ( df_N_gdsc,                       "Node, GDSC"                       ),
   ( df_N_gene,                       "Node, Gene"                       ),
   ]
l_edge_checks = [
   ( df_E_gdsc_drug,                  "Edge, GDSC Drug"                  ),
   ( df_E_gdsc_cell_line,             "Edge, GDSC Cell Line"             ),
   ( df_E_cell_line_gene_expression,  "Edge, Cell Line Gene Expression"  ),
   ( df_E_cell_line_gene_methylation, "Edge, Cell Line Gene Methylation" ),
   ( df_E_cell_line_gene_mutation,    "Edge, Cell Line Gene Mutation"    ),
   ]

print()

for l_frame, l_title in l_node_checks:
   f_outputsomething(l_frame, l_title, False, False)
   #
print()
   #
for l_each in l_result1.itertuples():
   l_row = (l_each.label, l_each.cnt)
   print("From Graph.. Node, %-38s   Number of records: %d" % l_row)

print()
print()

for l_frame, l_title in l_edge_checks:
   f_outputsomething(l_frame, l_title, False, True )
   #
print()
   #
for l_each in l_result2.itertuples():
   l_row = (l_each.label, l_each.cnt)
   print("From Graph.. Edge, %-38s   Number of records: %d" % l_row)

print()
print("--")


In [None]:

#  Output the schema from our graph
#
#  Calls the graph.schema() procedure through the query API and prints
#  whatever result object comes back.
#

l_result = my_graph.query_unpaginated("CALL graph.schema() RETURN *")
print(l_result)

print()
print("--")
