In [None]:

#  Notebook to load the DeepCDR graph ..
#



#  Setup: Display options ..

In [60]:

%xmode Minimal

#  Setting display options 

import pandas as pd
import numpy  as np
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate


print("--")


Exception reporting mode: Minimal


--


#  Setup: Connectivity ..

In [None]:

from katana import remote

my_client = remote.Client()

print(my_client)
print(my_client.server_version)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:
# ##################################################################

In [None]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name = l_database.name).graphs_in_database():
      l_handle = my_client.get_database(name = l_database.name).get_graph_by_id(id = l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ONE SPECIFIC GRAPH ONLY

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name = l_database.name).graphs_in_database():
      if ((l_database.name == DB_NAME) and (l_graph.name == GRAPH_NAME)):   
         l_handle = my_client.get_database(name = l_database.name).get_graph_by_id(id = l_graph.graph_id)
         l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name = l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE

my_database = my_client.create_database(name = DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPH
#
my_graph = my_client.get_database(name = DB_NAME).create_graph(name = GRAPH_NAME, num_partitions = NUM_PARTITIONS)

print(my_graph)


In [None]:

#  CONNECT TO GRAPH

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


In [None]:
# ##################################################################

In [None]:

#  Now that we share servers, get a better list of databases and graphs ..
#

for l_each_d in my_client.databases():
   print("Database Name: %-28s   Id: %s" % (l_each_d.name, l_each_d.database_id))
      #
   for l_each_g in my_client.get_database(name=l_each_d.name).graphs_in_database(): 
      print("   Graph Name: %-28s      Version: %-8d   Id: %s" % (l_each_g.name, l_each_g.version, l_each_g.graph_id))


print(""  )
print("--")


In [56]:

display(my_graph.num_nodes())
display(my_graph.num_edges())


          0/? [?op/s]

2666

          0/? [?op/s]

4575807

In [None]:

my_graph.query("CALL graph.schema() RETURN *")


#  Step 01: Process CSV files ..

In [64]:

#  Read the CSVs as Step 01 towards populating our graph
#

l_folder   = "./10_Data/"


#  Helper function, see what's going on 
#
def f_outputsomething(arg1, arg2):
   print("Dataframe: %-32s   Number of records: %d" % (arg2, len(arg1)))   
      #
   print(tabulate(arg1.head(2), headers='keys', tablefmt='psql'))
   print()
    

#  Nodes
#
df_N_cell_line                   = pd.read_csv(l_folder + "21_N_cell_lines.csv"                  , header = "infer", delimiter = ",", dtype = [ ("id", np.unicode), ("tcga_code", np.unicode) ] )
f_outputsomething(df_N_cell_line, "Node, Cell Line")
df_N_gene                        = pd.read_csv(l_folder + "22_N_genes.csv"                       , header = "infer", delimiter = ",", dtype = [ ("id", np.unicode) ] )
f_outputsomething(df_N_gene,      "Node, Gene")
df_N_gdsc                        = pd.read_csv(l_folder + "23_N_gdsc.csv"                        , header = "infer", delimiter = ",", dtype = [ ("id", np.unicode) ] )
f_outputsomething(df_N_gdsc,      "Node, GDSC")
df_N_drug                        = pd.read_csv(l_folder + "24_N_drugs.csv"                       , header = "infer", delimiter = ",", dtype = [ ("id", np.unicode), ("smiles", np.unicode)    ] )
f_outputsomething(df_N_drug,      "Node, Drug")


#  Edges
#
df_E_gdsc_cell_line              = pd.read_csv(l_folder + "31_E_gdsc_cell_line.csv"              ,header = "infer", delimiter = ","
f_outputsomething(df_E_gdsc_cell_line,             "Edge, GDSC Cell Line")
df_E_cell_line_gene_expression   = pd.read_csv(l_folder + "32_E_cell_line_gene_expression.csv"   ,header = "infer", delimiter = ",")
f_outputsomething(df_E_cell_line_gene_expression,  "Edge, Cell Line Expression")
df_E_cell_line_gene_methylation  = pd.read_csv(l_folder + "33_E_cell_line_gene_methylation.csv"  ,header = "infer", delimiter = ",")
f_outputsomething(df_E_cell_line_gene_methylation, "Edge, Cell Line Gene Methylation")
df_E_gdsc_drug                   = pd.read_csv(l_folder + "34_E_gdsc_drug.csv"                   ,header = "infer", delimiter = ",")
f_outputsomething(df_E_gdsc_drug,                  "Edge, GDSC Drug")
df_E_cell_line_gene_mutation     = pd.read_csv(l_folder + "35_E_cell_line_gene_mutation.csv"     ,header = "infer", delimiter = ",")
f_outputsomething(df_E_cell_line_gene_mutation,    "Edge, Cell Line Gene Mutation")

        
print("")
print("--")


Dataframe: Node, Cell Line                    Number of records: 1461
+----+------------+-------------+
|    | id         | tcga_code   |
|----+------------+-------------|
|  0 | ACH-000698 | SCLC        |
|  1 | ACH-000489 | COAD/READ   |
+----+------------+-------------+

Dataframe: Node, Gene                         Number of records: 719
+----+-------+
|    | id    |
|----+-------|
|  0 | AKT2  |
|  1 | EPAS1 |
+----+-------+

Dataframe: Node, GDSC                         Number of records: 266
+----+-----------+
|    | id        |
|----+-----------|
|  0 | GDSC:1    |
|  1 | GDSC:1001 |
+----+-----------+

Dataframe: Node, Drug                         Number of records: 238
+----+---------+------------------------------------------------+
|    |      id | smiles                                         |
|----+---------+------------------------------------------------|
|  0 | 9863776 | COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O |
|  1 |    3385 | O=c1[nH]cc(F)c(=O)[nH]1        

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df_N_cell_line                   = pd.read_csv(l_folder + "21_N_cell_lines.csv"                  , header = "infer", delimiter = ",", dtype = [ ("id", np.unicode), ("tcga_code", np.unicode) ] )
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df_N_gene                        = pd.read_csv(l_folder + "22_N_genes.csv"                       , header = "infer", delimiter = ",", dtype = [ ("id", np.unicode) ] )
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df_N_gdsc                        = pd.read_csv(l_folder + "23_N_gdsc.csv"                        , header = "infer", delimiter = ",", dtype = [ ("id", np.unicode) ] )
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-no

In [50]:

#  At this point we have several Pandas DataFrames; import them into the graph-
#
#     Why use Pandas DataFrames versus Dask ?
#     
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


#  Just nodes
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_N_cell_line,
      id_column             = "id",
      id_space              = "CELL_LINE",  
      label                 = "CELL_LINE",  
      ) 
   df_importer.nodes_dataframe(
      df_N_gene,
      id_column             = "id",
      id_space              = "GENE",  
      label                 = "GENE",  
      ) 
   df_importer.nodes_dataframe(
      df_N_gdsc,
      id_column             = "id",
      id_space              = "GDSC",  
      label                 = "GDSC",  
      ) 
   df_importer.nodes_dataframe(
      df_N_drug,
      id_column             = "id",
      id_space              = "DRUG",  
      label                 = "DRUG",  
      ) 
   df_importer.insert()
    

#  Just edges
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_E_gdsc_cell_line, 
      source_id_space       = "GDSC", 
      destination_id_space  = "CELL_LINE",   
      source_column         = "START_ID",
      destination_column    = "END_ID",
      type                  = "HAS_CELL_LINE"
      )
   df_importer.edges_dataframe(
      df_E_gdsc_cell_line, 
      source_id_space       = "CELL_LINE", 
      destination_id_space  = "GDSC",   
      source_column         = "END_ID",
      destination_column    = "START_ID",
      type                  = "HAS_CELL_LINE"
      )

   df_importer.edges_dataframe(
      df_E_cell_line_gene_expression, 
      source_id_space       = "CELL_LINE", 
      destination_id_space  = "GENE",   
      source_column         = "START_ID",
      destination_column    = "END_ID",
      type                  = "HAS_EXPRESSION_OBSERVATION"
      )
   df_importer.edges_dataframe(
      df_E_cell_line_gene_expression, 
      source_id_space       = "CELL_LINE", 
      destination_id_space  = "GENE",   
      source_column         = "START_ID",
      destination_column    = "END_ID",
      type                  = "HAS_EXPRESSION_OBSERVATION"
      )
    
   df_importer.edges_dataframe(
      df_E_cell_line_gene_methylation, 
      source_id_space       = "CELL_LINE", 
      destination_id_space  = "GENE",   
      source_column         = "START_ID",
      destination_column    = "END_ID",
      type                  = "HAS_METHYLATION_OBSERVATION"
      )
    
   df_importer.edges_dataframe(
      df_E_gdsc_drug, 
      source_id_space       = "GDSC", 
      destination_id_space  = "DRUG",   
      source_column         = "START_ID",
      destination_column    = "END_ID",
      type                  = "FOR_DRUG"
      )
    
   df_importer.edges_dataframe(
      df_E_cell_line_gene_mutation, 
      source_id_space       = "CELL_LINE", 
      destination_id_space  = "GENE",   
      source_column         = "START_ID",
      destination_column    = "END_ID",
      type                  = "HAS_MUTATION_OBSERVATION"
      )
    
   df_importer.node_id_property_name("id")
   df_importer.insert()


      ###

        
        
        
        
        

print("")
print("--")




          0/? [?op/s]

          0/? [?op/s]


--


In [55]:

l_numnodes = my_graph.num_nodes()
l_numedges = my_graph.num_edges()


print("Number of Nodes in DataFrame: %d" % ( len(df_N_cell_line) +  len(df_N_gene) +  len(df_N_gdsc) +  len(df_N_drug) )  )
print("Number of Nodes in Graph:     %d" % ( l_numnodes ) )

print("Number of Nodes in DataFrame: %d" % (  ( len(df_E_gdsc_cell_line) + len(df_E_cell_line_gene_expression) + len(df_E_cell_line_gene_methylation) + len(df_E_gdsc_drug) + len(df_E_cell_line_gene_mutation) ) * 2 )  )


print("Number of Edges in Graph:     %d" % ( l_numedges ) )


          0/? [?op/s]

          0/? [?op/s]

Number of Nodes in DataFrame: 2684
Number of Nodes in Graph:     2666
Number of Nodes in DataFrame: 2885142
Number of Edges in Graph:     4575807
