In [None]:

#  Notebook to load the DeepCDR graph ..
#



#  Setup: Display options ..

In [1]:

#  Use the Minimal traceback verbosity for exception reports
%xmode Minimal

#  Setting display options

import pandas as pd
import numpy  as np
   #
#  Set the pandas display width to 480
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
#  NOTE(review): the HTML payload below is an empty string, so this call
#  renders nothing; presumably a <style> block was lost on export -- confirm
from IPython.display import display, HTML
display(HTML(""))

#  tabulate is used further down to print DataFrame previews as text tables
from tabulate import tabulate


#  Marker line, so the cell shows some output once it has run
print("--")


Exception reporting mode: Minimal


--


#  Setup: Connectivity ..

In [2]:

from katana import remote

#  Create the client; no arguments are passed, so its defaults apply
my_client = remote.Client()

#  Print the client handle first, then its server_version attribute
print(my_client)
print(my_client.server_version)


<katana.remote.sync_wrappers.Client object at 0x7fad1875f6a0>
0.9.0+20230315T145956Z.dbe1b6788.dev


In [3]:

#  Settings shared by the database / graph cells below
#
NUM_PARTITIONS = 3             #  passed as num_partitions when the graph is created
DB_NAME        = "my_db"       #  name of the database to create / look up
GRAPH_NAME     = "my_graph"    #  name of the graph to create / look up

print("--")


--


In [None]:
# ##################################################################

In [None]:

#  DELETE ALL GRAPHS
#
#  WARNING: this removes every graph in every database the client lists,
#  not only GRAPH_NAME in DB_NAME.  Run it deliberately, never as part of
#  a blanket "Run All".

for l_database in my_client.databases():
   #  Resolve the database handle once per database; it serves both the
   #  graph listing and the per-graph lookup below
   l_db_handle = my_client.get_database(name = l_database.name)

   for l_graph in l_db_handle.graphs_in_database():
      l_handle = l_db_handle.get_graph_by_id(id = l_graph.graph_id)
      l_handle.delete()

#  List whatever graphs the client still reports
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [62]:

#  DELETE ONE SPECIFIC GRAPH ONLY
#
#  Deletes every graph named GRAPH_NAME that lives in a database named
#  DB_NAME; nothing happens when there is no such database or graph.

for l_database in my_client.databases():
   #  Filter on the database first, so the graphs of unrelated databases
   #  are never listed
   if (l_database.name != DB_NAME):
      continue

   #  One handle per matching database, reused for listing and lookup
   l_db_handle = my_client.get_database(name = l_database.name)

   for l_graph in l_db_handle.graphs_in_database():
      if (l_graph.name == GRAPH_NAME):
         l_handle = l_db_handle.get_graph_by_id(id = l_graph.graph_id)
         l_handle.delete()

#  List whatever graphs the client still reports
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


GRAPH ID:  229orVsMaf2pRjWaNumXo1GdhCA4wRMptymSZxGG6Tyz       GRAPH Version:  11
--


In [None]:

#  DELETE ALL DATABASES
#
#  Every database is deleted except the one named "default"; a marker
#  line is printed after each deletion

for l_database in my_client.databases():
   if (l_database.name == "default"):
      continue
   my_client.get_database(name = l_database.name).delete_database()
   print("--")

#  Show the databases the client still reports
for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE
#
#  Creates the database named by DB_NAME, keeps the returned object in
#  my_database and prints its id

my_database = my_client.create_database(name = DB_NAME)

l_new_database_id = my_database.database_id
print(l_new_database_id)


In [5]:

#  CREATE GRAPH
#
#  Creates the graph GRAPH_NAME, with NUM_PARTITIONS partitions, inside
#  the database DB_NAME and keeps the returned handle in my_graph
#
l_target_db = my_client.get_database(name = DB_NAME)
my_graph    = l_target_db.create_graph(
   name           = GRAPH_NAME,
   num_partitions = NUM_PARTITIONS,
   )

print(my_graph)


<_Graph my_graph, J15pRudKS2xx7F7v2P6Ae5uKiB4G8ui6AmkY7XhGGK7h, 0>


In [None]:

#  CONNECT TO GRAPH
#
#  Looks up the graphs named GRAPH_NAME inside DB_NAME and binds the first
#  match to my_graph; any further matches are ignored.

l_matches = list(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME))

#  Fail with a readable message rather than the bare unpacking error an
#  empty result would otherwise produce (still a ValueError)
if (not l_matches):
   raise ValueError("No graph named '%s' found in database '%s'" % (GRAPH_NAME, DB_NAME))

my_graph = l_matches[0]

print(my_graph)


In [None]:
# ##################################################################

In [6]:

#  Now that we share servers, get a better list of databases and graphs ..
#
#  One line per database, followed by one indented line per graph in it

for l_db in my_client.databases():
   print("Database Name: %-28s   Id: %s" % (l_db.name, l_db.database_id))

   l_graphs = my_client.get_database(name=l_db.name).graphs_in_database()
   for l_g in l_graphs:
      l_fields = (l_g.name, l_g.version, l_g.graph_id)
      print("   Graph Name: %-28s      Version: %-8d   Id: %s" % l_fields)


print(""  )
print("--")


Database Name: default                        Id: 5RNXXsxxurFyGoT1bfGu1o58RxjgpHdGi2SPV6UBH9Xu
Database Name: my_db                          Id: A1W7oi2f14CGMdaCaRrUDZUrFrUWSp67yZ9n22SxRfe5
   Graph Name: my_graph_BB                       Version: 48         Id: 7JfcadZu8CmgPwwCfzQ92FXjh7QZYsHRToFJDtLSgPMd
   Graph Name: my_graph                          Version: 0          Id: J15pRudKS2xx7F7v2P6Ae5uKiB4G8ui6AmkY7XhGGK7h

--


#  Step 01: Process CSV files ..

In [64]:

#  Read the CSVs as Step 01 towards populating our graph
#

#  NOTE(review): `utils` is not referenced anywhere in the visible cells --
#  confirm whether this import is still needed
from src import utils

#  Relative folder prefix; the CSV file names below are appended to it
l_folder   = "./10_Data/"

#  Helper function, see what's going on 
#
def f_outputsomething(arg1, arg2, arg3, arg4):
   """
   Print a one-line record count for a dataframe, optionally with a preview.

   arg1 ..  object to report on; len() is taken of it, and .head(2) is
            called on it when arg3 is set
   arg2 ..  caption printed next to the count
   arg3 ..  when truthy, also print the first two rows as a psql-style
            table followed by a blank line
   arg4 ..  when truthy, report twice the length instead of the length
            (the count-check cell passes this for the edge dataframes,
            which are imported once per direction)

   Returns None; the function only prints.
   """
   l_count = (len(arg1) * 2) if (arg4) else len(arg1)
   print("Dataframe... %-44s   Number of records: %d" % (arg2, l_count))

   #  No preview requested, nothing more to do
   if (not arg3):
      return

   print(tabulate(arg1.head(2), headers='keys', tablefmt='psql'))
   print()
    

#  Nodes
#
#  The listed columns are read as strings.  dtype is passed as a
#  {column name: type} mapping, the per-column form the pandas read_csv
#  documentation describes, so each type is tied to its column by name
#  instead of by position in a list of tuples.
#
df_N_cell_line                   = pd.read_csv(l_folder + "21_N_cell_lines.csv"                  , header = "infer", delimiter = ",", dtype = { "id": str, "tcga_code": str } )
f_outputsomething(df_N_cell_line, "Node, Cell Line",                                   True, False)
df_N_gene                        = pd.read_csv(l_folder + "22_N_genes.csv"                       , header = "infer", delimiter = ",", dtype = { "id": str } )
f_outputsomething(df_N_gene,      "Node, Gene",                                        True, False)
df_N_gdsc                        = pd.read_csv(l_folder + "23_N_gdsc.csv"                        , header = "infer", delimiter = ",", dtype = { "id": str } )
f_outputsomething(df_N_gdsc,      "Node, GDSC",                                        True, False)
df_N_drug                        = pd.read_csv(l_folder + "24_N_drugs.csv"                       , header = "infer", delimiter = ",", dtype = { "id": str, "smiles": str } )
f_outputsomething(df_N_drug,      "Node, Drug",                                        True, False)


#  Two blank lines between the node previews and the edge previews
print()
print()


#  Edges
#
#  NOTE(review): dtype is given as a list of (name, type) pairs rather than
#  the {column: type} mapping described in the pandas documentation --
#  confirm the names and their order match the CSV headers

#  Column types shared by the three observation edge files
l_obs_dtype = [ ("START_ID", str), ("END_ID", str), ("OBSERVATION", np.double) ]

#  Needed a dropna() here, else data would not load
#
df_E_gdsc_cell_line = pd.read_csv(
   l_folder + "31_E_gdsc_cell_line.csv",
   header    = "infer",
   delimiter = ",",
   dtype     = [ ("START_ID", str), ("END_ID", str), ("LABEL", str) ],
   ).dropna()
f_outputsomething(df_E_gdsc_cell_line, "Edge, GDSC Cell Line", True, False)

df_E_cell_line_gene_expression = pd.read_csv(
   l_folder + "32_E_cell_line_gene_expression.csv",
   header    = "infer",
   delimiter = ",",
   dtype     = l_obs_dtype,
   )
f_outputsomething(df_E_cell_line_gene_expression, "Edge, Cell Line Gene Expression", True, False)

df_E_cell_line_gene_methylation = pd.read_csv(
   l_folder + "33_E_cell_line_gene_methylation.csv",
   header    = "infer",
   delimiter = ",",
   dtype     = l_obs_dtype,
   )
f_outputsomething(df_E_cell_line_gene_methylation, "Edge, Cell Line Gene Methylation", True, False)

df_E_gdsc_drug = pd.read_csv(
   l_folder + "34_E_gdsc_drug.csv",
   header    = "infer",
   delimiter = ",",
   dtype     = [ ("START_ID", str), ("END_ID", str) ],
   )
f_outputsomething(df_E_gdsc_drug, "Edge, GDSC Drug", True, False)

df_E_cell_line_gene_mutation = pd.read_csv(
   l_folder + "35_E_cell_line_gene_mutation.csv",
   header    = "infer",
   delimiter = ",",
   dtype     = l_obs_dtype,
   )
f_outputsomething(df_E_cell_line_gene_mutation, "Edge, Cell Line Gene Mutation", True, False)


print("")
print("--")


Dataframe... Node, Cell Line                                Number of records: 1461
+----+------------+-------------+
|    | id         | tcga_code   |
|----+------------+-------------|
|  0 | ACH-000698 | SCLC        |
|  1 | ACH-000489 | COAD/READ   |
+----+------------+-------------+

Dataframe... Node, Gene                                     Number of records: 719
+----+-------+
|    | id    |
|----+-------|
|  0 | AKT2  |
|  1 | EPAS1 |
+----+-------+

Dataframe... Node, GDSC                                     Number of records: 266
+----+-----------+
|    | id        |
|----+-----------|
|  0 | GDSC:1    |
|  1 | GDSC:1001 |
+----+-----------+

Dataframe... Node, Drug                                     Number of records: 238
+----+---------+------------------------------------------------+
|    |      id | smiles                                         |
|----+---------+------------------------------------------------|
|  0 | 9863776 | COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)

In [65]:

#  At this point we have several Pandas DataFrames; import them into the graph.
#
#     Why use Pandas DataFrames versus Dask ?
#     
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


#  Just nodes
#
#  Each node dataframe is keyed on its "id" column; for every node type the
#  id space and the label carry the same name
with import_data.DataFrameImporter(my_graph) as df_importer:
   for l_frame, l_kind in (
         (df_N_cell_line, "CELL_LINE"),
         (df_N_gene,      "GENE"     ),
         (df_N_gdsc,      "GDSC"     ),
         (df_N_drug,      "DRUG"     ),
         ):
      df_importer.nodes_dataframe(
         l_frame,
         id_column = "id",
         id_space  = l_kind,
         label     = l_kind,
         )
   df_importer.insert()
    

#  Just edges
#
#  Every edge dataframe is registered twice under the same edge type:
#  first START_ID -> END_ID, then the reverse, END_ID -> START_ID
with import_data.DataFrameImporter(my_graph) as df_importer:
   l_edge_specs = (
      #  dataframe                       START_ID space  END_ID space  edge type
      (df_E_gdsc_cell_line,              "GDSC",         "CELL_LINE",  "HAS_CELL_LINE"              ),
      (df_E_cell_line_gene_expression,   "CELL_LINE",    "GENE",       "HAS_EXPRESSION_OBSERVATION" ),
      (df_E_cell_line_gene_methylation,  "CELL_LINE",    "GENE",       "HAS_METHYLATION_OBSERVATION"),
      (df_E_gdsc_drug,                   "GDSC",         "DRUG",       "FOR_DRUG"                   ),
      (df_E_cell_line_gene_mutation,     "CELL_LINE",    "GENE",       "HAS_MUTATION_OBSERVATION"   ),
      )

   for l_frame, l_start_space, l_end_space, l_type in l_edge_specs:
      #  Forward direction
      df_importer.edges_dataframe(
         l_frame,
         source_id_space      = l_start_space,
         destination_id_space = l_end_space,
         source_column        = "START_ID",
         destination_column   = "END_ID",
         type                 = l_type,
         )
      #  Reverse direction: spaces and columns swapped, same type
      df_importer.edges_dataframe(
         l_frame,
         source_id_space      = l_end_space,
         destination_id_space = l_start_space,
         source_column        = "END_ID",
         destination_column   = "START_ID",
         type                 = l_type,
         )

   df_importer.node_id_property_name("id")
   df_importer.insert()

print("")
print("--")


          0/? [?op/s]

          0/? [?op/s]


--


In [66]:

#  Check all of the above, run counts
#
#  The dataframe lengths are printed next to the node / edge counts read
#  back from the graph, so the two can be compared by eye.  Edge dataframes
#  are reported at twice their length because each one is imported once
#  per direction.
#
#  NOTE(review): in the recorded output the graph holds fewer CELL_LINE and
#  DRUG nodes than the dataframes have rows (1458 vs 1461, 223 vs 238) --
#  presumably repeated ids in the CSVs; confirm
#

#  The queries are plain string literals: they take no substitutions, and
#  routing them through .format() would make any literal brace added to
#  the Cypher later raise

#  Node count per label
l_query  = """
   MATCH (n) 
   WITH DISTINCT LABELS(n) AS temp, COUNT(n) AS tempCnt
   UNWIND temp AS label
   RETURN label, SUM(tempCnt) AS cnt
   ORDER BY label
   """
      #
l_result1 = my_graph.query_unpaginated(l_query)

#  Edge count per relationship type
l_query  = """
   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS label, COUNT(r) AS cnt
   RETURN label, cnt
   ORDER BY label
   """
      #
l_result2 = my_graph.query_unpaginated(l_query)

print()

f_outputsomething(df_N_cell_line,                  "Node, Cell Line"                 , False, False)
f_outputsomething(df_N_drug,                       "Node, Drug"                      , False, False)
f_outputsomething(df_N_gdsc,                       "Node, GDSC"                      , False, False)
f_outputsomething(df_N_gene,                       "Node, Gene"                      , False, False)
   #
print()
   #
for l_each in l_result1.itertuples():
   print("From Graph.. Node, %-38s   Number of records: %d" % (l_each.label, l_each.cnt))   

print()
print()

f_outputsomething(df_E_gdsc_drug,                  "Edge, GDSC Drug"                 , False, True )
f_outputsomething(df_E_gdsc_cell_line,             "Edge, GDSC Cell Line"            , False, True )
f_outputsomething(df_E_cell_line_gene_expression,  "Edge, Cell Line Gene Expression" , False, True )
f_outputsomething(df_E_cell_line_gene_methylation, "Edge, Cell Line Gene Methylation", False, True )
f_outputsomething(df_E_cell_line_gene_mutation,    "Edge, Cell Line Gene Mutation"   , False, True )
   #
print()
   #
for l_each in l_result2.itertuples():
   print("From Graph.. Edge, %-38s   Number of records: %d" % (l_each.label, l_each.cnt))   

print()
print("--")


          0/? [?op/s]

          0/? [?op/s]


Dataframe... Node, Cell Line                                Number of records: 1461
Dataframe... Node, Drug                                     Number of records: 238
Dataframe... Node, GDSC                                     Number of records: 266
Dataframe... Node, Gene                                     Number of records: 719

From Graph.. Node, CELL_LINE                                Number of records: 1458
From Graph.. Node, DRUG                                     Number of records: 223
From Graph.. Node, GDSC                                     Number of records: 266
From Graph.. Node, GENE                                     Number of records: 719


Dataframe... Edge, GDSC Drug                                Number of records: 532
Dataframe... Edge, GDSC Cell Line                           Number of records: 417468
Dataframe... Edge, Cell Line Gene Expression                Number of records: 782034
Dataframe... Edge, Cell Line Gene Methylation               Number of recor

In [None]:

#  Output the schema from our graph
#
#  Runs the graph.schema() procedure and prints whatever it returns

l_schema_query = "CALL graph.schema() RETURN *"
l_result       = my_graph.query_unpaginated(l_schema_query)
print(l_result)

print()
print("--")
