In [None]:

#  Notebook with a number of diagnostic/test routines ..
#



#  Setup: Display options ..

In [1]:

%xmode Minimal

#  Setting display options 

import pandas as pd
import numpy  as np
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate


print("--")


Exception reporting mode: Minimal


--


#  Setup: Connectivity ..

In [2]:

#  Create the client object that every later cell uses (my_client), then
#  print the client and the server version it reports.
#
from katana import remote

my_client = remote.Client()

print(my_client)
print(my_client.server_version)


<katana.remote.sync_wrappers.Client object at 0x7f6ea0269160>
0.9.0+20230315T145956Z.dbe1b6788.dev


In [3]:

#  Names of the database and graph this notebook reads from ..
#
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"
   #
#  NOTE(review): NUM_PARTITIONS is not referenced by the cells visible
#  here; presumably used where a graph is created or imported -- confirm.
#
NUM_PARTITIONS  = 3

print("--")


--


In [4]:

#  CONNECT TO GRAPH
#
#  Looks up graphs named GRAPH_NAME inside database DB_NAME and keeps the
#  first one returned.
#
#  An empty lookup is reported with an explicit message naming the graph
#  and database, rather than a bare unpacking error. Taking the first item
#  with next() also avoids rebinding "_", which IPython uses to hold the
#  previous cell's output.
#

my_graph = next(iter(my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)), None)
   #
if (my_graph is None):
   raise ValueError("No graph named '%s' was found in database '%s'" % (GRAPH_NAME, DB_NAME))

print(my_graph)


<_Graph my_graph, 974BNYMBeYB9K781hP55Qqpn2UXUpm6MN1EeV3Q2w99Y, 1>


In [5]:

#  The server is shared, so walk every database the client can see and
#  list the graphs held by each one (name, version, id) ..
#

for l_db in my_client.databases():
   print("Database Name: %-28s   Id: %s" % (l_db.name, l_db.database_id))
      #
   #  The database is fetched again by name, and its graphs are listed
   #  indented beneath the database line.
   l_graphs = my_client.get_database(name=l_db.name).graphs_in_database()
      #
   for l_graph in l_graphs:
      print("   Graph Name: %-28s      Version: %-8d   Id: %s" % (l_graph.name, l_graph.version, l_graph.graph_id))


print("")
print("--")

#  Sample output,
#
#     Database Name: my_db                          Id: 3YyFKVuT2gAKxY3G4Va8Ya6iS2PGqFzrgFnBrDfLQVnr
#        Graph Name: my_graph_BC                       Version: 1          Id: 26UMAq28pADQpiEhUUVKdFLQoA35skqkxqNXATdmmKHA
#        Graph Name: my_graph_BB                       Version: 10         Id: 8uMk1KV4nYYYttCpuFCPwJmYqqcghxFrg7aZUBT1RBQ5
#        Graph Name: my_graph                          Version: 3          Id: EVW1P2kCKvso8wrkChZF1jr25YumM78N3X97EFW693n9
#     Database Name: default                        Id: GXSdMSrsNymmP9Rr1C8Qir9G81kHSD5HVChvkr8DrTEN



Database Name: my_db                          Id: 3YyFKVuT2gAKxY3G4Va8Ya6iS2PGqFzrgFnBrDfLQVnr
   Graph Name: my_graph_BC                       Version: 1          Id: 26UMAq28pADQpiEhUUVKdFLQoA35skqkxqNXATdmmKHA
   Graph Name: my_graph_BB                       Version: 10         Id: 8uMk1KV4nYYYttCpuFCPwJmYqqcghxFrg7aZUBT1RBQ5
   Graph Name: my_graph                          Version: 3          Id: EVW1P2kCKvso8wrkChZF1jr25YumM78N3X97EFW693n9
Database Name: default                        Id: GXSdMSrsNymmP9Rr1C8Qir9G81kHSD5HVChvkr8DrTEN

--


#  Step 00: Checking our results ..

In [41]:

#  Check counts
#
#  Runs two aggregate traversals against my_graph: nodes counted per
#  label, then directed edges counted per type. Each result is walked
#  with .itertuples() and printed one line per label/type.
#
#  The query text is passed as a plain string. Calling .format() with no
#  arguments changed nothing, and would raise as soon as the Cypher text
#  contained a literal brace (for example a property map).
#

l_query  = """

   MATCH (n) 
   WITH DISTINCT LABELS(n) AS labels, COUNT(n) AS cnt
   UNWIND labels AS label
   RETURN label, SUM(cnt) AS cnt
   ORDER BY label
   
   """
      #
l_result1 = my_graph.query_unpaginated(l_query)

l_query  = """

   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS type, COUNT(r) AS cnt
   RETURN type, cnt
   ORDER BY type
   
   """
      #
l_result2 = my_graph.query_unpaginated(l_query)


print()

#  Node counts, one line per label
for l_each in l_result1.itertuples():
   print("Node, %-38s   Number of records in graph: %-8d" % (l_each.label, l_each.cnt) )

print()

#  Edge counts, one line per edge type
for l_each in l_result2.itertuples():
   print("Edge, %-38s   Number of records in graph: %-8d" % (l_each.type, l_each.cnt) )   

print()


print("--")


#  Sample output,    (without bi-directional edges)
#
#     Node, CELL_LINE                                Number of records in graph: 1457    
#     Node, DRUG                                     Number of records in graph: 223     
#     Node, GDSC                                     Number of records in graph: 266     
#     Node, GENE                                     Number of records in graph: 719     
#     
#     Edge, FOR_DRUG                                 Number of records in graph: 238     
#     Edge, HAS_CELL_LINE                            Number of records in graph: 255360  
#     Edge, HAS_EXPRESSION_OBSERVATION               Number of records in graph: 391017  
#     Edge, HAS_METHYLATION_OBSERVATION              Number of records in graph: 426500  
#     Edge, HAS_MUTATION_OBSERVATION                 Number of records in graph: 12422852

#  Sample output,    (with bi-directional edges)
#
#     Node, CELL_LINE                                Number of records in graph: 1457    
#     Node, DRUG                                     Number of records in graph: 223     
#     Node, GDSC                                     Number of records in graph: 266     
#     Node, GENE                                     Number of records in graph: 719     
#     
#     Edge, FOR_DRUG                                 Number of records in graph: 476     
#     Edge, HAS_CELL_LINE                            Number of records in graph: 510720  
#     Edge, HAS_EXPRESSION_OBSERVATION               Number of records in graph: 782034  
#     Edge, HAS_METHYLATION_OBSERVATION              Number of records in graph: 853000  
#     Edge, HAS_MUTATION_OBSERVATION                 Number of records in graph: 24845704



          0/? [?op/s]

          0/? [?op/s]


Node, CELL_LINE                                Number of records in graph: 1457    
Node, DRUG                                     Number of records in graph: 223     
Node, GDSC                                     Number of records in graph: 266     
Node, GENE                                     Number of records in graph: 719     

Edge, FOR_DRUG                                 Number of records in graph: 476     
Edge, HAS_CELL_LINE                            Number of records in graph: 510720  
Edge, HAS_EXPRESSION_OBSERVATION               Number of records in graph: 782034  
Edge, HAS_METHYLATION_OBSERVATION              Number of records in graph: 853000  
Edge, HAS_MUTATION_OBSERVATION                 Number of records in graph: 24845704

--


In [7]:

#  Print the schema that the graph reports about itself
#
#  The result is printed as-is; in the captured output it has the columns
#  neighbor, nodeType and properties.
#

l_result = my_graph.query_unpaginated(
   "CALL graph.schema() RETURN *"
   )
      #
print(l_result)

#  Blank line, then the end-of-cell marker
print("\n--")

#  Sample output
#
#     0                                           [CELL_LINE]         id,label,tcga_code
#     1   ([GENE]::[HAS_EXPRESSION_OBSERVATION])  [CELL_LINE]           TYPE,observation
#     2  ([GENE]::[HAS_METHYLATION_OBSERVATION])  [CELL_LINE]           TYPE,observation
#     3     ([GENE]::[HAS_MUTATION_OBSERVATION])  [CELL_LINE]  TYPE,mutation,observation
#     4                                                [DRUG]            id,label,smiles
#     5                                                [GDSC]                   id,label
#     6                     ([DRUG]::[FOR_DRUG])       [GDSC]                       TYPE
#     7           ([CELL_LINE]::[HAS_CELL_LINE])       [GDSC]                 TYPE,label
#     8                                                [GENE]                         id



          0/? [?op/s]

                                  neighbor     nodeType                 properties
0                                           [CELL_LINE]         id,label,tcga_code
1   ([GENE]::[HAS_EXPRESSION_OBSERVATION])  [CELL_LINE]           TYPE,observation
2  ([GENE]::[HAS_METHYLATION_OBSERVATION])  [CELL_LINE]           TYPE,observation
3     ([GENE]::[HAS_MUTATION_OBSERVATION])  [CELL_LINE]  TYPE,mutation,observation
4                                                [DRUG]            id,label,smiles
5                                                [GDSC]                   id,label
6                     ([DRUG]::[FOR_DRUG])       [GDSC]                       TYPE
7           ([CELL_LINE]::[HAS_CELL_LINE])       [GDSC]                 TYPE,label
8                                                [GENE]                         id

--


In [11]:

#  List all node types with node counts
#
#  Same two aggregate traversals as the "Check counts" cell, rendered as
#  psql-style tables with tabulate instead of one formatted line per row.
#
#  The query text is passed as a plain string. Calling .format() with no
#  arguments changed nothing, and would raise as soon as the Cypher text
#  contained a literal brace (for example a property map).

l_query  = """

   MATCH (n) 
   WITH DISTINCT LABELS(n) AS labels, COUNT(n) AS cnt
   UNWIND labels AS label
   RETURN label, SUM(cnt) AS cnt
   ORDER BY label
   
   """

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))


print()


#  List all edge types with counts

l_query  = """

   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS type, COUNT(r) AS cnt
   RETURN type, cnt
   ORDER BY type
   
   """

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))


#  Sample output
#
#     +-----------+-------+
#     | label     |   cnt |
#     |-----------+-------|
#     | CELL_LINE |  1457 |
#     | DRUG      |   223 |
#     | GDSC      |   266 |
#     | GENE      |   719 |
#     +-----------+-------+
#     
#     +-----------------------------+----------+
#     | type                        |      cnt |
#     |-----------------------------+----------|
#     | FOR_DRUG                    |      238 |
#     | HAS_CELL_LINE               |   255360 |
#     | HAS_EXPRESSION_OBSERVATION  |   391017 |
#     | HAS_METHYLATION_OBSERVATION |   426500 |
#     | HAS_MUTATION_OBSERVATION    | 12422852 |
#     +-----------------------------+----------+



          0/? [?op/s]

+-----------+-------+
| label     |   cnt |
|-----------+-------|
| CELL_LINE |  1457 |
| DRUG      |   223 |
| GDSC      |   266 |
| GENE      |   719 |
+-----------+-------+



          0/? [?op/s]

+-----------------------------+----------+
| type                        |      cnt |
|-----------------------------+----------|
| FOR_DRUG                    |      238 |
| HAS_CELL_LINE               |   255360 |
| HAS_EXPRESSION_OBSERVATION  |   391017 |
| HAS_METHYLATION_OBSERVATION |   426500 |
| HAS_MUTATION_OBSERVATION    | 12422852 |
+-----------------------------+----------+


In [45]:

#  Return properties/keys from all vertices
#
#  Groups nodes by (labels, property key list) and counts each group, so
#  a label whose nodes do not all carry the same properties shows up on
#  more than one row. Neither query has an ORDER BY, so the row order is
#  whatever the server returns.
#
#  The query text is passed as a plain string. Calling .format() with no
#  arguments changed nothing, and would raise as soon as the Cypher text
#  contained a literal brace (for example a property map).

l_query  = """
   MATCH (n)
   RETURN DISTINCT LABELS(n) AS label, KEYS(n) AS my_keys, COUNT(n) AS cnt
   """

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))


#  Return properties/keys from all edges
#
#  Same grouping for directed edges: (edge type, property key list).
#  

l_query  = """
   MATCH (n) - [r] -> (m)
   RETURN DISTINCT TYPE(r) AS my_type, KEYS(r) AS my_keys, COUNT(r) AS cnt
   """

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))



          0/? [?op/s]

+---------------+--------------------------------------------------------------------------------------------------+-------+
| label         | my_keys                                                                                          |   cnt |
|---------------+--------------------------------------------------------------------------------------------------+-------|
| ['CELL_LINE'] | ['genomics_expression', 'genomics_methylation', 'genomics_mutation', 'id', 'label', 'tcga_code'] |   344 |
| ['DRUG']      | ['id', 'label', 'smiles']                                                                        |   223 |
| ['GDSC']      | ['id', 'label']                                                                                  |   266 |
| ['CELL_LINE'] | ['genomics_expression', 'genomics_methylation', 'genomics_mutation', 'id', 'label']              |    14 |
| ['GENE']      | ['id']                                                                                           |   719 |


          0/? [?op/s]

+-----------------------------+-------------------------------------+----------+
| my_type                     | my_keys                             |      cnt |
|-----------------------------+-------------------------------------+----------|
| FOR_DRUG                    | ['TYPE']                            |      476 |
| HAS_CELL_LINE               | ['TYPE']                            |    37400 |
| HAS_EXPRESSION_OBSERVATION  | ['TYPE', 'observation']             |   499052 |
| HAS_METHYLATION_OBSERVATION | ['TYPE', 'observation']             |   545560 |
| HAS_MUTATION_OBSERVATION    | ['TYPE', 'mutation', 'observation'] | 24776358 |
| HAS_CELL_LINE               | ['TYPE', 'label']                   |   153056 |
+-----------------------------+-------------------------------------+----------+


#  Step 00:  Using DuckDB

In [23]:

#  Using DuckDB.  See,
#     https://duckdb.org/docs/api/python/overview.html
#

#  Run SQL over a Pandas dataframe; the SQL text refers to the dataframe
#  by the name of its Python variable (df_airports) ..
#

l_columns = ["airport_code", "airport_name", "LABEL"]
   #
l_rows    = [
   ["MKE", "Milwaukee"               , "Airport"],
   ["ORD", "Chicago O-Hare"          , "Airport"],
   ["SJC", "San Jose"                , "Airport"],
   ["DEN", "Denver"                  , "Airport"],
   ]

df_airports = pd.DataFrame(l_rows, columns = l_columns)
   #
print(tabulate(df_airports, headers = "keys", tablefmt = "psql", showindex = False))


   ###

import duckdb

#  First three airports by code, with the LABEL column upper-cased
#
l_sql    = "SELECT airport_code, airport_name, UPPER(LABEL) FROM df_airports ORDER BY airport_code LIMIT 3"
   #
l_return = duckdb.sql(l_sql)
print(l_return)


print("--")


+----------------+----------------+---------+
| airport_code   | airport_name   | LABEL   |
|----------------+----------------+---------|
| MKE            | Milwaukee      | Airport |
| ORD            | Chicago O-Hare | Airport |
| SJC            | San Jose       | Airport |
| DEN            | Denver         | Airport |
+----------------+----------------+---------+
┌──────────────┬────────────────┬────────────────┐
│ airport_code │  airport_name  │ upper("LABEL") │
│   varchar    │    varchar     │    varchar     │
├──────────────┼────────────────┼────────────────┤
│ DEN          │ Denver         │ AIRPORT        │
│ MKE          │ Milwaukee      │ AIRPORT        │
│ ORD          │ Chicago O-Hare │ AIRPORT        │
└──────────────┴────────────────┴────────────────┘

--


In [40]:

#  Using DuckDB on a KGIP traversal result set ..
#
#  Fetches up to 10 DRUG nodes (id, smiles, label), then has DuckDB query
#  that result by its Python variable name (l_result), adding the length
#  of the smiles string and sorting by id, highest first. The DuckDB
#  relation is the last expression in the cell, so the notebook displays
#  it.
#
#  The query text is passed as a plain string. Calling .format() with no
#  arguments changed nothing, and would raise as soon as the Cypher text
#  contained a literal brace (for example a property map).
#

l_query  = """
   MATCH (n: DRUG)
   RETURN id(n) AS id, n.smiles AS smiles, n.label AS label
   LIMIT 10
   """
      #
l_result = my_graph.query_unpaginated(l_query)

duckdb.sql("SELECT *, LENGTH(smiles) AS my_len FROM l_result ORDER BY id DESC")
           
           
#  Sample output,
#
#     ┌─────────────────┬────────────────────────────────────────────────────────────────┬─────────┬────────┐
#     │       id        │                             smiles                             │  label  │ my_len │
#     │      int64      │                            varchar                             │ varchar │ int64  │
#     ├─────────────────┼────────────────────────────────────────────────────────────────┼─────────┼────────┤
#     │ 562949953421314 │ Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S(N)(=O)=O        │ DRUG    │     55 │
#     │ 562949953421313 │ N#CC(c1ccnc(NCCc2cccnc2)n1)c1nc2ccccc2s1                       │ DRUG    │     40 │
#     │ 562949953421312 │ CCNC(=O)C1=C(c2ccc(CN3CCOCC3)cc2)C(=C2C=C(C(C)C)C(O)=CC2=O)ON1 │ DRUG    │     62 │
#     │ 281474976710658 │ O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1I)c1cccs1           │ DRUG    │     52 │
#     │ 281474976710657 │ Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nc(-c2cccnc2)cs1     │ DRUG    │     58 │
#     │ 281474976710656 │ NCC(=O)Nc1ccc(-n2nc(C(F)(F)F)cc2-c2ccc3c(ccc4ccccc43)c2)cc1    │ DRUG    │     59 │
#     │               3 │ O=C(NC1CCNCC1)c1[nH]ncc1NC(=O)c1c(Cl)cccc1Cl                   │ DRUG    │     44 │
#     │               2 │ Cc1cc(N2CCOCC2)cc2c1NC(=C1C(=O)N=CC=C1NCC(O)c1cccc(Cl)c1)N2    │ DRUG    │     59 │
#     │               1 │ COc1ccc(-c2cc3nccn3c(Nc3ncccc3C(N)=O)n2)cc1OC                  │ DRUG    │     45 │
#     │               0 │ Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4c[nH]c5ccccc45)c3)cc12          │ DRUG    │     53 │
#     ├─────────────────┴────────────────────────────────────────────────────────────────┴─────────┴────────┤
#     │ 10 rows                                                                                   4 columns │
#     └─────────────────────────────────────────────────────────────────────────────────────────────────────┘



          0/? [?op/s]

┌─────────────────┬────────────────────────────────────────────────────────────────┬─────────┬────────┐
│       id        │                             smiles                             │  label  │ my_len │
│      int64      │                            varchar                             │ varchar │ int64  │
├─────────────────┼────────────────────────────────────────────────────────────────┼─────────┼────────┤
│ 562949953421314 │ Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S(N)(=O)=O        │ DRUG    │     55 │
│ 562949953421313 │ N#CC(c1ccnc(NCCc2cccnc2)n1)c1nc2ccccc2s1                       │ DRUG    │     40 │
│ 562949953421312 │ CCNC(=O)C1=C(c2ccc(CN3CCOCC3)cc2)C(=C2C=C(C(C)C)C(O)=CC2=O)ON1 │ DRUG    │     62 │
│ 281474976710658 │ O=C(NCCCNc1nc(Nc2cccc(NC(=O)N3CCCC3)c2)ncc1I)c1cccs1           │ DRUG    │     52 │
│ 281474976710657 │ Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nc(-c2cccnc2)cs1     │ DRUG    │     58 │
│ 281474976710656 │ NCC(=O)Nc1ccc(-n2nc(C(F)(F)F)cc2-c2ccc3c(ccc