#  Setup: Display options

In [None]:

#  Setting display options

import pandas as pd

#  Let Pandas use up to 480 characters per line before wrapping its output
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
#  The style rule stops <pre> output from wrapping, so wide tables scroll
#  sideways instead of folding. An empty HTML payload would render nothing.
#
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

from tabulate import tabulate

print("--")


# Setup: Connectivity

In [None]:

#  Open a client connection; the resulting my_client object is what the
#  remaining cells use to reach databases and graphs
from katana import remote

#  Client() is called with no arguments -- connection details presumably come
#  from the environment (TODO confirm)
my_client = remote.Client()

print(my_client)


In [None]:

#  Names of the database and of the graph that this notebook works with
DB_NAME, GRAPH_NAME = "my_db", "my_graph"

#  Number of partitions requested when the graph is created
NUM_PARTITIONS = 3

print("--")


In [None]:

#  DELETE ALL GRAPHS
#
#  Walk every database and delete each graph found inside it, then list
#  whatever graphs the client still reports.

for l_database in my_client.databases():
   #  Look the database handle up once, and reuse it for every graph in it
   l_db_handle = my_client.get_database(name=l_database.name)
   for l_graph in l_db_handle.graphs_in_database():
      l_handle = l_db_handle.get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

#  Expected to print nothing when every delete above succeeded
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES
#
#  Every database except the one named "default" is deleted; a "--" line is
#  printed for each deletion. The databases that remain are listed afterwards.

for l_database in my_client.databases():
   if l_database.name == "default":
      continue
   l_doomed = my_client.get_database(name=l_database.name)
   l_doomed.delete_database()
   print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE
#
#  Create a database named DB_NAME, and print the id of the returned handle

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPH
#
#  Create the graph GRAPH_NAME inside database DB_NAME, requesting
#  NUM_PARTITIONS partitions

l_target_db = my_client.get_database(name=DB_NAME)
my_graph    = l_target_db.create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


In [None]:

#  CONNECT TO GRAPH
#
#  Keep the first graph returned for GRAPH_NAME and ignore any others.
#  The unpacking below raises ValueError when no graph is returned.

l_matches    = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)
my_graph, *_ = l_matches

print(my_graph)


# Step 01:  Read CSVs (small) ..

In [None]:

#  Read the CSVs as Step 01 towards populating our graph
#
#  Every file is loaded into its own Pandas DataFrame; the total row counts
#  are kept in l_nodecnt and l_edgecnt.
#

l_folder   = "./10_Data/"


def _read_pipe_delimited(p_filename):
   #  All input files live in l_folder and are pipe-delimited; Pandas infers
   #  the column names from the file contents (header = "infer")
   return pd.read_csv(l_folder + p_filename, header = "infer", delimiter = "|")


#  Nodes
#
df_PatientVisitNode                       = _read_pipe_delimited("N_01_00_PatientVisit.txt")
df_MedicalSpecialtyNode                   = _read_pipe_delimited("N_02_00_MedicalSpecialty.txt")
df_KeywordNode                            = _read_pipe_delimited("N_03_00_Keyword.txt")
df_UmlsEntityNode                         = _read_pipe_delimited("N_04_00_UmlsEntity.txt")
df_UmlsVocabularyNode                     = _read_pipe_delimited("N_05_00_UmlsVocabulary.txt")

l_nodecnt = sum(len(l_frame) for l_frame in (
   df_PatientVisitNode,
   df_MedicalSpecialtyNode,
   df_KeywordNode,
   df_UmlsEntityNode,
   df_UmlsVocabularyNode,
   ))


#  Edges
#
#  Each relationship comes as a pair of files, "_N_" and "_S_".
#
#  NOTE(review): the E_04 files are named "PatientVistToVocabulary" yet are
#  loaded as Entity-to-Vocabulary frames -- confirm the file names.
#
df_PatientVisitToMedicalSpecialtyEdge_N   = _read_pipe_delimited("E_01_00_N_PatientVistToMedicalSpecialty.txt")
df_PatientVisitToMedicalSpecialtyEdge_S   = _read_pipe_delimited("E_01_00_S_PatientVistToMedicalSpecialty.txt")
df_PatientVisitToKeywordEdge_N            = _read_pipe_delimited("E_02_00_N_PatientVistToKeyword.txt")
df_PatientVisitToKeywordEdge_S            = _read_pipe_delimited("E_02_00_S_PatientVistToKeyword.txt")
df_PatientVisitToEntityEdge_N             = _read_pipe_delimited("E_03_00_N_PatientVistToEntity.txt")
df_PatientVisitToEntityEdge_S             = _read_pipe_delimited("E_03_00_S_PatientVistToEntity.txt")
df_EntityToVocabularyEdge_N               = _read_pipe_delimited("E_04_00_N_PatientVistToVocabulary.txt")
df_EntityToVocabularyEdge_S               = _read_pipe_delimited("E_04_00_S_PatientVistToVocabulary.txt")

l_edgecnt = sum(len(l_frame) for l_frame in (
   df_PatientVisitToMedicalSpecialtyEdge_N,
   df_PatientVisitToMedicalSpecialtyEdge_S,
   df_PatientVisitToKeywordEdge_N,
   df_PatientVisitToKeywordEdge_S,
   df_PatientVisitToEntityEdge_N,
   df_PatientVisitToEntityEdge_S,
   df_EntityToVocabularyEdge_N,
   df_EntityToVocabularyEdge_S,
   ))


print("--")


#  Step 02: Create the Graph (small) ..

In [None]:

#  At this point we have several Pandas DataFrames; import them into the graph-
#
#     Why use Pandas DataFrames versus Dask ?
#
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


#  Just nodes
#
#  Each node frame is registered with its "id" column as the node id, and
#  with one name used as both the id space and the label. insert() is called
#  once, after all five frames are registered.
#
with import_data.DataFrameImporter(my_graph) as df_importer:
   for l_frame, l_label in (
      (df_PatientVisitNode,     "PatientVisit"    ),
      (df_MedicalSpecialtyNode, "MedicalSpecialty"),
      (df_KeywordNode,          "Keyword"         ),
      (df_UmlsEntityNode,       "UmlsEntity"      ),
      (df_UmlsVocabularyNode,   "UmlsVocabulary"  ),
      ):
      df_importer.nodes_dataframe(
         l_frame,
         id_column             = "id",
         id_space              = l_label,
         label                 = l_label,
         )
   df_importer.insert()
    

#  Just edges
#
#  Each relationship is registered from two frames: the _N frame with the
#  id spaces in one direction, the _S frame with the two id spaces swapped.
#  All frames use "start_id" / "end_id" as their source / destination columns.
#
with import_data.DataFrameImporter(my_graph) as df_importer:
   #  ( frame, source id space, destination id space, edge type )
   for l_frame, l_source_space, l_destination_space, l_edge_type in (
      (df_PatientVisitToEntityEdge_N,           "PatientVisit",     "UmlsEntity",       "VISIT_CONTAINS" ),
      (df_PatientVisitToEntityEdge_S,           "UmlsEntity",       "PatientVisit",     "VISIT_CONTAINS" ),
      (df_EntityToVocabularyEdge_N,             "UmlsEntity",       "UmlsVocabulary",   "ALSO_CODED_AS"  ),
      (df_EntityToVocabularyEdge_S,             "UmlsVocabulary",   "UmlsEntity",       "ALSO_CODED_AS"  ),
      (df_PatientVisitToMedicalSpecialtyEdge_N, "PatientVisit",     "MedicalSpecialty", "IS_OF_SPECIALTY"),
      (df_PatientVisitToMedicalSpecialtyEdge_S, "MedicalSpecialty", "PatientVisit",     "IS_OF_SPECIALTY"),
      (df_PatientVisitToKeywordEdge_N,          "PatientVisit",     "Keyword",          "IS_OF_KEYWORD"  ),
      (df_PatientVisitToKeywordEdge_S,          "Keyword",          "PatientVisit",     "IS_OF_KEYWORD"  ),
      ):
      df_importer.edges_dataframe(
         l_frame,
         source_id_space       = l_source_space,
         destination_id_space  = l_destination_space,
         source_column         = "start_id",
         destination_column    = "end_id",
         type                  = l_edge_type,
         )

   #  Name the node id property before the single insert() call
   df_importer.node_id_property_name("id")
   df_importer.insert()


      ###


#  Compare the row counts read from file against what the graph now reports
l_summary = "Number of nodes in %s: %d   Numbers of edges in %s: %d"

print("")
print("")
print(l_summary % ( "frame", l_nodecnt,            "frame", l_edgecnt            ))
print(l_summary % ( "graph", my_graph.num_nodes(), "graph", my_graph.num_edges() ))
print("")
print("--")


#  Sample output,
#
#     Number of nodes in frame: 75703   Numbers of edges in frame: 1145176
#     
#     Number of nodes in graph: 75703   Numbers of edges in graph: 1145176


#  Step 00:  Checking Results

In [None]:

#  List all node types with node counts
#
#  LABELS(n) is a list, so it is unwound and the counts are summed per
#  individual label.

l_query  = """
   MATCH (n) 
   WITH DISTINCT LABELS(n) AS temp, COUNT(n) AS tempCnt
   UNWIND temp AS label
   RETURN label, SUM(tempCnt) AS cnt
   ORDER BY label
   """

l_result = my_graph.query_unpaginated(l_query)
l_table  = tabulate(l_result, headers='keys', tablefmt='psql')
print(l_table)

#  Sample output,
#
#     +----+------------------+-------+
#     |    | label            |   cnt |
#     |----+------------------+-------|
#     |  0 | Keyword          | 10443 |
#     |  1 | MedicalSpecialty |    40 |
#     |  2 | PatientVisit     |  4999 |
#     |  3 | UmlsEntity       | 20073 |
#     |  4 | UmlsVocabulary   | 40148 |
#     +----+------------------+-------+


In [None]:

#  List all edge types with counts
#
#  One output row per edge (relationship) type, ordered by type name.

l_query  = """
   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS temp, COUNT(r) AS tempCnt
   RETURN temp, tempCnt
   ORDER BY temp
   """

l_result = my_graph.query_unpaginated(l_query)
l_table  = tabulate(l_result, headers='keys', tablefmt='psql')
print(l_table)

#  Sample output,
#
#  +----+-----------------+-----------+
#  |    | temp            |   tempCnt |
#  |----+-----------------+-----------|
#  |  0 | ALSO_CODED_AS   |    103750 |
#  |  1 | IS_OF_KEYWORD   |    105676 |
#  |  2 | IS_OF_SPECIALTY |      9998 |
#  |  3 | VISIT_CONTAINS  |    925752 |
#  +----+-----------------+-----------+


In [None]:

#  Produce a visual graph
#
#  Fetch at most 1000 (node)-[edge]->(node) triples and open them in the
#  visualization widget.

from katana_visualization_widget import GraphVisOptions, NodeVisOption, EdgeVisOption, ANY

#  Node label  ->  value passed as the label= argument of its NodeVisOption
l_captions = {
   "PatientVisit"     : "sample_name",
   "UmlsEntity"       : "id",
   "UmlsVocabulary"   : "id",
   "Keyword"          : "keyword",
   "MedicalSpecialty" : "medical_specialty",
   }

l_options = GraphVisOptions(
   node_options = [ NodeVisOption(l_label, label=l_property) for l_label, l_property in l_captions.items() ],
   #  edge_options = [
   #     EdgeVisOption(["VISIT_CONTAINS"], label="start_id"),
   #     EdgeVisOption(["ALSO_CODED_AS" ], label="start_id"),
   #  ]
   )


l_cypher = """

   MATCH (n) - [r] -> (m)
   RETURN n, r, m
   LIMIT 1000
   
   """

l_result = my_graph.query_unpaginated(l_cypher, contextualize=True)

#  Kept as the final expression of the cell, so its value is what the notebook shows
l_result.view(graph_vis_options = l_options)


<div> 
<img src="./01_Images/Results_01.png" alt="Drawing" style="width: 800px;"/>
</div>

In [None]:

#  Produce a visual graph
#
#  Show the neighbourhood of one patient visit ('PV-4001'): its UMLS
#  entities, its medical specialty, and its keywords.
#
#  NOTE(review): the pattern also binds aca and uv (ALSO_CODED_AS edges and
#  UmlsVocabulary nodes), but the RETURN clause leaves them out -- confirm
#  that this is intentional.

from katana_visualization_widget import GraphVisOptions, NodeVisOption, EdgeVisOption, ANY

#  Node label  ->  value passed as the label= argument of its NodeVisOption
l_captions = {
   "PatientVisit"     : "sample_name",
   "UmlsEntity"       : "id",
   "UmlsVocabulary"   : "id",
   "Keyword"          : "keyword",
   "MedicalSpecialty" : "medical_specialty",
   }

l_options = GraphVisOptions(
   node_options = [ NodeVisOption(l_label, label=l_property) for l_label, l_property in l_captions.items() ],
   #  edge_options = [
   #     EdgeVisOption(["VISIT_CONTAINS"], label="start_id"),
   #     EdgeVisOption(["ALSO_CODED_AS" ], label="start_id"),
   #  ]
   )


l_cypher = """

   MATCH (pv: PatientVisit) - [vc: VISIT_CONTAINS] -> (ue: UmlsEntity) - [aca: ALSO_CODED_AS] -> (uv: UmlsVocabulary),
      (ms: MedicalSpecialty) <- [ios: IS_OF_SPECIALTY] - (pv: PatientVisit) - [iok: IS_OF_KEYWORD] -> (k: Keyword)
   WHERE
      (pv.id = 'PV-4001')
   RETURN pv, vc, ue, ms, ios, iok, k
   
   """

l_result = my_graph.query_unpaginated(l_cypher, contextualize=True)

#  Kept as the final expression of the cell, so its value is what the notebook shows
l_result.view(graph_vis_options = l_options)


<div> 
<img src="./01_Images/Results_02.png" alt="Drawing" style="width: 800px;"/>
</div>

In [None]:
# ################################################################

In [None]:
# ################################################################

# Step 00:  Read CSVs (larger) ..

In [None]:

#  More UmlsEntity data ..
#
#  This cell takes 1-2 minutes to run ..
#
#  Reads every file matching ./10_Data/N_14_*.txt and stacks them into the
#  single DataFrame df_nodes, with a fresh 0..n-1 index.
#

import glob
   #
import pandas as pd
import numpy as np


#  Force these columns to the generic object dtype, presumably so ids stay
#  as text instead of being type-inferred (TODO confirm)
l_dtypes = {
   "id"              : np.dtype("O"),
   "entity_id"       : np.dtype("O"),
   "LABEL"           : np.dtype("O"),
   "preferred_term"  : np.dtype("O"),
   }

#  glob.glob() returns its matches in arbitrary order; sort them so that the
#  row order of df_nodes is the same on every run
l_files = sorted(glob.glob("./10_Data/N_14_*.txt"))

if not l_files:
   #  Without this check, pd.concat() fails with "No objects to concatenate"
   raise FileNotFoundError("No input files match ./10_Data/N_14_*.txt")

l_df = [ pd.read_csv(l_file, delimiter = "|", dtype = l_dtypes) for l_file in l_files ]
      #
df_nodes = pd.concat(l_df, ignore_index = True)


#  Report what was read: the row count, then the first five rows
l_rowcnt = len(df_nodes)

print("Number of rows from file: %d" % l_rowcnt)
print("")
print(df_nodes.head(5))
print("")
print("Importing into graph ..")
print("")


#  Register the frame under the "UmlsEntity" id space and label, keyed on its "id" column
with import_data.DataFrameImporter(my_graph) as df_importer:
   df_importer.nodes_dataframe(
      df_nodes,
      id_column             = "id",
      id_space              = "UmlsEntity",
      label                 = "UmlsEntity",
      )
   df_importer.insert()

print("--")


#  Sample output,
#
#     Number of rows from file: 4661936
#     
#                   id      entity_id       LABEL    preferred_term
#     0  UMLS/C3753082  UMLS/C3753082  UmlsEntity    bacterium WB-1
#     1  UMLS/C3753083  UMLS/C3753083  UmlsEntity   bacterium WB-54
#     2  UMLS/C3753084  UMLS/C3753084  UmlsEntity  bacterium WFR-69
#     3  UMLS/C3753085  UMLS/C3753085  UmlsEntity  bacterium WFR-71
#     4  UMLS/C3753086  UMLS/C3753086  UmlsEntity  bacterium WGT-31
#     
#     Importing into graph ..
#     
#     --


In [None]:

#  List all node types with node counts
#
#  LABELS(n) is a list, so it is unwound and the counts are summed per
#  individual label.

l_query  = """
   MATCH (n) 
   WITH DISTINCT LABELS(n) AS temp, COUNT(n) AS tempCnt
   UNWIND temp AS label
   RETURN label, SUM(tempCnt) AS cnt
   ORDER BY label
   """

l_result = my_graph.query_unpaginated(l_query)
l_table  = tabulate(l_result, headers='keys', tablefmt='psql')
print(l_table)


#  Sample output, before larger load,
#
#     +----+------------------+-------+
#     |    | label            |   cnt |
#     |----+------------------+-------|
#     |  0 | Keyword          | 10443 |
#     |  1 | MedicalSpecialty |    40 |
#     |  2 | PatientVisit     |  4999 |
#     |  3 | UmlsEntity       | 20073 |
#     |  4 | UmlsVocabulary   | 40148 |
#     +----+------------------+-------+

#  Sample output, after the larger load,
#
#     +----+------------------+---------+
#     |    | label            |     cnt |
#     |----+------------------+---------|
#     |  0 | Keyword          |   10443 |
#     |  1 | MedicalSpecialty |      40 |
#     |  2 | PatientVisit     |    4999 |
#     |  3 | UmlsEntity       | 4661972 |
#     |  4 | UmlsVocabulary   |   40148 |
#     +----+------------------+---------+

