#  Setup: Display options

In [None]:

#  Notebook-wide display configuration: imports first, then the options themselves.

import pandas as pd
from IPython.display import HTML, display
from tabulate import tabulate

#  Allow pandas to use up to 480 characters per output line.
pd.set_option("display.width", 480)

#  Intended to enable horizontal scrolling for wide outputs.
#  NOTE(review): the HTML payload is an empty string, so as written this call renders nothing;
#  confirm whether the styling markup was lost from this cell.
display(HTML(""))

print("--")


# Setup: Connectivity

In [None]:

#  Create the Katana remote client that the following cells use (as my_client).
#  NOTE(review): Client() is called with no arguments, so the connection settings are not
#  visible in this notebook; presumably they come from the library defaults. Confirm.
from katana import remote

my_client = remote.Client()

#  Print the client object as a quick check that it was constructed.
print(my_client)


In [None]:

#  Settings consumed by the database / graph cells that follow.
NUM_PARTITIONS  = 3             #  passed as num_partitions when the graph is created
   #
DB_NAME         = "my_db"       #  name used to create and look up the database
GRAPH_NAME      = "my_graph"    #  name used to create and look up the graph

print("--")


In [None]:

#  DELETE ALL GRAPHS
#
#  Walks every database returned by the client, deletes each graph listed in it, and then
#  prints whatever graphs the client still reports.

for l_database in my_client.databases():
   #  Look the database up once per database; the lookup does not depend on the graph being
   #  deleted, so there is no need to repeat it for every graph.
   l_db_handle = my_client.get_database(name=l_database.name)
   for l_graph in l_db_handle.graphs_in_database():
      l_handle = l_db_handle.get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

#  Verification listing; expected to print nothing once every graph has been deleted.
for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES
#
#  Deletes every database except the one named "default" (printing "--" after each
#  deletion), then lists the databases the client still reports.

for l_database in my_client.databases():
   if l_database.name == "default":
      continue
   my_client.get_database(name=l_database.name).delete_database()
   print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE
#
#  Creates the database named by DB_NAME and prints the database_id of the returned object.

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPH
#
#  Looks the database up by name, creates the graph inside it with the configured number of
#  partitions, and prints the returned graph object.

l_target_database = my_client.get_database(name=DB_NAME)
my_graph = l_target_database.create_graph(
   name           = GRAPH_NAME,
   num_partitions = NUM_PARTITIONS,
   )

print(my_graph)


In [None]:

#  CONNECT TO GRAPH
#
#  Rebinds my_graph to the first graph returned by find_graphs_by_name(); any further matches
#  are collected into _ and ignored. The unpacking raises ValueError when nothing is returned.

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


# Step 01:  Read CSVs ..

In [None]:


#  Step 01 towards populating the graph: load each pipe-delimited source file into its own
#  Pandas DataFrame.
#

l_folder       = "./10_Data/"

#  Every file is read the same way: "|" as the delimiter, header row inferred.
l_read_options = {"header": "infer", "delimiter": "|"}


#  Nodes
#
df_PatientVisitNodes     = pd.read_csv(l_folder + "N_01_00_PatientVisit.txt",     **l_read_options)
df_MedicalSpecialtyNodes = pd.read_csv(l_folder + "N_02_00_MedicalSpecialty.txt", **l_read_options)
df_KeywordNodes          = pd.read_csv(l_folder + "N_03_00_Keyword.txt",          **l_read_options)
df_UmlsEntityNodes       = pd.read_csv(l_folder + "N_04_00_UmlsEntity.txt",       **l_read_options)
df_UmlsVocabularyNodes   = pd.read_csv(l_folder + "N_05_00_UmlsVocabulary.txt",   **l_read_options)


#  Edges: each relationship comes as a pair of files, "_N" and "_S".
#
df_PatientVisitToMedicalSpecialtyEdges_N = pd.read_csv(l_folder + "E_01_00_N_PatientVistToMedicalSpecialty.txt", **l_read_options)
df_PatientVisitToMedicalSpecialtyEdges_S = pd.read_csv(l_folder + "E_01_00_S_PatientVistToMedicalSpecialty.txt", **l_read_options)

df_PatientVisitToKeywordEdges_N          = pd.read_csv(l_folder + "E_02_00_N_PatientVistToKeyword.txt",          **l_read_options)
df_PatientVisitToKeywordEdges_S          = pd.read_csv(l_folder + "E_02_00_S_PatientVistToKeyword.txt",          **l_read_options)

df_PatientVisitToEntityEdge_N            = pd.read_csv(l_folder + "E_03_00_N_PatientVistToEntity.txt",           **l_read_options)
df_PatientVisitToEntityEdge_S            = pd.read_csv(l_folder + "E_03_00_S_PatientVistToEntity.txt",           **l_read_options)

df_PatientVisitToVocabularyEdge_N        = pd.read_csv(l_folder + "E_04_00_N_PatientVistToVocabulary.txt",       **l_read_options)
df_PatientVisitToVocabularyEdge_S        = pd.read_csv(l_folder + "E_04_00_S_PatientVistToVocabulary.txt",       **l_read_options)


print("--")


In [None]:

#  Sample data from above,
#
#     200
#     
#     {
#        "entityMentions": [
#           {
#              "mentionId": "1",
#              "type": "MEDICINE",
#              "text": {
#                 "content": "Insulin regimen",
#                 "beginOffset": 0
#              },
#              "linkedEntities": [
#                 {
#                    "entityId": "UMLS/C0021641"
#                 },
#                 {
#                    "entityId": "UMLS/C0795635"
#                 },
#                 {
#                    "entityId": "UMLS/C1533581"
#                 },
#                 {
#                    "entityId": "UMLS/C3537244"
#                 },
#                 {
#                    "entityId": "UMLS/C3714501"
#                 }
#              ],
#              "temporalAssessment": {
#                 "value": "CURRENT",
#                 "confidence": 0.8573660850524902
#              },
#              "certaintyAssessment": {
#                 "value": "LIKELY",
#                 "confidence": 0.9751282930374146
#              },
#              "subject": {
#                 "value": "PATIENT",
#                 "confidence": 0.9995787739753723
#              },
#              "confidence": 0.6379408836364746
#           },
#           {
#              "mentionId": "2",
#              "type": "MED_DOSE",
#              "text": {
#                 "content": "5 units",
#                 "beginOffset": 22
#              },
#              "confidence": 0.7443782091140747
#           },
#           {
#              "mentionId": "3",
#              "type": "MED_ROUTE",
#              "text": {
#                 "content": "IV",
#                 "beginOffset": 30
#              },
#              "linkedEntities": [
#                 {
#                    "entityId": "UMLS/C0348016"
#                 }
#              ],
#              "confidence": 0.779011607170105
#           }
#        ],
#        "entities": [
#           {
#              "entityId": "UMLS/C0021641",
#              "preferredTerm": "Insulin",
#              "vocabularyCodes": [
#                 "FMA/83365",
#                 "LNC/LA15805-7",
#                 "LNC/LP14676-8",
#                 "LNC/LP16325-0",
#                 "LNC/LP32542-0",
#                 "LNC/LP70329-5",
#                 "LNC/MTHU002108",
#                 "LNC/MTHU019392",
#                 "MSH/D007328",
#                 "MTH/NOCODE"
#              ]
#           },
#           {
#              "entityId": "UMLS/C0348016",
#              "preferredTerm": "Intravenous",
#              "vocabularyCodes": [
#                 "LNC/LA9437-0",
#                 "LNC/LP32453-0",
#                 "MTH/NOCODE",
#                 "NCI/C13346"
#              ]
#           },
#           {
#              "entityId": "UMLS/C0795635",
#              "preferredTerm": "insulin, regular, human",
#              "vocabularyCodes": [
#                 "LNC/LP17001-6",
#                 "MSH/D061386",
#                 "MTH/NOCODE",
#                 "NCI/C29125",
#                 "RXNORM/253182",
#                 "VANDF/4017559",
#                 "VANDF/4017569",
#                 "VANDF/4019786"
#              ]
#           },
#           {
#              "entityId": "UMLS/C1533581",
#              "preferredTerm": "Therapeutic Insulin",
#              "vocabularyCodes": [
#                 "MTH/NOCODE",
#                 "NCI/C581"
#              ]
#           },
#           {
#              "entityId": "UMLS/C3537244",
#              "preferredTerm": "Insulins",
#              "vocabularyCodes": [
#                 "MSH/D061385",
#                 "MTH/NOCODE"
#              ]
#           },
#           {
#              "entityId": "UMLS/C3714501",
#              "preferredTerm": "Insulin Drug Class",
#              "vocabularyCodes": [
#                 "MTH/NOCODE",
#                 "VANDF/4021631"
#              ]
#           }
#        ],
#        "relationships": [
#           {
#              "subjectId": "1",
#              "objectId": "2",
#              "confidence": 0.9996469616889954
#           },
#           {
#              "subjectId": "1",
#              "objectId": "3",
#              "confidence": 0.9995671510696411
#           }
#        ]
#     }


In [None]:


#  Load every "34_33_PlusGoogle.*.txt" file from the data folder and stack them into one
#  DataFrame, df_enriched, with a fresh 0..n-1 index.

import glob, os


l_folder   = "./10_Data"

#  glob.glob() returns matches in arbitrary order; sort them so that the row order of
#  df_enriched is the same on every run.
l_files    = sorted(glob.glob(os.path.join(l_folder, "34_33_PlusGoogle.*.txt")))

#  Fail with an explicit message rather than pandas' "No objects to concatenate" when the
#  folder holds no matching file.
if not l_files:
   raise FileNotFoundError("No files matching 34_33_PlusGoogle.*.txt found in " + l_folder)

#  One DataFrame per file ("|" delimited, header row inferred).
l_enriched = [pd.read_csv(l_each, header = "infer", delimiter = "|") for l_each in l_files]

df_enriched = pd.concat(l_enriched, ignore_index = True)


print("--")






#  Step 02: Create the Graph ..

In [None]:

#  At this point we have several Pandas DataFrames; import them into the graph.
#
#     Why use Pandas DataFrames versus Dask ?
#
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


def _import_nodes(l_dataframe, l_id_space, l_label):
   """Insert the rows of l_dataframe into my_graph as nodes.

   The "id" column is used as the node id within l_id_space, and every node gets the
   label l_label. Each call uses its own DataFrameImporter and its own insert().
   """
   with import_data.DataFrameImporter(my_graph) as df_importer:
      df_importer.nodes_dataframe(
         l_dataframe,
         id_column             = "id",
         id_space              = l_id_space,
         label                 = l_label,
         )
      df_importer.insert()


def _import_edges(l_dataframe, l_source_id_space, l_destination_id_space, l_type):
   """Insert the rows of l_dataframe into my_graph as edges of type l_type.

   "start_id" is resolved in l_source_id_space and "end_id" in l_destination_id_space.
   Each call uses its own DataFrameImporter and its own insert().
   """
   with import_data.DataFrameImporter(my_graph) as df_importer:
      df_importer.edges_dataframe(
         l_dataframe,
         source_id_space       = l_source_id_space,
         destination_id_space  = l_destination_id_space,
         source_column         = "start_id",
         destination_column    = "end_id",
         type                  = l_type,
         )
      df_importer.node_id_property_name("id")      #  Required when inserting only edges, no nodes
      df_importer.insert()


#  Just nodes
#
#  Each node type gets its own id space and its own label. The MedicalSpecialty and Keyword
#  nodes were previously labelled "PatientVisit", and the Keyword nodes also shared the
#  "PatientVisit" id space, which looks like a copy-paste slip.
#
_import_nodes(df_PatientVisitNodes,     "PatientVisit",     "PatientVisit"    )
_import_nodes(df_MedicalSpecialtyNodes, "MedicalSpecialty", "MedicalSpecialty")
_import_nodes(df_KeywordNodes,          "Keyword",          "Keyword"         )
_import_nodes(df_UmlsEntityNodes,       "UmlsEntity",       "UmlsEntity"      )
_import_nodes(df_UmlsVocabularyNodes,   "UmlsVocabulary",   "UmlsVocabulary"  )


#  Just edges
#
#  A heterogeneous relationship: we need two inserts for each edge type (the _N and the _S
#  frame), because the source and destination id spaces are different.
#
_import_edges(df_PatientVisitToEntityEdge_N,     "PatientVisit",   "UmlsEntity",     "VISIT_CONTAINS")
_import_edges(df_PatientVisitToEntityEdge_S,     "UmlsEntity",     "PatientVisit",   "VISIT_CONTAINS")

#  The E_04_00_* files are loaded as df_PatientVisitToVocabularyEdge_N / _S. This cell used to
#  reference df_EntityToVocabularyEdge_N / _S, which are never defined (NameError).
#  NOTE(review): confirm these frames hold UmlsEntity <-> UmlsVocabulary id pairs, as the id
#  spaces below assume.
_import_edges(df_PatientVisitToVocabularyEdge_N, "UmlsEntity",     "UmlsVocabulary", "ALSO_CODED_AS" )
_import_edges(df_PatientVisitToVocabularyEdge_S, "UmlsVocabulary", "UmlsEntity",     "ALSO_CODED_AS" )


      ###


display("Number of nodes: %d   Numbers of edges: %d" % ( my_graph.num_nodes(), my_graph.num_edges() ))


display("--")


#  Sample output,
#






#  Step 03:  Checking Results

In [None]:

#  Produce a visual graph from a sample of up to 1000 (n)-[r]->(m) matches.

from katana_visualization_widget import GraphVisOptions, NodeVisOption, EdgeVisOption, ANY

#  (node label, property passed as NodeVisOption's label=), presumably the property shown
#  as the caption of nodes with that label.
l_node_captions = (
   ("PatientVisit",   "sample_name"),
   ("UmlsEntity",     "id"         ),
   ("UmlsVocabulary", "id"         ),
)

l_options = GraphVisOptions(
   node_options = [
      NodeVisOption(l_node_label, label=l_property)
      for l_node_label, l_property in l_node_captions
   ],
   #  edge_options = [
   #     EdgeVisOption(["VISIT_CONTAINS"], label="start_id"),
   #     EdgeVisOption(["ALSO_CODED_AS" ], label="start_id"),
   #  ]
   )

l_query = """

   MATCH (n) - [r] -> (m)
   RETURN n, r, m
   LIMIT 1000 
   
   """

l_result = my_graph.query(l_query, contextualize=True)

#  Last expression of the cell, so the rendered view becomes the cell output.
l_result.view(graph_vis_options = l_options)


<div> 
<img src="./01_Images/Results_01.png" alt="Drawing" style="width: 800px;"/>
</div>

In [None]:

#  Produce a visual graph for one PatientVisit (id "4001"): the visit, the UmlsEntity nodes
#  linked to it by VISIT_CONTAINS, and the UmlsVocabulary nodes those entities point to
#  through ALSO_CODED_AS.

from katana_visualization_widget import GraphVisOptions, NodeVisOption, EdgeVisOption, ANY

#  (node label, property passed as NodeVisOption's label=), presumably the property shown
#  as the caption of nodes with that label.
l_node_captions = (
   ("PatientVisit",   "sample_name"),
   ("UmlsEntity",     "id"         ),
   ("UmlsVocabulary", "id"         ),
)

l_options = GraphVisOptions(
   node_options = [
      NodeVisOption(l_node_label, label=l_property)
      for l_node_label, l_property in l_node_captions
   ],
   #  edge_options = [
   #     EdgeVisOption(["VISIT_CONTAINS"], label="start_id"),
   #     EdgeVisOption(["ALSO_CODED_AS" ], label="start_id"),
   #  ]
   )

l_query = """

   MATCH (n: PatientVisit) - [r: VISIT_CONTAINS] - (m: UmlsEntity) - [s: ALSO_CODED_AS] -> (t: UmlsVocabulary)
   WHERE n.id = "4001"
   RETURN n, r, m, s, t
   
   """

l_result = my_graph.query(l_query, contextualize=True)

#  Last expression of the cell, so the rendered view becomes the cell output.
l_result.view(graph_vis_options = l_options)


<div> 
<img src="./01_Images/Results_02.png" alt="Drawing" style="width: 800px;"/>
</div>

In [None]:

#  Checking the VISIT_CONTAINS edges specifically: count them first without a direction,
#  then PatientVisit -> UmlsEntity, then UmlsEntity -> PatientVisit.
#

l_count_queries = (
   """
   MATCH (n) - [r: VISIT_CONTAINS] - (m)
   RETURN COUNT(r)
   """,
   """
   MATCH (n: PatientVisit) - [r: VISIT_CONTAINS] -> (m: UmlsEntity)
   RETURN COUNT(r)
   """,
   """
   MATCH (m: UmlsEntity) - [r: VISIT_CONTAINS] -> (n: PatientVisit)
   RETURN COUNT(r)
   """,
)

for l_count_query in l_count_queries:
   l_result = my_graph.query(l_count_query, contextualize=False)
   print(l_result)

#  Sample output,    (I don't think this is correct)
#
#        COUNT(r)
#     0     33916              #  This is 4x  8479
#
#        COUNT(r)
#     0      8479
#
#        COUNT(r)
#     0      8479
#
#  NOTE(review): the 4x is presumably expected rather than wrong. VISIT_CONTAINS edges are
#  inserted in both directions (8479 + 8479), and an undirected pattern matches each stored
#  edge once per orientation, giving 2 x 16958 = 33916. Confirm against the query engine's
#  semantics.


In [None]:

#  Checking the ALSO_CODED_AS edges: count them first without a direction, then
#  UmlsEntity -> UmlsVocabulary, then UmlsVocabulary -> UmlsEntity.
#

l_count_queries = (
   """
   MATCH (m) - [s: ALSO_CODED_AS] - (t)
   RETURN COUNT(s)
   """,
   """
   MATCH (m: UmlsEntity) - [s: ALSO_CODED_AS] -> (t: UmlsVocabulary)
   RETURN COUNT(s)
   """,
   """
   MATCH (t: UmlsVocabulary) - [s: ALSO_CODED_AS] -> (m: UmlsEntity)
   RETURN COUNT(s)
   """,
)

for l_count_query in l_count_queries:
   l_result = my_graph.query(l_count_query, contextualize=False)
   print(l_result)

#  Sample output,    (I don't think this is correct)
#
#        COUNT(s)
#     0     43904              #  This is 4x  10976
#
#        COUNT(s)
#     0     10976
#
#        COUNT(s)
#     0     10976
#
#  NOTE(review): the 4x is presumably expected rather than wrong. ALSO_CODED_AS edges are
#  inserted in both directions (10976 + 10976), and an undirected pattern matches each stored
#  edge once per orientation, giving 2 x 21952 = 43904. Confirm against the query engine's
#  semantics.
