#  Setup: Display options

In [None]:

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate

print("--")


# Setup: Connectivity

In [None]:

from katana import remote

my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [116]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


--


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [117]:

#  CREATE GRAPH
#
my_graph = my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


<_Graph my_graph, dcLHS6ayyAudS2cxaUPxJyrdL81NUNjRmxeaN7cdUE5, 0>


# Step 01:  Process CSV files ..

In [None]:

#  This data was sourced from,
#     https://mtsamples.com/site/pages/browse.asp?type=96-Hematology%20-%20Oncology
#

#  Column headers,
#     id|medical_specialty|sample_name|description|body
#
df_PatientVisitNodes = pd.read_csv("./10_Data/22_OncologyCallNotes.txt", header = "infer", delimiter = "|")


#  90 Real lines of data.  (Some data is multi-line, and enclosed in double quotes.)
#
print("Number of CSV input lines: %d" % (len(df_PatientVisitNodes)))
print("")


#   Above, the field titled "body" contains embedded new lines; remove those
#   and place in a new field titled, "body_1line"
#
df_PatientVisitNodes["body_1line"] = df_PatientVisitNodes.body.map(lambda x: str(x).replace("\n", "") )
#  
#  And add a "LABEL" property
#
df_PatientVisitNodes["LABEL"]      = df_PatientVisitNodes.body.map(lambda x: "PatientVisit"           )


l_cntr = 0
   #
for l_each in df_PatientVisitNodes.itertuples():
   l_cntr += 1
      #
   if (l_cntr < 5):
      print("Record number: %d   Sample name: %-44s   Short text: %s" % (l_each.id, l_each.sample_name[0:43], l_each.body_1line[0:60]))
        
        
print("")
print("--")

#  Sample output,
#
#     Number of CSV input lines: 90
#     
#     Record number: 4001   Sample name: 3-Dimensional Simulation                       Short text: This patient is undergoing 3-dimensionally planned radiation
#     Record number: 4002   Sample name: Adrenalectomy & Umbilical Hernia Repair        Short text: PREOPERATIVE DIAGNOSES1. Adrenal mass, right sided.2. Umbi
#     Record number: 4003   Sample name: Anaplastic Astrocytoma - Letter                Short text: Month DD, YYYYXYZRE: ABCMEDICAL RECORD#: 123Dear Dr. 
#     Record number: 4004   Sample name: Anemia & Leukemia Followup                     Short text: CHIEF COMPLAINT:1. Chronic lymphocytic leukemia (CLL).2. A


#  Step 02: Enrich the above from a Google Web service ..

In [None]:

#  Google has a Web service to convert text into usable UMLS codes. See,
#        https://cloud.google.com/healthcare-api/docs/how-tos/nlp
#
#  In this cell, we begin to invoke this service on the text from the
#  cell above.

#  See also,
#     https://stackoverflow.com/questions/53472429/how-to-get-a-gcp-bearer-token-programmatically-with-python

#  Google:
#
#     .  We had to create an Auth Token, which produced a JSON file.
#        (Instruction in Url above.)
#
#     .  Our JSON file is at,
#              export GOOGLE_APPLICATION_CREDENTIALS="/mnt/hgfs/My.20/MyShare_1/46 Topics 2022/91 KG, All Prospects/13 KG, DataBricks, Google/10_Data/05_katana-clusters-beta-d8605ac248e7.json"
#              export GOOGLE_APPLICATION_CREDENTIALS="/home/jovyan/work/My_KG_NoteBooks/P1_Prospects/10_DataBricks_Google/10_Data/05_katana-clusters-beta-d8605ac248e7.json"
#
#     .  To extract the Auth Token, set the above, then run
#           gcloud auth application-default print-access-token


import google.auth
import google.auth.transport.requests
from google.oauth2 import service_account


#  This token times out often; you must rerun this block from time to time
#
l_credentials = service_account.Credentials.from_service_account_file(
   "/home/jovyan/work/My_KG_NoteBooks/P1_Prospects/10_DataBricks_Google/10_Data/05_katana-clusters-beta-d8605ac248e7.json",
   scopes=['https://www.googleapis.com/auth/cloud-platform'])
l_auth_req = google.auth.transport.requests.Request()
l_credentials.refresh(l_auth_req)
   #
l_token = l_credentials.token
    
print(l_token)


#  Sample output,
#
#     ya29.c.b0Aa9VdylvtWgGXBZyFkW5mADfcFiyBVitZsvkoKbHpCuXU7zGkgANRcho_ax5_SWWbiXfQj6cprlobWUlHnPkYEoKBCRw6   ...
    

In [None]:

#  Cool page; convert Curl command to Python,
#     https://reqbin.com/curl

#  Run the Google Web service, capture results
#

import requests
from requests.structures import CaseInsensitiveDict
   #
import json

    
#  Function, because we will loop on this below-
#
def f_enrich(i_arg1):
    
   url = "https://healthcare.googleapis.com/v1/projects/katana-clusters-beta/locations/us-central1/services/nlp:analyzeEntities"
   
   l_headers = CaseInsensitiveDict()
      #
   l_headers["Authorization"] = "Bearer " + l_token
   l_headers["Content-Type"]  = "application/json"
       
    
   l_data = """
      {{
      'nlpService':'projects/katana-clusters-beta/locations/us-central1/services/nlp',
      'documentContent':'{0}'
      }}
      """.format(i_arg1)
         #
   l_resp = requests.post(url, headers = l_headers, data = l_data)
      #
   return l_resp



l_response = f_enrich("Insulin regimen human 5 units IV administered.")
   #

print(l_response.status_code)
print("")
   #
l_data_asjson = json.loads(l_response.content)                       #  Get the response in json
print(json.dumps(l_data_asjson, indent = 3))                         #  This gives us a pretty print (easier to read)


#  Sample data after this cell-,
#


#  Step 00:  Sample data from above

In [None]:

#  Sample data from above,
#
#     200
#     
#     {
#        "entityMentions": [
#           {
#              "mentionId": "1",
#              "type": "MEDICINE",
#              "text": {
#                 "content": "Insulin regimen",
#                 "beginOffset": 0
#              },
#              "linkedEntities": [
#                 {
#                    "entityId": "UMLS/C0021641"
#                 },
#                 {
#                    "entityId": "UMLS/C0795635"
#                 },
#                 {
#                    "entityId": "UMLS/C1533581"
#                 },
#                 {
#                    "entityId": "UMLS/C3537244"
#                 },
#                 {
#                    "entityId": "UMLS/C3714501"
#                 }
#              ],
#              "temporalAssessment": {
#                 "value": "CURRENT",
#                 "confidence": 0.8573660850524902
#              },
#              "certaintyAssessment": {
#                 "value": "LIKELY",
#                 "confidence": 0.9751282930374146
#              },
#              "subject": {
#                 "value": "PATIENT",
#                 "confidence": 0.9995787739753723
#              },
#              "confidence": 0.6379408836364746
#           },
#           {
#              "mentionId": "2",
#              "type": "MED_DOSE",
#              "text": {
#                 "content": "5 units",
#                 "beginOffset": 22
#              },
#              "confidence": 0.7443782091140747
#           },
#           {
#              "mentionId": "3",
#              "type": "MED_ROUTE",
#              "text": {
#                 "content": "IV",
#                 "beginOffset": 30
#              },
#              "linkedEntities": [
#                 {
#                    "entityId": "UMLS/C0348016"
#                 }
#              ],
#              "confidence": 0.779011607170105
#           }
#        ],
#        "entities": [
#           {
#              "entityId": "UMLS/C0021641",
#              "preferredTerm": "Insulin",
#              "vocabularyCodes": [
#                 "FMA/83365",
#                 "LNC/LA15805-7",
#                 "LNC/LP14676-8",
#                 "LNC/LP16325-0",
#                 "LNC/LP32542-0",
#                 "LNC/LP70329-5",
#                 "LNC/MTHU002108",
#                 "LNC/MTHU019392",
#                 "MSH/D007328",
#                 "MTH/NOCODE"
#              ]
#           },
#           {
#              "entityId": "UMLS/C0348016",
#              "preferredTerm": "Intravenous",
#              "vocabularyCodes": [
#                 "LNC/LA9437-0",
#                 "LNC/LP32453-0",
#                 "MTH/NOCODE",
#                 "NCI/C13346"
#              ]
#           },
#           {
#              "entityId": "UMLS/C0795635",
#              "preferredTerm": "insulin, regular, human",
#              "vocabularyCodes": [
#                 "LNC/LP17001-6",
#                 "MSH/D061386",
#                 "MTH/NOCODE",
#                 "NCI/C29125",
#                 "RXNORM/253182",
#                 "VANDF/4017559",
#                 "VANDF/4017569",
#                 "VANDF/4019786"
#              ]
#           },
#           {
#              "entityId": "UMLS/C1533581",
#              "preferredTerm": "Therapeutic Insulin",
#              "vocabularyCodes": [
#                 "MTH/NOCODE",
#                 "NCI/C581"
#              ]
#           },
#           {
#              "entityId": "UMLS/C3537244",
#              "preferredTerm": "Insulins",
#              "vocabularyCodes": [
#                 "MSH/D061385",
#                 "MTH/NOCODE"
#              ]
#           },
#           {
#              "entityId": "UMLS/C3714501",
#              "preferredTerm": "Insulin Drug Class",
#              "vocabularyCodes": [
#                 "MTH/NOCODE",
#                 "VANDF/4021631"
#              ]
#           }
#        ],
#        "relationships": [
#           {
#              "subjectId": "1",
#              "objectId": "2",
#              "confidence": 0.9996469616889954
#           },
#           {
#              "subjectId": "1",
#              "objectId": "3",
#              "confidence": 0.9995671510696411
#           }
#        ]
#     }


#  Step 02:  (continued)

In [None]:

#  Run the above Web service against our first DataFrame
#

#  Recall df_PatientVisitNodes contains,
#
#     id|medical_specialty|sample_name|description|body|body_1line
#


df_enriched = pd.DataFrame(columns = ["id", "enrich_from_Google"])


for l_each in df_ReportNodes.itertuples():
   print("Processing id: %d" % (l_each.id))
      #
   l_response = f_enrich(l_each.body_1line)
   l_data_asjson = json.loads(l_response.content) 

   df_enriched = df_enriched.append({"id" : l_each.id, "enrich_from_Google" : l_data_asjson}, ignore_index = True)
    

print("")


l_cntr = 0
   #
for l_each in df_enriched.itertuples():
   l_cntr += 1
      #
   if (l_cntr < 5):
      print("Record number: %d   Short text: %s" % (l_each.id, str(l_each.enrich_from_Google)[0:60]))
        
        
print("")
print("--")

#  Sample output,
#
#     Processing id: 4001
#     Processing id: 4002
#     Processing id: 4003
#        ...
#         
#     Record number: 4001   Short text: {'error': {'code': 400, 'message': "Invalid JSON payload rec
#     Record number: 4002   Short text: {'entityMentions': [{'mentionId': '1', 'type': 'PROBLEM', 't
#     Record number: 4003   Short text: {'entityMentions': [{'mentionId': '1', 'type': 'PROBLEM', 't
#     Record number: 4004   Short text: {'entityMentions': [{'mentionId': '1', 'type': 'PROBLEM', 't
#     
#     --


In [None]:

#  Write this out to a file
#

l_file = "./10_Data/23_22_PlusGoogle.txt"

df_enriched.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)


print("--")


# Step 03:  Extract actual field data from the JSON Google gave us

In [None]:


#  The model for the df_enriched DataFrame is,
#     id|enrich_from_Google
#
#     with the second column being a JSON encoded string with its own model.
#
#  Here we loop thru said DataFrme, and make our resultant Nodes and Edges
#  for our graph.
#

import numpy as np


l_NumOfBadJson              = 0
l_NumOfRootKeyNotFound      = 0
   #
l_UmlsEntityNodes           = []
l_UmlsVocabularyNodes       = []
   #
l_PatientVisitToEntityEdge  = []
l_EntityToVocabularyEdge    = []


for l_each in df_enriched.itertuples():
    
   #  Before we cleaned up all of the JSON (random single quotes for possesive nouns,
   #  other), we would get errors similar to,
   #
   #      4001|"{'error': {'code': 400, 'message': ""Invalid JSON payload received. Expected , or } after key:value pair.\naced
   #         on the patient's skin or on the imm\n                    ^"", 'status': 'INVALID_ARGUMENT'}}"
   #
   #  Upon receipt of this condition, a root level key with the title "error" would be
   #  present. If those return, filter those out-
   #
   if ("error" in l_each.enrich_from_Google):
      l_NumOfBadJson += 1
    
   else:
      #
      #  "entities" should be a root level key to this dictionary
      #
      if ("entities" in l_each.enrich_from_Google):
         #
         #  Loop thru these
         #
         for l_entity in l_each.enrich_from_Google["entities"]:
            if ("entityId" in l_entity):
               #
               #  Build a dictionary that we will append to a set
               #
               l_recd1 = { "id": l_entity["entityId"], "entityId" : l_entity["entityId"], "LABEL": "UmlsEntity" }
               #
               #  If this key is present, add it to the dictionary
               #
               if ("preferredTerm" in l_entity):
                  #
                  #  We have both keys, build a record and add to our sets
                  #
                  l_recd1.update( {"preferredTerm": l_entity["preferredTerm"]} )
                     #
               l_UmlsEntityNodes.append(l_recd1)
               #
               #  Above was our list of Nodes of LABEL "UmlsEntity"
               #  
               #  Here we make our Edge list from;  PatientVisit --> UmlsEntity
               #
               l_recd2a = { "start_id": str(l_each.id), "end_id":   str(l_entity["entityId"])  , "TYPE": "VISIT_CONTAINS" }
               l_recd2b = { "end_id":   str(l_each.id), "start_id": str(l_entity["entityId"], "TYPE": "VISIT_CONTAINS" }
                  #
               l_PatientVisitToEntityEdge.append(l_recd2a)
               l_PatientVisitToEntityEdge.append(l_recd2b)
               #
               #  We are done with UmlsEntity and its Edge to PatientVisit
               #
               #  Also in "entities" is another array, "vocabularyCodes"
               #
               if ("vocabularyCodes" in l_entity):
                  for l_vocab in l_entity["vocabularyCodes"]:
                     #
                     #  Add to our set of Vocabulary Nodes
                     #
                     l_recd3 = { "id": l_vocab, "vocabularyCode": l_vocab, "LABEL": "UmlsVocabulary" }
                        #
                     l_UmlsVocabularyNodes.append(l_recd3)
                     #
                     #  And create the Edge from UmlsEntity --> UmlsVocabulary
                     #
                     l_recd4a = { "start_id": l_entity["entityId"], "end_id":   l_vocab, "TYPE": "ALSO_CODED_AS" }
                     l_recd4b = { "end_id":   l_entity["entityId"], "start_id": l_vocab, "TYPE": "ALSO_CODED_AS" }
                        #
                     l_EntityToVocabularyEdge.append(l_recd4a)
                     l_EntityToVocabularyEdge.append(l_recd4b)
            else:
               #
               #  No "entityId" in our record. This has never happened.
               #  We wont report, just pass.
               #
               pass
            
      else:
         l_NumOfRootKeyNotFound += 1
            
            
   ###


print("Number of 'Error' input records: %d   Number of 'No Root Key' input records: %d" % ( l_NumOfBadJson, l_NumOfRootKeyNotFound) )
print("")


#  l_UmlsEntityNodes, l_UmlsVocabularyNodes, l_PatientVisitToEntityEdge, and l_EntityToVocabularyEdge
#     are currently arrays, and have duplicate records.
#
#  Arguably; these should have been sets(), [ then ] made into arrays or DataFrames.
#     (Memory versus CPU. And .. .. two processing lops versus one.)
#
#  Convert these to DataFrames and remove duplicates
#

df_UmlsEntityNodes           = pd.DataFrame.from_records(l_UmlsEntityNodes          ).drop_duplicates()
df_UmlsVocabularyNodes       = pd.DataFrame.from_records(l_UmlsVocabularyNodes      ).drop_duplicates()
   #
df_PatientVisitToEntityEdge  = pd.DataFrame.from_records(l_PatientVisitToEntityEdge ).drop_duplicates()
df_EntityToVocabularyEdge    = pd.DataFrame.from_records(l_EntityToVocabularyEdge   ).drop_duplicates()


print("Number of PatientVisit nodes: %d   UmlsEntity nodes: %d   UmlsVocabulary nodes: %d" % ( len(df_PatientVisitNodes), len(df_UmlsEntityNodes), len(df_UmlsVocabularyNodes) ))
print("")
print("Edges PatientVisit --> UmlsEntity: %d   UmlsEntity --> UmlsVocabulary: %d" % ( len(df_PatientVisitToEntityEdge), len(df_EntityToVocabularyEdge) ))
print("")

print("--")    

#  Sample output,
#
#     Number of 'Error' input records: 0   Number of 'No Root Key' input records: 0
#     
#     Number of PatientVisit nodes: 90   UmlsEntity nodes: 3115   UmlsVocabulary nodes: 8860
#     
#     Edges PatientVisit --> UmlsEntity: 16958   UmlsEntity --> UmlsVocabulary: 21952

    

#  Step 04: Create the Graph

In [118]:

#  At this point we have several Pandas DataFrames; import them into the graph-
#
#     Why use Pandas DataFrames versus Dask ?
#     
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


#  Just nodes
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_PatientVisitNodes,
      id_column             = "id",
      id_space              = "PatientVisit",  
      label                 = "PatientVisit",  
      ) 
   df_importer.insert()
      #
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_UmlsEntityNodes,
      id_column             = "id",
      id_space              = "UmlsEntity",  
      label                 = "UmlsEntity",  
      ) 
   df_importer.insert()
      #
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_UmlsVocabularyNodes,
      id_column             = "id",
      id_space              = "UmlsVocabulary",  
      label                 = "UmlsVocabulary",  
      ) 
   df_importer.insert()
    

#  Just edges
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_PatientVisitToEntityEdge, 
      source_id_space       = "PatientVisit", 
      destination_id_space  = "UmlsEntity",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "VISIT_CONTAINS"
      )
   df_importer.node_id_property_name("id")                        #  This line is required when you are not inserting any Nodes, only Edges
   df_importer.insert()
      #
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_EntityToVocabularyEdge, 
      source_id_space       = "UmlsEntity", 
      destination_id_space  = "UmlsVocabulary",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "ALSO_CODED_AS"
      )
   df_importer.node_id_property_name("id")
   df_importer.insert()


      ###


display("Number of nodes: %d   Numbers of edges: %d" % ( my_graph.num_nodes(), my_graph.num_edges() ))


display("--")


#  Sample output,
#







          0/? [?op/s]

          0/? [?op/s]



          0/? [?op/s]

          0/? [?op/s]



          0/? [?op/s]

          0/? [?op/s]



          0/? [?op/s]


Host 0 errors:
Traceback (most recent call last):
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/worker/worker.py", line 86, in execute
    value = function(graph)
  File "/opt/conda/lib/python3.8/site-packages/katana_enterprise/remote/dataframe_importer/__init__.py", line 453, in remote_write_intermediate_parquet_files
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/remote/dataframe_importer/__init__.py", line 208, in _write_intermediate_parquet_files
    edge_header = "\n".join(_write_dfs_to_parquet(edge_df_infos, "edges"))
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/remote/dataframe_importer/__init__.py", line 201, in _write_dfs_to_parquet
    dask.dataframe.to_parquet(
  File "/opt/miniconda/lib/python3.8/site-packages/dask/dataframe/io/parquet/core.py", line 963, in to_parquet
    out = out.compute(**compute_kwargs)
  File "/opt/miniconda/lib/python3.8/site-packages/dask/base.py", line 315, in compute
    (result

ArrowInvalid: Could not convert 'UMLS/C0877488' with type str: tried to convert to int64

          0/? [?op/s]

          0/? [?op/s]

'Number of nodes: 12065   Numbers of edges: 0'

In [115]:

#  Produce a visual graph

from katana_visualization_widget import GraphVisOptions, NodeVisOption, EdgeVisOption, ANY

l_options = GraphVisOptions(
   node_options = [
      NodeVisOption("PatientVisit",   label="sample_name"),
      NodeVisOption("UmlsEntity",     label="id"         ),
      NodeVisOption("UmlsVocabulary", label="id"         ),
   ],
   #  edge_options = [
   #     EdgeVisOption(["VISIT_CONTAINS"], label="start_id"),
   #     EdgeVisOption(["ALSO_CODED_AS" ], label="start_id"),
   #  ]
   )
    
    
l_result = my_graph.query("""

   MATCH (n) -[r]-> (m)
   RETURN n, r, m
   LIMIT 1000 
   
   """,
   contextualize=True)

l_result.view(graph_vis_options = l_options)


          0/? [?op/s]

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…