#  Setup: Display options

In [None]:

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate

print("--")


# Setup: Connectivity

In [None]:

from katana import remote

my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPH

my_graph = my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


In [None]:

#  CONNECT TO GRAPH

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


# Step 01:  Process First CSV file ..

In [None]:

#  This data was sourced from,
#     https://mtsamples.com/site/pages/browse.asp?type=96-Hematology%20-%20Oncology
#

#  Column headers,
#     id,description,medical_specialty,sample_name,transcription,keywords
#
df_PatientVisitNodes = pd.read_csv("./10_Data/33_32_Processed.txt", header = "infer", delimiter = ",")


print("Number of CSV input lines: %d" % (len(df_PatientVisitNodes)))
print("")


#  And add a "LABEL" property
#
df_PatientVisitNodes["LABEL"]      = df_PatientVisitNodes.id.map(lambda x: "PatientVisit"           )


#  The original/imported "id" column is Null/None for some rows. Fix that-
#
df_PatientVisitNodes = df_PatientVisitNodes.rename(columns={"id": "id_original"})
   #
l_initial = 4000 
   #
df_PatientVisitNodes["id"] = range(l_initial, len(df_PatientVisitNodes) + l_initial)
df_PatientVisitNodes["id"] = df_PatientVisitNodes.id.map(lambda x: "PV-" + str(x) )


for l_each in df_PatientVisitNodes.head(5).itertuples():
   print("Id: %s   Desc: %-32s   Specialty: %-32s   Name: %-32s   Transcription: %-32s   Keywords: %s" % (
      l_each.id,
      l_each.description[0:31],
      l_each.medical_specialty[0:31],
      l_each.sample_name[0:31],
      l_each.transcription[0:31],
      l_each.keywords[0:31],
      ))
        
        
print("")
   #
print("--")

#  Sample output,
#
#     Number of CSV input lines: 4999
#     
#     Id: PV-4000   Desc:  A 23-year-old white female pre    Specialty:  Allergy / Immunology              Name:  Allergic Rhinitis                 Transcription: SUBJECTIVE:,  This 23-year-old     Keywords: allergy / immunology, allergic 
#     Id: PV-4001   Desc:  Consult for laparoscopic gastr    Specialty:  Bariatrics                        Name:  Laparoscopic Gastric Bypass Co    Transcription: PAST MEDICAL HISTORY:, He has d    Keywords: bariatrics, laparoscopic gastri
#     Id: PV-4002   Desc:  Consult for laparoscopic gastr    Specialty:  Bariatrics                        Name:  Laparoscopic Gastric Bypass Co    Transcription: HISTORY OF PRESENT ILLNESS: , I    Keywords: bariatrics, laparoscopic gastri
#     Id: PV-4003   Desc:  2-D M-Mode. Doppler.              Specialty:  Cardiovascular / Pulmonary        Name:  2-D Echocardiogram - 1            Transcription: 2-D M-MODE: , ,1.  Left atrial     Keywords: cardiovascular / pulmonary, 2-d


In [None]:

#  From the sample above,
#
#     .  Check for any Null/None values, and fix those
#

def f_check(i_arg1):
   l_return = ( i_arg1.isnull().sum() / i_arg1.index.size * 100)
      #
   print(l_return)
   print("")

f_check(df_PatientVisitNodes)

#  Sample output,
#
#     id_original          0.000000
#     description          0.000000
#     medical_specialty    0.000000
#     sample_name          0.000000
#     transcription        0.660132
#     keywords             0.000000
#     LABEL                0.000000
#     id                   0.000000
#     dtype: float64


df_PatientVisitNodes["transcription_clean"] = df_PatientVisitNodes["transcription"].fillna("None"[0])
   #
f_check(df_PatientVisitNodes)

#  Sample output,
#
#     id_original            0.000000
#     description            0.000000
#     medical_specialty      0.000000
#     sample_name            0.000000
#     transcription          0.660132
#     keywords               0.000000
#     LABEL                  0.000000
#     id                     0.000000
#     transcription_clean    0.000000
#     dtype: float64


In [None]:

#  From the sample above,
#
#     .  Check both medical_specialty and sample_name for uniquess
#

print("Total rows: %d   Unique-Specialty: %d   Unique-Name: %d" % (
   len(df_PatientVisitNodes),
   len(df_PatientVisitNodes.medical_specialty.unique()),
   len(df_PatientVisitNodes.sample_name.unique()),
   ))

#  Sample output
#
#     Total rows: 4999   Unique-Specialty: 40   Unique-Name: 2377


#  Check ths distribution for medical_specialty
#
print(df_PatientVisitNodes.value_counts(subset=["medical_specialty"]))

#  Sample output,
#
#     medical_specialty             
#      Surgery                          1103
#      Consult - History and Phy.        516
#      Cardiovascular / Pulmonary        372
#      Orthopedic                        355
#      Radiology                         273
#      General Medicine                  259
#      Gastroenterology                  230
#      Neurology                         223
#      SOAP / Chart / Progress Notes     166
#      Obstetrics / Gynecology           160
#      Urology                           158
#      Discharge Summary                 108
#      ENT - Otolaryngology               98
#      Neurosurgery                       94
#      Hematology - Oncology              90
#      Ophthalmology                      83
#      Nephrology                         81
#      Emergency Room Reports             75
#      Pediatrics - Neonatal              70
#      Pain Management                    62
#      Psychiatry / Psychology            53
#      Office Notes                       51
#      Podiatry                           47
#      Dermatology                        29
#      Dentistry                          27
#      Cosmetic / Plastic Surgery         27
#      Letters                            23
#      Physical Medicine - Rehab          21
#      Sleep Medicine                     20
#      Endocrinology                      19
#      Bariatrics                         18
#      IME-QME-Work Comp etc.             16
#      Chiropractic                       14
#      Diets and Nutritions               10
#      Rheumatology                       10
#      Speech - Language                   9
#      Autopsy                             8
#      Lab Medicine - Pathology            8
#      Allergy / Immunology                7
#      Hospice - Palliative Care           6


In [None]:

#   Build a set of Nodes and Edges/Relationships from "medical_specialty" ..
#

l_MedicalSpecialtyNodes_Set                = set()
l_MedicalSpecialtyNodes                    = []
   #
l_PatientVisitToMedicalSpecialtyEdges_N    = []
l_PatientVisitToMedicalSpecialtyEdges_S    = []


#  Start with just the nodes
#
#     .  Use a set() to generate a unique list of values
#     .  Convert to an array of records
#     .  And convert that to a DataFrame
#
for l_each in df_PatientVisitNodes.itertuples():
   #
   #  strip(),  there was leading whitespace on our column value
   #
   l_MedicalSpecialtyNodes_Set.add(l_each.medical_specialty.strip())
      #
for l_each in zip( range(len(l_MedicalSpecialtyNodes_Set)), l_MedicalSpecialtyNodes_Set):
   l_recd = { "id": ("MS-" + str(l_each[0])), "medical_specialty" : l_each[1], "LABEL": "MedicalSpecialty" }
      # 
   l_MedicalSpecialtyNodes.append(l_recd)
      #
df_MedicalSpecialtyNodes = pd.DataFrame.from_records(l_MedicalSpecialtyNodes)
   #
print("Number of Medical Specialities: %d   %d   %d" % (
   len(l_MedicalSpecialtyNodes_Set),
   len(l_MedicalSpecialtyNodes),
   len(df_MedicalSpecialtyNodes),
   ))
print("")


#  Check our work
#
for l_each in df_MedicalSpecialtyNodes.head(5).itertuples():
   print("Id: %-8s   Medical Specialty: %-32s   LABEL: %s" % (
      l_each.id,
      l_each.medical_specialty,
      l_each.LABEL,
      ))


#  And now create the edges
#
for l_each in df_PatientVisitNodes.itertuples():
   #
   #  Get the id from the medical_specialty data set
   #
   #     .  Again we strip because of the leading whitespace in the source data
   #     .  This form of equality returns a Series, hence the to_list() and [0]
   #
   l_msid = df_MedicalSpecialtyNodes.loc[df_MedicalSpecialtyNodes["medical_specialty"] == l_each.medical_specialty.strip()].id.to_list()[0]
      # 
   l_recd_N = { "start_id": l_each.id, "end_id"  : l_msid   , "TYPE": "IS_OF_SPECIALTY" }
   l_recd_S = { "start_id": l_msid   , "end_id"  : l_each.id, "TYPE": "IS_OF_SPECIALTY" }
      #
   l_PatientVisitToMedicalSpecialtyEdges_N.append(l_recd_N)
   l_PatientVisitToMedicalSpecialtyEdges_S.append(l_recd_S)


df_PatientVisitToMedicalSpecialtyEdges_N = pd.DataFrame.from_records(l_PatientVisitToMedicalSpecialtyEdges_N)
df_PatientVisitToMedicalSpecialtyEdges_S = pd.DataFrame.from_records(l_PatientVisitToMedicalSpecialtyEdges_S)
   #
print("")
print("Number of Edges,  PatientVisit --> MedicalSpecialty: %d   %d" % (
   len(df_PatientVisitToMedicalSpecialtyEdges_N),
   len(df_PatientVisitToMedicalSpecialtyEdges_S),
))


#  Check our work
#
print("")
   #
for l_each in df_PatientVisitToMedicalSpecialtyEdges_N.head(5).itertuples():
   print("Start Id: %-8s   End Id: %-8s   TYPE: %s" % (l_each.start_id, l_each.end_id, l_each.TYPE))
        
print("")
   #
for l_each in df_PatientVisitToMedicalSpecialtyEdges_S.head(5).itertuples():
   print("Start Id: %-8s   End Id: %-8s   TYPE: %s" % (l_each.start_id, l_each.end_id, l_each.TYPE))
        
        
#  Sample output,
#
#     Number of Medical Specialities: 40   40   40
#     
#     Id: MS-0       Medical Specialty: Surgery                            LABEL: MedicalSpecialty
#     Id: MS-1       Medical Specialty: Neurosurgery                       LABEL: MedicalSpecialty
#     Id: MS-2       Medical Specialty: Orthopedic                         LABEL: MedicalSpecialty
#     Id: MS-3       Medical Specialty: SOAP / Chart / Progress Notes      LABEL: MedicalSpecialty
#     
#     Number of Edges,  PatientVisit --> MedicalSpecialty: 4999   4999
#     
#     Start Id: PV-4000    End Id: MS-9       TYPE: IS_OF_SPECIALTY
#     Start Id: PV-4001    End Id: MS-16      TYPE: IS_OF_SPECIALTY
#     Start Id: PV-4002    End Id: MS-16      TYPE: IS_OF_SPECIALTY
#     Start Id: PV-4003    End Id: MS-10      TYPE: IS_OF_SPECIALTY
#     
#     Start Id: MS-9       End Id: PV-4000    TYPE: IS_OF_SPECIALTY
#     Start Id: MS-16      End Id: PV-4001    TYPE: IS_OF_SPECIALTY
#     Start Id: MS-16      End Id: PV-4002    TYPE: IS_OF_SPECIALTY
#     Start Id: MS-10      End Id: PV-4003    TYPE: IS_OF_SPECIALTY


In [None]:

#
#  **  This cell takes 1-2 minutes to complete ..
#


#  Same basic function as above; split out "keywords"
#

l_KeywordNodes_Set                = set()
l_KeywordNodes                    = []
   #
l_PatientVisitToKeywordEdges_N    = []
l_PatientVisitToKeywordEdges_S    = []


#  Start with just the nodes
#
#     .  Use a set() to generate a unique list of values
#     .  Convert to an array of records
#     .  And convert that to a DataFrame
#
for l_each in df_PatientVisitNodes.itertuples():
   #
   #  Doing this a little differently than above because lists are mutable
   #
   l_KeywordNodes_Set.update(l_each.keywords.split(","))
      #
for l_each in zip( range(len(l_KeywordNodes_Set)), l_KeywordNodes_Set):
   if (len(l_each[1]) > 2):                                                   #  Some of the keywords were (Null/None)
      l_recd = { "id": ("KW-" + str(l_each[0])),
         "keyword" : l_each[1].strip(), "LABEL": "Keyword" }                  #  strip(), Removing leading whitespace
            # 
      l_KeywordNodes.append(l_recd)
         #
df_KeywordNodes = pd.DataFrame.from_records(l_KeywordNodes)
   #
print("Number of Keywords: %d   %d   %d" % (
   len(l_KeywordNodes_Set),
   len(l_KeywordNodes),
   len(df_KeywordNodes),
   ))    
print("")


#  Check our work
#
for l_each in df_KeywordNodes.head(5).itertuples():
   print("Id: %-8s   Keyword: %-32s   LABEL: %s" % (
      l_each.id,
      l_each.keyword,
      l_each.LABEL,
      ))
print("")

    
#  And now create the edges
#
#  A little different than cells above because we have keywords is an array
#
for l_each1 in df_PatientVisitNodes.itertuples():
   for l_each2 in l_each1.keywords.split(","):   
      #
      #  Get the id from the keywords data set.
      #  What we get back from this operation is a Series, hence, to_list()
      #
      l_kwid =  df_KeywordNodes.loc[df_KeywordNodes["keyword"] == l_each2.strip()].id.to_list()
         #
      if (len(l_kwid) > 0):
            # 
         l_recd_N = { "start_id": l_each1.id, "end_id" : l_kwid[0] , "TYPE": "IS_OF_KEYWORD" }
         l_recd_S = { "start_id": l_kwid[0] , "end_id" : l_each1.id, "TYPE": "IS_OF_KEYWORD" }
            #
         l_PatientVisitToKeywordEdges_N.append(l_recd_N)
         l_PatientVisitToKeywordEdges_S.append(l_recd_S)
            #
df_PatientVisitToKeywordEdges_N = pd.DataFrame.from_records(l_PatientVisitToKeywordEdges_N)
df_PatientVisitToKeywordEdges_S = pd.DataFrame.from_records(l_PatientVisitToKeywordEdges_S)
   #
print("Number of Edges,  PatientVisit --> Keywords: %d   %d" % (
   len(df_PatientVisitToKeywordEdges_N),
   len(df_PatientVisitToKeywordEdges_S),
))
    
    
#  Check our work
#
print("")
   #
for l_each in df_PatientVisitToKeywordEdges_N.head(5).itertuples():
   print("Start Id: %-8s   End Id: %-8s   TYPE: %s" % (l_each.start_id, l_each.end_id, l_each.TYPE))
        
print("")
   #
for l_each in df_PatientVisitToKeywordEdges_S.head(5).itertuples():
   print("Start Id: %-8s   End Id: %-8s   TYPE: %s" % (l_each.start_id, l_each.end_id, l_each.TYPE))


#  Sample output
#
#     Number of Keywords: 10445   10443   10443
#     
#     Id: KW-1       Keyword: heart tones                        LABEL: Keyword
#     Id: KW-2       Keyword: mastopexy                          LABEL: Keyword
#     Id: KW-3       Keyword: temporal lobes                     LABEL: Keyword
#     Id: KW-4       Keyword: heart attacks                      LABEL: Keyword
#     Id: KW-5       Keyword: digital laceration                 LABEL: Keyword
#     
#     Number of Edges,  PatientVisit --> Keywords: 52838   52838
#     
#     Start Id: PV-4000    End Id: KW-10014   TYPE: IS_OF_KEYWORD
#     Start Id: PV-4000    End Id: KW-5669    TYPE: IS_OF_KEYWORD
#     Start Id: PV-4000    End Id: KW-3019    TYPE: IS_OF_KEYWORD
#     Start Id: PV-4000    End Id: KW-9372    TYPE: IS_OF_KEYWORD
#     Start Id: PV-4000    End Id: KW-7627    TYPE: IS_OF_KEYWORD
#     
#     Start Id: KW-10014   End Id: PV-4000    TYPE: IS_OF_KEYWORD
#     Start Id: KW-5669    End Id: PV-4000    TYPE: IS_OF_KEYWORD
#     Start Id: KW-3019    End Id: PV-4000    TYPE: IS_OF_KEYWORD
#     Start Id: KW-9372    End Id: PV-4000    TYPE: IS_OF_KEYWORD
#     Start Id: KW-7627    End Id: PV-4000    TYPE: IS_OF_KEYWORD


# Step 00:  Save our work ..

In [None]:

#  We've created several sets of Nodes, and some bi-directional edges.
#  Save these to file.
#


#  For Patient Visit, we'll take this chance to drop some columns
#
l_file = "./10_Data/N_01_00_PatientVist.txt"
   #
df_PatientVisitNodes[["id", "description", "sample_name", "transcription"]].to_csv(l_file, sep = "|", encoding = "utf-8", index = False)


#  For Medical Specialty, we'll keep all columns
#
l_file = "./10_Data/N_02_00_MedicalSpecialty.txt"
   #
df_MedicalSpecialtyNodes.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)


#  For Keyword, we'll keep all columns
#
l_file = "./10_Data/N_03_00_Keyword.txt"
   #
df_KeywordNodes.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)


   ###
    

#  Our Edges
#
l_file = "./10_Data/E_01_00_N_PatientVistToMedicalSpecialty.txt"
   #
df_PatientVisitToMedicalSpecialtyEdges_N.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)
   #
l_file = "./10_Data/E_01_00_S_PatientVistToMedicalSpecialty.txt"
   #
df_PatientVisitToMedicalSpecialtyEdges_S.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)


l_file = "./10_Data/E_02_00_N_PatientVistToKeyword.txt"
   #
df_PatientVisitToKeywordEdges_N.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)
   #
l_file = "./10_Data/E_02_00_S_PatientVistToKeyword.txt"
   #
df_PatientVisitToKeywordEdges_S.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)


print("--")



#  Step 02: Enrich the above from a Google Web service ..

In [None]:

#  Google has a Web service to convert text into usable UMLS codes. See,
#        https://cloud.google.com/healthcare-api/docs/how-tos/nlp
#
#  In this cell, we begin to invoke this service on the text from the
#  cell above.

#  See also,
#     https://stackoverflow.com/questions/53472429/how-to-get-a-gcp-bearer-token-programmatically-with-python

#  Google:
#
#     .  We had to create an Auth Token, which produced a JSON file.
#        (Instruction in Url above.)
#
#     .  Our JSON file is at,
#              export GOOGLE_APPLICATION_CREDENTIALS="/mnt/hgfs/My.20/MyShare_1/46 Topics 2022/91 KG, All Prospects/13 KG, DataBricks, Google/10_Data/05_katana-clusters-beta-d8605ac248e7.json"
#              export GOOGLE_APPLICATION_CREDENTIALS="/home/jovyan/work/My_KG_NoteBooks/P1_Prospects/10_DataBricks_Google/10_Data/05_katana-clusters-beta-d8605ac248e7.json"
#
#     .  To extract the Auth Token, set the above, then run
#           gcloud auth application-default print-access-token


import google.auth
import google.auth.transport.requests
from google.oauth2 import service_account


#  This token times out often; you must rerun this block from time to time
#
l_credentials = service_account.Credentials.from_service_account_file(
   "/home/jovyan/work/My_KG_NoteBooks/P1_Prospects/13_DataBricks_Google/10_Data/05_katana-clusters-beta-d8605ac248e7.json",
   scopes=['https://www.googleapis.com/auth/cloud-platform'])
l_auth_req = google.auth.transport.requests.Request()
l_credentials.refresh(l_auth_req)
   #
l_token = l_credentials.token
    
print("")
print("Token: " + l_token[0:120] + " ...")
print("")


#  Sample output,
#
#     Token: ya29.c.b0Aa9VdylvtWgGXBZyFkW5mADfcFiyBVitZsvkoKbHpCuXU7zGkgANRcho_ax5_SWWbiXfQj6cprlobWUlHnPkYEoKBCRw6   ...
    

In [None]:

#  Cool page; convert Curl command to Python,
#     https://reqbin.com/curl

#  Run the Google Web service, capture results
#

import requests
from requests.structures import CaseInsensitiveDict
   #
import json

    
#  Function, because we will loop on this below-
#
def f_enrich(i_arg1):
    
   url = "https://healthcare.googleapis.com/v1/projects/katana-clusters-beta/locations/us-central1/services/nlp:analyzeEntities"
   
   l_headers = CaseInsensitiveDict()
      #
   l_headers["Authorization"] = "Bearer " + l_token
   l_headers["Content-Type"]  = "application/json"
       
    
   l_data = """
      {{
      'nlpService':'projects/katana-clusters-beta/locations/us-central1/services/nlp',
      'documentContent':'{0}'
      }}
      """.format(i_arg1)
         #
   l_resp = requests.post(url, headers = l_headers, data = l_data)
      #
   return l_resp



l_response = f_enrich("Insulin regimen human 5 units IV administered.")
   #

print(l_response.status_code)
print("")
   #
l_data_asjson = json.loads(l_response.content)                       #  Get the response in json
print(json.dumps(l_data_asjson, indent = 3))                         #  This gives us a pretty print (easier to read)


#  Sample data after this cell-,
#


#  Step 00:  Sample data from above

In [None]:

#  Sample data from above,
#
#     200
#     
#     {
#        "entityMentions": [
#           {
#              "mentionId": "1",
#              "type": "MEDICINE",
#              "text": {
#                 "content": "Insulin regimen",
#                 "beginOffset": 0
#              },
#              "linkedEntities": [
#                 {
#                    "entityId": "UMLS/C0021641"
#                 },
#                 {
#                    "entityId": "UMLS/C0795635"
#                 },
#                 {
#                    "entityId": "UMLS/C1533581"
#                 },
#                 {
#                    "entityId": "UMLS/C3537244"
#                 },
#                 {
#                    "entityId": "UMLS/C3714501"
#                 }
#              ],
#              "temporalAssessment": {
#                 "value": "CURRENT",
#                 "confidence": 0.8573660850524902
#              },
#              "certaintyAssessment": {
#                 "value": "LIKELY",
#                 "confidence": 0.9751282930374146
#              },
#              "subject": {
#                 "value": "PATIENT",
#                 "confidence": 0.9995787739753723
#              },
#              "confidence": 0.6379408836364746
#           },
#           {
#              "mentionId": "2",
#              "type": "MED_DOSE",
#              "text": {
#                 "content": "5 units",
#                 "beginOffset": 22
#              },
#              "confidence": 0.7443782091140747
#           },
#           {
#              "mentionId": "3",
#              "type": "MED_ROUTE",
#              "text": {
#                 "content": "IV",
#                 "beginOffset": 30
#              },
#              "linkedEntities": [
#                 {
#                    "entityId": "UMLS/C0348016"
#                 }
#              ],
#              "confidence": 0.779011607170105
#           }
#        ],
#        "entities": [
#           {
#              "entityId": "UMLS/C0021641",
#              "preferredTerm": "Insulin",
#              "vocabularyCodes": [
#                 "FMA/83365",
#                 "LNC/LA15805-7",
#                 "LNC/LP14676-8",
#                 "LNC/LP16325-0",
#                 "LNC/LP32542-0",
#                 "LNC/LP70329-5",
#                 "LNC/MTHU002108",
#                 "LNC/MTHU019392",
#                 "MSH/D007328",
#                 "MTH/NOCODE"
#              ]
#           },
#           {
#              "entityId": "UMLS/C0348016",
#              "preferredTerm": "Intravenous",
#              "vocabularyCodes": [
#                 "LNC/LA9437-0",
#                 "LNC/LP32453-0",
#                 "MTH/NOCODE",
#                 "NCI/C13346"
#              ]
#           },
#           {
#              "entityId": "UMLS/C0795635",
#              "preferredTerm": "insulin, regular, human",
#              "vocabularyCodes": [
#                 "LNC/LP17001-6",
#                 "MSH/D061386",
#                 "MTH/NOCODE",
#                 "NCI/C29125",
#                 "RXNORM/253182",
#                 "VANDF/4017559",
#                 "VANDF/4017569",
#                 "VANDF/4019786"
#              ]
#           },
#           {
#              "entityId": "UMLS/C1533581",
#              "preferredTerm": "Therapeutic Insulin",
#              "vocabularyCodes": [
#                 "MTH/NOCODE",
#                 "NCI/C581"
#              ]
#           },
#           {
#              "entityId": "UMLS/C3537244",
#              "preferredTerm": "Insulins",
#              "vocabularyCodes": [
#                 "MSH/D061385",
#                 "MTH/NOCODE"
#              ]
#           },
#           {
#              "entityId": "UMLS/C3714501",
#              "preferredTerm": "Insulin Drug Class",
#              "vocabularyCodes": [
#                 "MTH/NOCODE",
#                 "VANDF/4021631"
#              ]
#           }
#        ],
#        "relationships": [
#           {
#              "subjectId": "1",
#              "objectId": "2",
#              "confidence": 0.9996469616889954
#           },
#           {
#              "subjectId": "1",
#              "objectId": "3",
#              "confidence": 0.9995671510696411
#           }
#        ]
#     }


#  Step 02:  (continued)

In [60]:

#
#  **  This cell takes minutes to complete ..
#

#  Run the above Web service against our first DataFrame
#
#  Recall df_PatientVisitNodes contains,
#
#     id|description|sample_name|transcription
#

df_enriched = pd.DataFrame(columns = ["id", "enrich_from_Google"])
   #
print("")
print("Process nearly 50,000 rows ..")

l_cntr = 0
   #
for l_each in df_PatientVisitNodes.itertuples():
   l_cntr += 1
      #
   if (l_cntr % 5000 == 0):
      print("")
      print("Processed so far: " %d (l_cntr))
   else:
      print(".", end = "")
         #
   l_tosend = bytes(l_each.transcription).decode("utf-8")    
       #
   l_response = f_enrich(l_tosend)
   l_data_asjson = json.loads(l_response.content) 

   df_enriched = df_enriched.append({"id" : l_each.id, "enriched_from_Google" : l_data_asjson}, ignore_index = True)

print("")
print("")


for l_each in df_enriched.head(5).itertuples():
   print("Record number: %d   Short text: %s" % (l_each.id, str(l_each.enriched_from_Google)[0:60]))
        
        
print("")
print("--")

#  Sample output,
#


.............................................................................................................................................................................

UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' in position 2693: Body ('’') is not valid Latin-1. Use body.encode('utf-8') if you want to send it encoded in UTF-8.

In [None]:

#  Write this out to a file
#

l_file = "./10_Data/34_33_PlusGoogle.txt"

df_enriched.to_csv(l_file, sep = "|", encoding = "utf-8", index = False)


print("--")


# Step 03:  Extract actual field data from the JSON Google gave us

In [None]:


#  The model for the df_enriched DataFrame is,
#     id|enrich_from_Google
#
#     with the second column being a JSON encoded string with its own model.
#
#  Here we loop thru said DataFrme, and make our resultant Nodes and Edges
#  for our graph.
#

#  import numpy as np


l_NumOfBadJson                = 0
l_NumOfRootKeyNotFound        = 0
   #
l_UmlsEntityNodes             = []
l_UmlsVocabularyNodes         = []
   #
l_PatientVisitToEntityEdge_N  = []
l_PatientVisitToEntityEdge_S  = []
l_EntityToVocabularyEdge_N    = []
l_EntityToVocabularyEdge_S    = []


for l_each in df_enriched.itertuples():
    
   #  Before we cleaned up all of the JSON (random single quotes for possesive nouns,
   #  other), we would get errors similar to,
   #
   #      4001|"{'error': {'code': 400, 'message': ""Invalid JSON payload received. Expected , or } after key:value pair.\naced
   #         on the patient's skin or on the imm\n                    ^"", 'status': 'INVALID_ARGUMENT'}}"
   #
   #  Upon receipt of this condition, a root level key with the title "error" would be
   #  present. If those return, filter those out-
   #
   if ("error" in l_each.enrich_from_Google):
      l_NumOfBadJson += 1
    
   else:
      #
      #  "entities" should be a root level key to this dictionary
      #
      if ("entities" in l_each.enrich_from_Google):
         #
         #  Loop thru these
         #
         for l_entity in l_each.enrich_from_Google["entities"]:
            if ("entityId" in l_entity):
               #
               #  Build a dictionary that we will append to an array
               #
               l_recd1 = { "id": l_entity["entityId"], "entityId" : l_entity["entityId"], "LABEL": "UmlsEntity" }
               #
               #  If this key is present, add it to the dictionary
               #
               if ("preferredTerm" in l_entity):
                  #
                  #  We have an additional key, add to the record and add to our array
                  #
                  l_recd1.update( {"preferredTerm": l_entity["preferredTerm"]} )
                     #
               l_UmlsEntityNodes.append(l_recd1)
               #
               #  Above was our list of Nodes of LABEL "UmlsEntity"
               #  
               #  Here we make our Edge list from;  PatientVisit --> UmlsEntity
               #
               #  We make all Edges to be bi-directional. As a heterogeneous relationship,
               #  we need two arrays.
               #
               l_recd2a = { "start_id": str(l_each.id)           , "end_id":   str(l_entity["entityId"]), "TYPE": "VISIT_CONTAINS" }
               l_recd2b = { "start_id": str(l_entity["entityId"]), "end_id":   str(l_each.id)           , "TYPE": "VISIT_CONTAINS" }
                  #
               l_PatientVisitToEntityEdge_N.append(l_recd2a)
               l_PatientVisitToEntityEdge_S.append(l_recd2b)
               #
               #  We are done with UmlsEntity and its Edge to PatientVisit
               #
               #  Also in "entities" is another array, "vocabularyCodes"
               #
               if ("vocabularyCodes" in l_entity):
                  for l_vocab in l_entity["vocabularyCodes"]:
                     #
                     #  Add to our set of Vocabulary Nodes
                     #
                     l_recd3 = { "id": l_vocab, "vocabularyCode": l_vocab, "LABEL": "UmlsVocabulary" }
                        #
                     l_UmlsVocabularyNodes.append(l_recd3)
                     #
                     #  And create the Edge from UmlsEntity --> UmlsVocabulary
                     #
                     l_recd4a = { "start_id": str(l_entity["entityId"]), "end_id": str(l_vocab             ), "TYPE": "ALSO_CODED_AS" }
                     l_recd4b = { "start_id": str(l_vocab             ), "end_id": str(l_entity["entityId"]), "TYPE": "ALSO_CODED_AS" }
                        #
                     l_EntityToVocabularyEdge_N.append(l_recd4a)
                     l_EntityToVocabularyEdge_S.append(l_recd4b)
            else:
               #
               #  No "entityId" in our record. This has never happened.
               #  We wont report, just pass.
               #
               pass
            
      else:
         l_NumOfRootKeyNotFound += 1
            
            
   ###


print("Number of 'Error' input records: %d   Number of 'No Root Key' input records: %d" % ( l_NumOfBadJson, l_NumOfRootKeyNotFound) )
print("")


#  l_UmlsEntityNodes, l_UmlsVocabularyNodes, l_PatientVisitToEntityEdge, and l_EntityToVocabularyEdge
#     are currently arrays, and have duplicate records.
#
#  Arguably; these should have been sets(), [ then ] made into arrays or DataFrames.
#     (Memory versus CPU. And .. .. two processing loops versus one.)
#
#  Convert these to DataFrames and remove duplicates
#
 
df_UmlsEntityNodes             = pd.DataFrame.from_records(l_UmlsEntityNodes           ).drop_duplicates()
df_UmlsVocabularyNodes         = pd.DataFrame.from_records(l_UmlsVocabularyNodes       ).drop_duplicates()
   #
df_PatientVisitToEntityEdge_N  = pd.DataFrame.from_records(l_PatientVisitToEntityEdge_N).drop_duplicates()
df_PatientVisitToEntityEdge_S  = pd.DataFrame.from_records(l_PatientVisitToEntityEdge_S).drop_duplicates()
df_EntityToVocabularyEdge_N    = pd.DataFrame.from_records(l_EntityToVocabularyEdge_N  ).drop_duplicates()
df_EntityToVocabularyEdge_S    = pd.DataFrame.from_records(l_EntityToVocabularyEdge_S  ).drop_duplicates()


print("Number of PatientVisit nodes: %d   UmlsEntity nodes: %d   UmlsVocabulary nodes: %d" % ( len(df_PatientVisitNodes), len(df_UmlsEntityNodes), len(df_UmlsVocabularyNodes) ))
print("")
print("Edges PatientVisit --> UmlsEntity: %d   %s   UmlsEntity --> UmlsVocabulary: %d   %s" % (
   len(df_PatientVisitToEntityEdge_N), len(df_PatientVisitToEntityEdge_S),
   len(df_EntityToVocabularyEdge_N  ), len(df_EntityToVocabularyEdge_S  ) ))
print("")

print("--")    

#  Sample output,
#
#     Number of 'Error' input records: 0   Number of 'No Root Key' input records: 0
#     
#     Number of PatientVisit nodes: 90   UmlsEntity nodes: 3115   UmlsVocabulary nodes: 8860
#     
#     Edges PatientVisit --> UmlsEntity: 8479   8479   UmlsEntity --> UmlsVocabulary: 10976   10976


#  Step 04: Create the Graph

In [None]:

#  At this point we have several Pandas DataFrames; import them into the graph-
#
#     Why use Pandas DataFrames versus Dask ?
#     
#     .  The data is small enough
#     .  Dask DataFrames currently (beta) have to be sourced from a shared/public drive
#


from katana.remote import import_data


#  Just nodes
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_PatientVisitNodes,
      id_column             = "id",
      id_space              = "PatientVisit",  
      label                 = "PatientVisit",  
      ) 
   df_importer.insert()
      #
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_UmlsEntityNodes,
      id_column             = "id",
      id_space              = "UmlsEntity",  
      label                 = "UmlsEntity",  
      ) 
   df_importer.insert()
      #
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.nodes_dataframe(
      df_UmlsVocabularyNodes,
      id_column             = "id",
      id_space              = "UmlsVocabulary",  
      label                 = "UmlsVocabulary",  
      ) 
   df_importer.insert()
    

#  Just edges
#
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_PatientVisitToEntityEdge_N, 
      source_id_space       = "PatientVisit", 
      destination_id_space  = "UmlsEntity",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "VISIT_CONTAINS"
      )
   df_importer.node_id_property_name("id")                        #  This line is required when you are not inserting any Nodes, only Edges
   df_importer.insert()                                           #  A heterogeneous relationship, we need two inserts for each edge, because
      #                                                           #  the source and dest id spaces are different.
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_PatientVisitToEntityEdge_S, 
      source_id_space       = "UmlsEntity", 
      destination_id_space  = "PatientVisit",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "VISIT_CONTAINS"
      )
   df_importer.node_id_property_name("id")
   df_importer.insert()
      #
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_EntityToVocabularyEdge_N, 
      source_id_space       = "UmlsEntity", 
      destination_id_space  = "UmlsVocabulary",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "ALSO_CODED_AS"
      )
   df_importer.node_id_property_name("id")
   df_importer.insert()
      #
with import_data.DataFrameImporter(my_graph) as df_importer:   
   df_importer.edges_dataframe(
      df_EntityToVocabularyEdge_S, 
      source_id_space       = "UmlsVocabulary", 
      destination_id_space  = "UmlsEntity",   
      source_column         = "start_id",
      destination_column    = "end_id",
      type                  = "ALSO_CODED_AS"
      )
   df_importer.node_id_property_name("id")
   df_importer.insert()


      ###


display("Number of nodes: %d   Numbers of edges: %d" % ( my_graph.num_nodes(), my_graph.num_edges() ))


display("--")


#  Sample output,
#
#     'Number of nodes: 12065   Numbers of edges: 38910'


#  Step 00:  Checking Results

In [None]:

#  Produce a visual graph

from katana_visualization_widget import GraphVisOptions, NodeVisOption, EdgeVisOption, ANY

l_options = GraphVisOptions(
   node_options = [
      NodeVisOption("PatientVisit",   label="sample_name"),
      NodeVisOption("UmlsEntity",     label="id"         ),
      NodeVisOption("UmlsVocabulary", label="id"         ),
   ],
   #  edge_options = [
   #     EdgeVisOption(["VISIT_CONTAINS"], label="start_id"),
   #     EdgeVisOption(["ALSO_CODED_AS" ], label="start_id"),
   #  ]
   )
    
    
l_result = my_graph.query("""

   MATCH (n) - [r] -> (m)
   RETURN n, r, m
   LIMIT 1000 
   
   """,
   contextualize=True)

l_result.view(graph_vis_options = l_options)


<div> 
<img src="./01_Images/Results_01.png" alt="Drawing" style="width: 800px;"/>
</div>

In [None]:

#  Produce a visual graph

from katana_visualization_widget import GraphVisOptions, NodeVisOption, EdgeVisOption, ANY

l_options = GraphVisOptions(
   node_options = [
      NodeVisOption("PatientVisit",   label="sample_name"),
      NodeVisOption("UmlsEntity",     label="id"         ),
      NodeVisOption("UmlsVocabulary", label="id"         ),
   ],
   #  edge_options = [
   #     EdgeVisOption(["VISIT_CONTAINS"], label="start_id"),
   #     EdgeVisOption(["ALSO_CODED_AS" ], label="start_id"),
   #  ]
   )
    
    
l_result = my_graph.query("""

   MATCH (n: PatientVisit) - [r: VISIT_CONTAINS] - (m: UmlsEntity) - [s: ALSO_CODED_AS] -> (t: UmlsVocabulary)
   WHERE n.id = "4001"
   RETURN n, r, m, s, t
   
   """,
   contextualize=True)

l_result.view(graph_vis_options = l_options)


<div> 
<img src="./01_Images/Results_02.png" alt="Drawing" style="width: 800px;"/>
</div>

In [None]:

#  Checking the edges specifically ..
#

l_result = my_graph.query("""
   MATCH (n) - [r: VISIT_CONTAINS] - (m)
   RETURN COUNT(r)
   """,
   contextualize=False)
      #    
print(l_result)

l_result = my_graph.query("""
   MATCH (n: PatientVisit) - [r: VISIT_CONTAINS] -> (m: UmlsEntity)
   RETURN COUNT(r)
   """,
   contextualize=False)
      #    
print(l_result)

l_result = my_graph.query("""
   MATCH (m: UmlsEntity) - [r: VISIT_CONTAINS] -> (n: PatientVisit)
   RETURN COUNT(r)
   """,
   contextualize=False)
      #    
print(l_result)

#  Sample output,    (I don't think this is correct)
#
#        COUNT(r)
#     0     33916              #  This is 4x  8479
#     
#        COUNT(r)
#     0      8479
#     
#        COUNT(r)
#     0      8479


In [None]:

l_result = my_graph.query("""
   MATCH (m) - [s: ALSO_CODED_AS] - (t)
   RETURN COUNT(s)
   """,
   contextualize=False)
      #
print(l_result)

l_result = my_graph.query("""
   MATCH (m: UmlsEntity) - [s: ALSO_CODED_AS] -> (t: UmlsVocabulary)
   RETURN COUNT(s)
   """,
   contextualize=False)
      #
print(l_result)

l_result = my_graph.query("""
   MATCH (t: UmlsVocabulary) - [s: ALSO_CODED_AS] -> (m: UmlsEntity)
   RETURN COUNT(s)
   """,
   contextualize=False)
      #
print(l_result)

#  Sample output,    (I don't think this is correct)
#
#     0     33916              #  This is 4x  8479
#        COUNT(s)
#     0     43904              #  This is 4x  10976
#     
#        COUNT(s)
#     0     10976
#     
#        COUNT(s)
#     0     10976
