#  Setup: Display options

In [None]:

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate

print("--")


# Setup: Connectivity

In [None]:

from katana import remote

my_client = remote.Client()

print(my_client)


In [None]:

NUM_PARTITIONS  = 3
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


In [None]:

#  DELETE ALL GRAPHS

for l_database in my_client.databases():
   for l_graph in my_client.get_database(name=l_database.name).graphs_in_database():
      l_handle=my_client.get_database(name=l_database.name).get_graph_by_id(id=l_graph.graph_id)
      l_handle.delete()

for l_graph in my_client.graphs():
   print("GRAPH ID: ", l_graph.graph_id, "      GRAPH Version: ", l_graph.version)

print("--")


In [None]:

#  DELETE ALL DATABASES

for l_database in my_client.databases():
   if (l_database.name != "default"):
      my_client.get_database(name=l_database.name).delete_database()
      print("--")

for l_database in my_client.databases():
   print("DB ID: ", l_database.database_id, "     DB Name: ", l_database.name)


In [None]:

#  CREATE DATABASE

my_database = my_client.create_database(name=DB_NAME)

print(my_database.database_id)


In [None]:

#  CREATE GRAPH
#
my_graph = my_client.get_database(name=DB_NAME).create_graph(name=GRAPH_NAME, num_partitions=NUM_PARTITIONS)

print(my_graph)


# Step 01:  Process CSV files ..

In [43]:

#  This data was sourced from,
#     https://mtsamples.com/site/pages/browse.asp?type=96-Hematology%20-%20Oncology
#

#  Column headers,
#     id|medical_specialty|sample_name|description|body
#
df_data      = pd.read_csv("./10_Data/22_OncologyCallNotes.txt", header = "infer", delimiter = "|")


#  90 Real lines of data.  (Some data is multi-line, and enclosed in double quotes.)
#
print("Number of CSV input lines: %d" % (len(df_data)))
print("")


#   Above, the field titled "body" contains embedded new lines; remove those
#   and place in a new field titled, "body_1line"
#
df_data["body_1line"] = df_data.body.map(lambda x: str(x).replace("\n", "") )


l_cntr = 0
   #
for l_each in df_data.itertuples():
   l_cntr += 1
      #
   if (l_cntr < 5):
      print("Record number: %d   Sample name: %-44s   Short text: %s" % (l_each.id, l_each.sample_name[0:43], l_each.body_1line[0:60]))
        
        
print("")
print("--")

#  Sample output,
#
#     Number of CSV input lines: 90
#     
#     Record number: 4001   Sample name: 3-Dimensional Simulation                       Short text: This patient is undergoing 3-dimensionally planned radiation
#     Record number: 4002   Sample name: Adrenalectomy & Umbilical Hernia Repair        Short text: PREOPERATIVE DIAGNOSES1. Adrenal mass, right sided.2. Umbi
#     Record number: 4003   Sample name: Anaplastic Astrocytoma - Letter                Short text: Month DD, YYYYXYZRE: ABCMEDICAL RECORD#: 123Dear Dr. 
#     Record number: 4004   Sample name: Anemia & Leukemia Followup                     Short text: CHIEF COMPLAINT:1. Chronic lymphocytic leukemia (CLL).2. A


Number of CSV input lines: 90

Record number: 4001   Sample name: 3-Dimensional Simulation                       Short text: This patient is undergoing 3-dimensionally planned radiation
Record number: 4002   Sample name: Adrenalectomy & Umbilical Hernia Repair        Short text: PREOPERATIVE DIAGNOSES1. Adrenal mass, right sided.2. Umbili
Record number: 4003   Sample name: Anaplastic Astrocytoma - Letter                Short text: Month DD, YYYYXYZRE: ABCMEDICAL RECORD#: 123Dear Dr. XYZ:I s
Record number: 4004   Sample name: Anemia & Leukemia Followup                     Short text: CHIEF COMPLAINT:1. Chronic lymphocytic leukemia (CLL).2. Aut

--


In [42]:


import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./10_Data/05_katana-clusters-beta-d8605ac248e7.json"


   ###


import requests
from requests.structures import CaseInsensitiveDict

url = "https://healthcare.googleapis.com/v1/projects/katana-clusters-beta/locations/us-central1/services/nlp:analyzeEntities"

headers = CaseInsensitiveDict()
headers["Authorization"] = "Bearer " +
headers["Content-Type"] = "application/json"

data = """
{
    'nlpService':'projects/katana-clusters-beta/locations/us-central1/services/nlp',
    'documentContent':'${l_each}'
   }
"""


resp = requests.post(url, headers=headers, data=data)

print(resp.status_code)





--


In [None]:


curl -X POST \
   -H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
   -H "Content-Type:application/json" \
   --data "{
    'nlpService':'projects/katana-clusters-beta/locations/us-central1/services/nlp',
    'documentContent':'${l_each}'
   }" \
   "https://healthcare.googleapis.com/v1/projects/katana-clusters-beta/locations/us-central1/services/nlp:analyzeEntities"





In [None]:

#  In a Web browser
#     
#     https://console.cloud.google.com/storage/browser/a9187b90-cluster/kind-mullet/Paypal_20M_1B/
#

from google.cloud import storage


l_client      = storage.Client()

l_bucket_str  = "a9187b90-cluster"
   #
l_bucket      = l_client.bucket(l_bucket_str)


#  Just nodes
#
l_nodes = []
   #
for l_each in l_bucket.list_blobs(prefix = "kind-mullet/Paypal_20M_1B/N"):
   print(l_each.name)
      #
   l_nodes.append("gs://" + l_bucket_str + "/" + l_each.name)

print(l_nodes)


print("--")


#  Just edges
#
l_edges       = []
   #
l_cntr        = 0                                   #  We'll use these next two lines for testing
l_edges_small = []
   #
for l_each in l_bucket.list_blobs(prefix = "kind-mullet/Paypal_20M_1B/f"):
   print(l_each.name)
      #
   l_edges.append("gs://" + l_bucket_str + "/" + l_each.name)
      #
   l_cntr += 1
   if (l_cntr < 3):
      l_edges_small.append("gs://" + l_bucket_str + "/" + l_each.name)

print(l_edges)
print(l_edges_small)


print("--")

#  Sample output,
#
#     kind-mullet/Paypal_20M_1B/NodeFile_0
#     ['gs://a9187b90-cluster/kind-mullet/Paypal_20M_1B/NodeFile_0']
#     --
#     kind-mullet/Paypal_20M_1B/fileAssetA_0
#     kind-mullet/Paypal_20M_1B/fileAssetA_1
#     kind-mullet/Paypal_20M_1B/fileAssetA_2
#     kind-mullet/Paypal_20M_1B/fileAssetA_3
#     kind-mullet/Paypal_20M_1B/fileAssetA_4
#     kind-mullet/Paypal_20M_1B/fileAssetA_5
#     kind-mullet/Paypal_20M_1B/fileAssetB_0
#     kind-mullet/Paypal_20M_1B/fileAssetB_1
#     kind-mullet/Paypal_20M_1B/fileAssetB_2
#     kind-mullet/Paypal_20M_1B/fileAssetB_3
#     kind-mullet/Paypal_20M_1B/fileFriendA_0
#     kind-mullet/Paypal_20M_1B/fileFriendB_0
#     kind-mullet/Paypal_20M_1B/fileFriendB_1
#     ['gs://a9187b90-cluster/kind-mullet/Paypal_20M_1B/fileAssetA_0', 'gs://a9187b90-cluster/kind-mullet/Paypal_20M_1B/fileAssetA_1', 'gs://a9187b90-cluster/kind-mullet/P  ...
#     ['gs://a9187b90-cluster/kind-mullet/Paypal_20M_1B/fileAssetA_0', 'gs://a9187b90-cluster/kind-mullet/Paypal_20M_1B/fileAssetA_1']
#     --


#  Step 02: Read CSVs into Dask DataFrame

In [None]:

import dask.dataframe as dd
import numpy as np

print("--")


In [None]:
%%time

#  Just nodes

#  Sample data,
#
#     Sent|counterparty|cntAppr|cntDecl|cntRiskDecl1|amtRiskDecl|cntRiskDecl2|sumRcvd|cntRcvd|cntDist|cntDecl2
#     0|N|N|N|N|XbRqfTcGJL9fxmNp4simTykwKr|qhBymw13PneK5hpntNVwjNwaYklX8P379JZo|Y|416|0|inactive|1941730454454|0|2018-01-24T01:09:18|9.07632e+09|1419493889|1894897856|2676929303.1430|6614868331.8687|8283173087.0037|6020979256.1980|1984359695|5077917542.6596|1923699935|1948138778|4876422147.1566|

df_nodes = dd.read_csv(
   l_nodes,
   delimiter = "|",
   dtype = {
      "Sent"          : np.dtype("O"),                         #  Change these if you wish,  Numpy datatypes
      "counterparty"  : np.dtype("O"),
      "cntAppr"       : np.dtype("O"),
      "cntDecl"       : np.dtype("O"),
      "cntRiskDecl1"  : np.dtype("O"),
      "amtRiskDecl"   : np.dtype("O"),
      "cntRiskDecl2"  : np.dtype("O"),
      "sumRcvd"       : np.dtype("O"),
      "cntRcvd"       : np.dtype("O"),
      "cntDist"       : np.dtype("O"),
      "cntDecl2"      : np.dtype("O"),
      })

print(len(df_nodes))

print("--")

#  Sample output,
#
#     20000000
#     --
#     CPU times: user 3min, sys: 18 s, total: 3min 18s
#     Wall time: 2min 48s


In [None]:
%%time

#  Just edges

#  Sample data,
#
#     Account1.id|Account2.id|type|date
#     16089173|16089175|FriendConB|2018-10-10T11:20:05

df_edges = dd.read_csv(
    
   #  l_edges,                                                   #  This would be all files, at once, in memory
   l_edges_small,
    
   delimiter = "|",
   dtype = {
      "Account1.id"   : np.dtype("O"),
      "Account2.id"   : np.dtype("O"),
      "type"          : np.dtype("O"),                           #  These next two column names are keywords in most worlds -- bad idea
      "date"          : np.dtype("O"),
      })

print(len(df_edges))

print("--")

#  Sample output,
#
#     200000000
#     --
#     CPU times: user 4min 25s, sys: 31.2 s, total: 4min 56s
#     Wall time: 4min 23s
