
First, make sure your SSH tunnel to the Katana server is running (it forwards local port 8080):  
GCP: 
  `gcloud compute ssh --zone us-east1-b --project katana-internal31 katana-controller -- -NL 8080:127.0.0.1:8080 -vvv`
<br> <br>
AWS: 
  `ssh -i ~/.ssh/id_rsa -N -L 8080:54.162.203.170:8080 ubuntu@54.162.203.170 -vvv` 
<br><br>  

In [1]:
######## setup variables ##########
# Bucket location of the customer-360 claims files; every input path used by
# the import cells is built from platform_prefix, which is the same location.
gs_prefix = platform_prefix = "gs://anand-31-simple-files/customer-claims/customer-360"
#cluster_name="katana-internal31"
# Partition count handed to graph creation.
partitions = 5

In [2]:
import os                                                                        
import time                                                                      
import json
import pandas as pd
from timeit import default_timer as timer
from katana.remote import aio, analytics, import_data, trace

from katana import remote
from katana.remote import import_data


# Address the Katana client should connect to. host.docker.internal is Docker's
# name for the host machine; port 8080 matches the local end of the SSH tunnel
# described at the top of the notebook.
os.environ["KATANA_SERVER_ADDRESS"] = "host.docker.internal:8080"

# Show every column and never truncate cell contents when displaying frames.
# Full option names are used on purpose: pandas resolves abbreviated names by
# pattern matching, which can break when a similarly named option is added.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Create the remote client (presumably it picks up KATANA_SERVER_ADDRESS from
# the environment — confirm against the Katana client docs).
rc=remote.Client()
# Displaying the server version doubles as a quick connectivity check.
rc.server_version

'0.2.2.dev+qa.3.36.62f045.93916a'

In [4]:

# Create the "customer-claims" graph, split into `partitions` partitions.
graph = rc.create_graph(name="customer-claims", num_partitions=partitions)

In [5]:
# Show the identifier of the graph that was just created.
graph.graph_id

'C8hTX7gRueJhoDFMT8rGtrSJxQCebxdcTKwiWh9GnbnW'

In [6]:
# Node and edge header files (judging by their names) and the directory that
# is passed to the importer as input_dir; all live under platform_prefix.
test_node_file = f"{platform_prefix}/42_NodeHeader.txt"
test_edge_file = f"{platform_prefix}/43_EdgeHeader.txt"
input_path = platform_prefix


In [7]:
# Import the graph from CSV and report the elapsed wall-clock time.
start = timer()
print("Importing graph from CSV files...")
import_data.csv(
    graph,
    input_node_path=test_node_file,
    input_edge_path=test_edge_file,
    input_dir=input_path,
#    have_headers=True,
    # Pipe is used as the delimiter for both the data and the schema files.
    data_delimiter="|",
    schema_delimiter="|"
    #partition_policy="random-oec"
)
end = timer()
print(f"  import: {end - start:.1f} seconds") 

Importing graph from CSV files...


          0/? [?op/s]

  import: 43.4 seconds


In [8]:
%%time
# Count the nodes in the graph to confirm the import actually loaded data.
graph.num_nodes()

          0/? [?op/s]

CPU times: user 365 ms, sys: 129 ms, total: 493 ms
Wall time: 7.51 s


2901419

<b> Import ends here: DATA LOADING COMPLETE </b>

In [51]:
%%time
# Fetch claimant C_8900 together with every Claim it has an outgoing edge to.
query1="""
        MATCH (p:Claimant)-[]->(c:Claim)
        where p.id = 'C_8900'
        RETURN p, c
        """
# NOTE(review): the effect of contextualize=True is not visible here — confirm
# against the Katana query docs.
result = graph.query(query1, contextualize=True)
result.view()

          0/? [?op/s]

          0/? [?op/s]

CPU times: user 3.41 s, sys: 806 ms, total: 4.22 s
Wall time: 28.3 s


VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .result_set_tab_w…

In [24]:
%%time
# Starting at Location AL_0: find providers pointing at it and members pointing
# at those providers, then keep the members whose Policy -> Agent chain leads
# back to the same location.
query1="""
        MATCH (l:Location)<-[]-(p:Provider)<-[]-(m:Mbr)
        where l.id = 'AL_0'
        with l,p,m
        MATCH (m)-[]->(po:Policy)-[]->(a:Agent)-[]->(l:Location)
        RETURN l,p, m, po, a
        """
result = graph.query(query1, contextualize=True)
result.view()

          0/? [?op/s]

          0/? [?op/s]

CPU times: user 4.18 s, sys: 627 ms, total: 4.81 s
Wall time: 38.8 s


VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [17]:
%%time
# Same Location AL_0 / Provider / Mbr / Policy / Agent pattern as the read-only
# query, but this one writes: it creates a POLICY_SOURCED_FOR edge from the
# location to each matching member.
# NOTE(review): CREATE does not de-duplicate in standard Cypher, so re-running
# this cell presumably adds a second set of edges — confirm before re-running.
query1="""
        MATCH (l:Location)<-[]-(p:Provider)<-[]-(m:Mbr)
        where l.id = 'AL_0'
        with l,p,m
        MATCH (m:Mbr)-[]->(po:Policy)-[]->(a:Agent)-[]->(l:Location)
        with m, l
        create (m)<-[:POLICY_SOURCED_FOR]-(l)
        return m
        """
result = graph.query(query1)
result.view()

          0/? [?op/s]

CPU times: user 942 ms, sys: 152 ms, total: 1.09 s
Wall time: 16 s


VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [None]:
%%time
# Location AL_0 with every Mbr and every Provider connected to it by an edge in
# either direction. This cell has no recorded execution.
query1="""
        MATCH (l:Location)-[]-(m:Mbr)
        where l.id = 'AL_0'
        with l,m
        MATCH (l)-[]-(p:Provider)
        RETURN l,p, m
        """
result = graph.query(query1, contextualize=True)
result.view()

## Graph analytics: Louvain clustering and triangle count

In [18]:
from katana.remote.analytics import louvain_clustering
# Run Louvain clustering on the graph; 'lv' is presumably the name of the
# output property — confirm against the Katana analytics docs.
louvain_clustering(graph, 'lv')

          0/? [?op/s]

In [10]:
from katana.remote.analytics import triangle_count
# Run the triangle count on the graph; the result is kept in `tc` and displayed.
tc = triangle_count(graph)
tc

          0/? [?op/s]

Unnamed: 0,num_triangles
0,0


### Duplicate Detection and Tagging

## C_8900 has an abnormally large number of pending claims

In [9]:
%%time
# Find the distinct claimants (p2) that point at a Claim also connected to
# claimant C_8900, ordered by first name. `result` from this cell is what the
# candidate-building cell iterates over.
query1="""
        MATCH (p:Claimant)-[]-(c:Claim)<-[]-(p2:Claimant)
        where p.id = 'C_8900'
        RETURN distinct(p2) order by p2.firstname
        """
result = graph.query(query1)
result.view()

          0/? [?op/s]

CPU times: user 2.42 s, sys: 513 ms, total: 2.94 s
Wall time: 12.7 s


VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .result_set_tab_w…

In [10]:
# One plain dict per returned claimant. Each item from iterrows() is indexed
# with [1] to reach the row, exactly as the columns are named in the query result.
candidates = [
    {
        'firstname': entry[1]['p2.firstname'],
        'lastname': entry[1]['p2.lastname'],
        'id': entry[1]['p2.id'],
    }
    for entry in result.iterrows()
]

In [11]:
# Inspect the candidate list (one dict with firstname / lastname / id per claimant).
candidates

[{'firstname': 'Abigail', 'lastname': 'Williams', 'id': 'C_32300'},
 {'firstname': 'Adam', 'lastname': 'Lamb', 'id': 'C_334800'},
 {'firstname': 'Alexandria', 'lastname': 'Walker', 'id': 'C_12200'},
 {'firstname': 'Amber', 'lastname': 'Robinson', 'id': 'C_216400'},
 {'firstname': 'Amber', 'lastname': 'Robison', 'id': 'C_106500'},
 {'firstname': 'Angela', 'lastname': 'Joseph', 'id': 'C_459800'},
 {'firstname': 'Anita', 'lastname': 'Barber', 'id': 'C_276000'},
 {'firstname': 'Anna', 'lastname': 'Collins', 'id': 'C_319700'},
 {'firstname': 'Ashley', 'lastname': 'Hayden', 'id': 'C_476000'},
 {'firstname': 'Barbara', 'lastname': 'Adkins', 'id': 'C_108500'},
 {'firstname': 'Bonnie', 'lastname': 'Palmer', 'id': 'C_77900'},
 {'firstname': 'Brandon', 'lastname': 'Holden', 'id': 'C_435400'},
 {'firstname': 'Brittany', 'lastname': 'Sharp', 'id': 'C_465900'},
 {'firstname': 'Catherine', 'lastname': 'White', 'id': 'C_346100'},
 {'firstname': 'Cheryl', 'lastname': 'Lowery', 'id': 'C_336800'},
 {'fir

In [12]:
import difflib
# Sanity-check the similarity measure on two names that differ by one letter.
par1, par2 = 'Daniel Williams', 'Daniel William'
score = difflib.SequenceMatcher(isjunk=None, a=par1, b=par2).ratio()
score

0.9655172413793104

In [13]:
def write_duplicates(graph, candidates, stride=5, threshold=0.8,
                     out_file_name='/tmp/dups-1.csv'):
    """Find likely duplicate candidates by name similarity and write them to a CSV.

    The work is split by host: the numeric suffix of the hostname (after
    removing ``'katana-compute-'``) selects the first source position, and the
    host then handles every ``stride``-th candidate from there. Each source is
    compared against every other candidate (all positions, including the last
    one) using ``difflib.SequenceMatcher`` on firstname + lastname concatenated
    without a separator.

    Args:
        graph: Object providing ``num_master_nodes()`` and
            ``num_global_nodes()``; only used for a diagnostic print.
        candidates: Sequence of dicts with ``'firstname'``, ``'lastname'`` and
            ``'id'`` keys.
        stride: Step between source positions handled by one host. Should
            equal the number of hosts running this function (the default 5 is
            the previously hard-coded value — presumably the partition count;
            confirm).
        threshold: A pair is reported when its similarity ratio is strictly
            greater than this value.
        out_file_name: Path of the pipe-delimited CSV to write. The file is
            always written, with just the header when nothing was found.

    Returns:
        None. Results are printed and written to ``out_file_name``.

    Raises:
        ValueError: If the hostname does not reduce to an integer once the
            ``'katana-compute-'`` prefix is removed, or if ``stride`` is 0.
    """
    # Imports are kept local so the function carries its own dependencies when
    # handed to graph.run (presumably it is shipped to the workers — confirm).
    import csv
    import difflib
    import socket

    host_name = socket.gethostname()
    print("Finding duplicates")
    print(host_name)
    print(graph.num_master_nodes(), graph.num_global_nodes())

    host_index = int(host_name.replace('katana-compute-', ''))
    # Modulo makes the split cover every position whether hosts are numbered
    # 0..stride-1 or 1..stride; with the raw index, a 1-based numbering would
    # never use position 0 as a source.
    first_source = host_index % stride
    last_index = len(candidates) - 1
    print('Last position of candidates to compare', last_index)

    duplicates = []
    last_compared = None
    # range() stops at len(candidates), so the final candidate is included both
    # as a source here and as a target in the inner loop.
    for pos in range(first_source, len(candidates), stride):
        source = candidates[pos]
        source_name = source['firstname'] + source['lastname']
        source_id = source['id']
        for target in candidates:
            target_id = target['id']
            if source_id == target_id:
                # Never compare a candidate with itself.
                continue
            target_name = target['firstname'] + target['lastname']
            score = difflib.SequenceMatcher(None, source_name, target_name).ratio()
            if score > threshold:
                print('duplicates', source_id, target_id)
                duplicates.append({'idOfSource': source_id, 'idOfTarget': target_id})
        last_compared = pos
    print('processing complete for node', host_name)
    # None means this host had no source positions to process.
    print('last position compared', last_compared)

    fields = ['idOfSource', 'idOfTarget']
    # newline='' is what the csv module documents for files passed to a writer.
    with open(out_file_name, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, delimiter='|')
        writer.writeheader()
        writer.writerows(duplicates)
    print('finished writing duplicates')
    
#graph.run(lambda g: dist_count(g, 'argument1', 'argument2', 'argument3'))
#candidates = ['argument1', 'argument2', 'argument3','argument4', 'argument5']
# Run write_duplicates through graph.run. The function chooses its share of
# `candidates` from the hostname it executes on and writes its CSV under /tmp
# on that machine, so the files are not available to this notebook directly.
graph.run(lambda g: write_duplicates(g, candidates))


          0/? [?op/s]

Finding duplicates
katana-compute-1
578842 2901419
Last position of candidates to compare 76
duplicates C_312900 C_84200
processing complete for node katana-compute-1
last position compared 76
finished writing duplicates


In [15]:
# [source id, target id] pairs entered by hand; they are not read back from
# the CSV files written by write_duplicates.
duplicates = [
    ['C_312900', 'C_84200'],
    ['C_236100', 'C_16800'],
    ['C_106500', 'C_216400'],
]
len(duplicates)

3

In [16]:
%%time
for duplicate in duplicates:
    query1="""
        MATCH (p1:Claimant),(p2:Claimant)
        where p1.id = '_d1_'
          and p2.id = '_d2_'
        with p1, p2
        create (p1)-[:isDuplicatedBy]->(p2)
        RETURN p1, p2
        """
    query1 = query1.replace('_d1_', duplicate[0])
    query1 = query1.replace('_d2_', duplicate[1])
    print(query1)
    result = graph.query(query1)



        MATCH (p1:Claimant),(p2:Claimant)
        where p1.id = 'C_312900'
          and p2.id = 'C_84200'
        with p1, p2
        create (p1)-[:isDuplicatedBy]->(p2)
        RETURN p1, p2
        


          0/? [?op/s]


        MATCH (p1:Claimant),(p2:Claimant)
        where p1.id = 'C_236100'
          and p2.id = 'C_16800'
        with p1, p2
        create (p1)-[:isDuplicatedBy]->(p2)
        RETURN p1, p2
        


          0/? [?op/s]


        MATCH (p1:Claimant),(p2:Claimant)
        where p1.id = 'C_106500'
          and p2.id = 'C_216400'
        with p1, p2
        create (p1)-[:isDuplicatedBy]->(p2)
        RETURN p1, p2
        


          0/? [?op/s]

CPU times: user 8.18 s, sys: 2.45 s, total: 10.6 s
Wall time: 59 s


In [17]:
%%time
# List every pair of claimants linked by an isDuplicatedBy edge, to check the
# edges created by the tagging loop.
query1="""
        MATCH (p1:Claimant)-[:isDuplicatedBy]->(p2:Claimant)
        RETURN p1, p2
        """
result = graph.query(query1, contextualize=True)
result.view()

          0/? [?op/s]

          0/? [?op/s]

CPU times: user 3.29 s, sys: 1.03 s, total: 4.31 s
Wall time: 35.7 s


VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .result_set_tab_w…