### Setup and import graph

In [None]:
######## setup variables ##########
# Location prefix of the customer-360 input files; the node/edge header paths
# used by the CSV import are built from it (via platform_prefix).
gs_prefix="gs://anand-31-simple-files/customer-claims/customer-360"
#cluster_name="katana-internal31"
# Passed as num_partitions when the graph is created.
# NOTE(review): write_duplicates also strides through candidates by 5, which
# appears to mirror this value -- keep the two in sync if this changes.
partitions = 5
platform_prefix=gs_prefix

In [None]:
import os                                                                        
import time                                                                      
import json
import pandas as pd
from timeit import default_timer as timer
from katana.remote import aio, analytics, import_data, trace

from katana import remote
from katana.remote import import_data

from katana.remote.analytics import connected_components, louvain_clustering

# Server address for the import section; must be set before remote.Client()
# is constructed in the next cell (presumably read from the environment).
os.environ["KATANA_SERVER_ADDRESS"] = "host.docker.internal:8080"

# Show every column and full cell contents when result frames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)

In [None]:
# Client used by the import section; displaying server_version doubles as a
# quick connectivity check.
rc=remote.Client()
rc.server_version

In [None]:

# Create the graph that the CSV import below loads into.
# NOTE(review): re-running this cell presumably creates another graph with
# the same name -- confirm how the server treats duplicate names.
graph = rc.create_graph(
    num_partitions=partitions,
    name="customer-claims"
)

In [None]:
# Display the id of the current graph handle.
graph.graph_id

In [None]:
# Header files passed to the CSV import as input_node_path / input_edge_path,
# and the directory passed as input_dir; all live under platform_prefix.
test_node_file = platform_prefix + "/42_NodeHeader.txt"
test_edge_file = platform_prefix + "/43_EdgeHeader.txt"
input_path=platform_prefix 


In [None]:
# Import the pipe-delimited CSV data into `graph` and report the wall-clock
# time measured with timeit.default_timer.
start = timer()
print("Importing graph from CSV files...")
import_data.csv(
    graph,
    input_node_path=test_node_file,
    input_edge_path=test_edge_file,
    input_dir=input_path,
#    have_headers=True,
    # Both the data rows and the header/schema files use "|" as separator.
    data_delimiter="|",
    schema_delimiter="|"
    #partition_policy="random-oec"
)
end = timer()
print(f"  import: {end - start:.1f} seconds") 

In [None]:
%%time
# Node count of the graph after import (timed).
graph.num_nodes()

In [None]:
# Edge count of the graph after import.
graph.num_edges()

### Open graph

In [None]:
# Alternative entry point: attach to an already-imported graph instead of
# running the import section.
import os

from katana import remote

# NOTE(review): this overrides the address set in the setup section
# ("host.docker.internal:8080"); confirm which one applies to your environment.
os.environ["KATANA_SERVER_ADDRESS"] = "localhost:8080"

my_client = remote.Client()

print(my_client)

In [None]:
# Names used by the next cell to locate the graph; these look like
# placeholders -- replace with the real database and graph names.
DB_NAME    = "my_db"
GRAPH_NAME = "my_graph"

# Visual separator only.
print("--")

In [None]:
#  CONNECT TO GRAPH

# Resolve the database handle once; the original looked it up again on every
# loop iteration and a further time to fetch the matching graph.
database = my_client.get_database(name=DB_NAME)

# Bind `graph` to the first graph in the database whose name is GRAPH_NAME.
for l_graph in database.graphs_in_database():
    if l_graph.name == GRAPH_NAME:
        graph = database.get_graph_by_id(id=l_graph.graph_id)
        break
else:
    # Without this, a missing graph went unnoticed: `graph` silently kept
    # whatever an earlier cell had bound (or raised a bare NameError below),
    # and every later query would run against the wrong graph.
    raise LookupError(f"No graph named {GRAPH_NAME!r} found in database {DB_NAME!r}")

# Alternative lookup kept from the original notebook (not exercised here):
# my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(graph)

### Investigate the neighbourhood of a claimant with high payouts

In [None]:

# Fetch every node labelled `claimant` and render the result.
# NOTE(review): this section uses lower-case labels (claimant, claim, payment)
# while later sections use Claimant / Claim. Labels are normally
# case-sensitive in Cypher -- confirm which spelling matches the imported schema.
query1="""
        MATCH (p:claimant)   
        
        RETURN p
      
        """

result = graph.query(query1, contextualize=True)

result.view()






In [None]:
%%time
# Rank claimants by the number of payment nodes they are connected to
# (any edge type, either direction), highest count first.
query1="""
        MATCH (p:claimant)-[]-(py:payment)
        RETURN count(py) as count, p.id order by count desc
        """
result = graph.query(query1, contextualize=True)
result.view()

<b> Let's have a look at the claimant neighborhood </b>

In [None]:
%%time
# Payments with an edge pointing at claimant C_8900 (any edge type).
query1="""
        MATCH (p:claimant)<-[]-(py:payment)
        where p.id = 'C_8900'
        RETURN p, py
        """
result = graph.query(query1, contextualize=True)
result.view()

In [None]:
%%time
# Everyone (p2, any label) with a MADE edge into a claim that C_8900 also MADE.
query1="""
        MATCH (p:claimant)-[:MADE]->(c:claim)<-[:MADE]-(p2)
        where p.id = 'C_8900'
        RETURN p, c, p2
        """
result = graph.query(query1, contextualize=True)
result.view()

### Enriching the Topology: Duplicate Detection and Linking
We pick the claimants connected to C_8900 through a shared claim and look for duplicates among them.

In [None]:
%%time
# Distinct claimants that share a claim with C_8900, ordered by first name.
# The next cell iterates over this `result` (p2.firstname, p2.lastname, p2.id,
# p2.address), so it must be run immediately before it.
query1="""
        MATCH (p:claimant)-[]-(c:claim)<-[]-(p2:claimant)
        where p.id = 'C_8900'
        RETURN distinct(p2) order by p2.firstname
        """
result = graph.query(query1)
result.view()

In [None]:
# Flatten each returned claimant row into a plain dict holding only the
# fields the duplicate finder compares.
candidates = [
    {
        'firstname': row['p2.firstname'],
        'lastname': row['p2.lastname'],
        'id': row['p2.id'],
        'address': row['p2.address'],
    }
    for _, row in result.iterrows()
]

In [None]:
# Number of candidate claimants that will be compared pairwise.
len(candidates)

In [None]:
import difflib

# Sanity check of the similarity measure on two nearly identical names;
# the duplicate finder flags pairs whose ratio exceeds 0.8.
par1, par2 = 'Daniel Williams', 'Daniel William'
matcher = difflib.SequenceMatcher(None, par1, par2)
score = matcher.ratio()
score

In [None]:
def write_duplicates(graph, candidates, stride=5, out_file='/tmp/dups-1.csv'):
    """Find likely duplicates in this worker's share of ``candidates`` and write them to CSV.

    The worker's starting index is parsed from its hostname, which must look
    like ``katana-compute-<N>``; it then handles candidates N, N + stride,
    N + 2 * stride, ... as comparison sources. Each source is compared against
    every other candidate, and a pair is reported when both the concatenated
    first+last name and the address have a ``difflib.SequenceMatcher`` ratio
    above 0.8.

    Args:
        graph: Graph handle supplied by ``graph.run``; not used by this function.
        candidates: List of dicts with string values under the keys
            ``'firstname'``, ``'lastname'``, ``'id'`` and ``'address'``.
        stride: Step between the source indices handled by one worker.
            Defaults to 5, the value that was previously hard-coded
            (NOTE(review): presumably the number of compute hosts -- confirm).
        out_file: Path of the pipe-delimited CSV written on the worker.

    Raises:
        ValueError: If ``stride`` is smaller than 1, or if the hostname does
            not end in an integer after removing ``katana-compute-``.
    """
    import csv
    import difflib
    import socket

    if stride < 1:
        # A non-positive stride would never advance `pos` and loop forever.
        raise ValueError('stride must be a positive integer')

    print("Finding duplicates")
    hostname = socket.gethostname()
    print(hostname)

    pos = int(hostname.replace('katana-compute-',''))
    total = len(candidates)
    print('Last position of candidates to compare', total - 1)

    duplicates = []
    # Bug fix: the loops used to stop at `len(candidates) - 1`, so the last
    # candidate was never considered, neither as source nor as target.
    while pos < total:
        source = candidates[pos]
        nameOfSource = source['firstname'] + source['lastname']
        addressOfSource = source['address']
        idOfSource = source['id']
        for target in candidates:
            idOfTarget = target['id']
            if idOfSource == idOfTarget:
                # Never compare a claimant with itself.
                continue
            nameOfTarget = target['firstname'] + target['lastname']
            addressOfTarget = target['address']
            score = difflib.SequenceMatcher(None, nameOfSource, nameOfTarget).ratio()
            score_address = difflib.SequenceMatcher(None, addressOfSource, addressOfTarget).ratio()
            if score > 0.8 and score_address > 0.8:
                print('duplicates', idOfSource, idOfTarget)
                print('addressand score', addressOfSource, addressOfTarget, score_address)
                duplicates.append({'idOfSource': idOfSource, 'idOfTarget': idOfTarget})
        pos = pos + stride
    print('processing complete for node', hostname)
    print('last position compared', pos)

    fields = ['idOfSource', 'idOfTarget']
    # newline='' is what the csv module expects for files it writes; without
    # it, extra blank lines can appear on platforms that translate newlines.
    with open(out_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, delimiter='|')
        writer.writeheader()
        writer.writerows(duplicates)
    print('finished writing duplicates')
    
#graph.run(lambda g: dist_count(g, 'argument1', 'argument2', 'argument3'))
#candidates = ['argument1', 'argument2', 'argument3','argument4', 'argument5']
# Ship write_duplicates to the cluster via graph.run; the lambda closes over
# the notebook's `candidates` list.
# NOTE(review): presumably executed once per compute host, since the function
# derives its starting index from the hostname -- confirm against graph.run docs.
graph.run(lambda g: write_duplicates(g, candidates))


In [None]:
# Duplicate pairs as [source id, target id], entered by hand.
duplicates = [
    ['C_312900', 'C_84200'],
    ['C_236100', 'C_16800'],
    ['C_106500', 'C_216400'],
]
len(duplicates)

In [None]:
# Template that links three hand-picked claimant pairs with isDuplicatedBy
# edges; the _dN_ placeholders are filled in below.
query1="""
        MATCH (p1:Claimant),(p2:Claimant),(p3:Claimant),(p4:Claimant),(p5:Claimant),(p6:Claimant)
        where p1.id = '_d1_'
          and p2.id = '_d2_'
          and p3.id = '_d3_'
          and p4.id = '_d4_'
          and p5.id = '_d5_'
          and p6.id = '_d6_'
        with p1, p2, p3, p4, p5, p6
        create (p1)-[:isDuplicatedBy]->(p2),
               (p3)-[:isDuplicatedBy]->(p4),
               (p5)-[:isDuplicatedBy]->(p6)
        RETURN p1, p2, p3, p4, p5, p6
        """

# Placeholder -> claimant id; odd placeholders are edge sources, the
# following even ones their targets.
placeholder_ids = {
    '_d1_': 'C_312900',
    '_d2_': 'C_84200',
    '_d3_': 'C_236100',
    '_d4_': 'C_16800',
    '_d5_': 'C_106500',
    '_d6_': 'C_216400',
}
for placeholder, claimant_id in placeholder_ids.items():
    query1 = query1.replace(placeholder, claimant_id)

print(query1)
result = graph.query(query1)

In [None]:
%%time
# List the claimant pairs linked by an isDuplicatedBy edge (source -> target).
query1="""
        MATCH (p1:Claimant)-[:isDuplicatedBy]->(p2:Claimant)
        RETURN p1, p2
        """
result = graph.query(query1, contextualize=True)
result.view()

## Revisit the neighborhood and check for fraudulent patterns

In [None]:
%%time
# Parties on C_8900's claims whose own MADE edge has ROLE = 'witness'.
# NOTE(review): the first MATCH binds `e` but never uses it; the second MATCH
# repeats the same pattern only to filter on e1.ROLE.
query1="""
        MATCH (p:Claimant)-[:MADE]->(c:Claim)<-[e:MADE]-(p2)
        where p.id = 'C_8900'
        with p, c, p2
        MATCH (p:Claimant)-[:MADE]->(c:Claim)<-[e1:MADE]-(p2)
        where e1.ROLE = 'witness'
        RETURN p, c, p2
        """
result = graph.query(query1, contextualize=True)
result.view()

## Project the claimant-claimant relationship, and then revisit the neighborhood of C_8900

In [None]:
%%time
# For each ordered pair of claimants connected through a claim, create a
# common_claim edge whose `count` property is the number of shared claims.
# NOTE(review): this cell writes to the graph and is not idempotent --
# re-running it presumably adds a second set of common_claim edges.
# NOTE(review): the query has no RETURN clause, so the view is likely empty.
query1="""
        MATCH (p1:Claimant)-[]-(c:Claim)-[]-(p2:Claimant)
        with p2,p1, count(c) as common_claim_count 
        create (p1)-[:common_claim{count: common_claim_count}]->(p2)
        """
result = graph.query(query1)
result.view()

## PageRank on whole graph

In [None]:
from katana.remote.analytics import pagerank
# Run PageRank over the whole graph; 'pg2' is the property name the next
# cell reads back as p.pg2.
pagerank(graph, 'pg2')

In [None]:
%%time
# Top 5 nodes (any label) by their pg2 value.
query1="""
        MATCH (p)
        where exists(p.pg2)
        RETURN p order by p.pg2 desc  limit 5
        """
result = graph.query(query1)
result.view()

In [None]:
%%time
# Look up attorney X_4853.
# NOTE(review): presumably picked from the top-ranked nodes of the previous
# cell -- the id is hard-coded, so confirm it still applies after a re-run.
query1="""
        MATCH (att:Attorney)
        where att.id = 'X_4853'
        RETURN att
        """
result = graph.query(query1)
result.view()

In [None]:
%%time
#Get suspicious claims
query1="""
        MATCH (p1:Claimant)-[:isDuplicatedBy]-(p2:Claimant)-[MADE]-(cl:Claim)-[:REPRESENTS]->(att:Attorney)
        where att.id = 'X_4853'
        RETURN p1,p2,cl,att
        """
result = graph.query(query1, contextualize=True)
result.view()

## PageRank on projected graph

In [None]:
# Project the graph down to Claimant nodes joined by common_claim and
# isDuplicatedBy edges, then run PageRank on that projection into 'pgp'.
gp4 = graph.project(node_types = ['Claimant'], edge_types = ['common_claim', 'isDuplicatedBy'])
from katana.remote.analytics import pagerank
pagerank(gp4, 'pgp')

In [None]:
%%time
# From the 20 claimants with the highest pgp value, keep those adjacent to a
# claimant that has an isDuplicatedBy edge, and show the claims they MADE.
query1="""
        MATCH (p:Claimant)
        where exists(p.pgp)
        with p order by p.pgp desc  limit 20
        Match(p:Claimant)-[]-(p1:Claimant)-[:isDuplicatedBy]-(p2:Claimant)
        with distinct(p)
        match (p:Claimant)-[:MADE]->(q:Claim)
        return p,q 
        """
result = graph.query(query1, contextualize=True)
result.view()

In [None]:
%%time
# Same selection as the previous cell (top 20 by pgp, adjacent to a claimant
# with an isDuplicatedBy edge), but show their outgoing common_claim
# neighbours instead of their claims.
query1="""
        MATCH (p:Claimant)
        where exists(p.pgp)
        with p order by p.pgp desc  limit 20
        Match(p:Claimant)-[]-(p1:Claimant)-[:isDuplicatedBy]-(p2:Claimant)
        with distinct(p)
        match (p:Claimant)-[:common_claim]->(q:Claimant)
        return p,q 
        """
result = graph.query(query1, contextualize=True)
result.view()

## Appendix

In [None]:
from katana.remote.analytics import connected_components
# Connected components over the whole graph; 'cc1' is the property name the
# following cells read back as p1.cc1.
connected_components(graph, 'cc1')

In [None]:
%%time
# Sample 5 claimants that carry a cc1 value.
query1="""
        MATCH (p1:Claimant)
        where exists(p1.cc1)
        RETURN p1 limit 5
        """
result = graph.query(query1, contextualize=True)
result.view()

In [None]:
%%time
# Claimant count per cc1 value, ten largest first.
query1="""
        MATCH (p1:Claimant)
        where exists(p1.cc1)
        RETURN count(p1) as size, p1.cc1 order by size desc limit 10
        """
result = graph.query(query1, contextualize=True)
result.view()

In [None]:
%%time
# Claimants whose cc1 value is 169547.
# NOTE(review): the component id is hard-coded, presumably copied from the
# size listing of an earlier run -- it may differ after recomputing cc1.
query1="""
        MATCH (p1:Claimant)
        where p1.cc1 = 169547
        RETURN p1
        """
result = graph.query(query1, contextualize=True)
result.view()

In [None]:
# Claimant-only projection (common_claim + isDuplicatedBy edges) used for the
# connected-components run in the next cell.
gp3 = graph.project(node_types = ['Claimant'], edge_types = ['common_claim', 'isDuplicatedBy'])

In [None]:
# Connected components on the projection; results are read back as p1.cc3.
connected_components(gp3, 'cc3')

In [None]:
%%time
# Claimant count per cc3 value, ten largest first; both columns are cast
# with tointeger() before being returned.
query1="""
        MATCH (p1:Claimant)
        where exists(p1.cc3)
        RETURN tointeger(count(p1)) as size, tointeger(p1.cc3) order by size desc limit 10
        """
result = graph.query(query1, contextualize=True)
result.view()

In [None]:
%%time
#Get suspicious claims
query1="""
        MATCH (p1:Claimant)
        where p1.cc2 = 152801
        RETURN p1
        """
result = graph.query(query1, contextualize=True)
result.view()

In [None]:
# Rebuilds the Claimant projection (rebinding gp4) and runs PageRank on it.
# NOTE(review): the output property 'pg2' is the same name used for the
# whole-graph PageRank earlier; whether this overwrites those values depends
# on how projections store properties -- confirm before comparing results.
gp4 = graph.project(node_types = ['Claimant'], edge_types = ['common_claim', 'isDuplicatedBy'])
from katana.remote.analytics import pagerank
pagerank(gp4, 'pg2')

In [None]:
%%time
# From the 12 claimants with the highest pg2 value, keep those adjacent to a
# claimant that has an isDuplicatedBy edge, and show their outgoing
# common_claim neighbours.
query1="""
        MATCH (p:Claimant)
        where exists(p.pg2)
        with p order by p.pg2 desc  limit 12
        Match(p:Claimant)-[]-(p1:Claimant)-[:isDuplicatedBy]-(p2:Claimant)
        with distinct(p)
        match (p:Claimant)-[:common_claim]->(q:Claimant)
        return p, q
        """
result = graph.query(query1, contextualize=True)
result.view()

### Louvain

In [None]:
# Claimant-only projection (common_claim + isDuplicatedBy edges) used for the
# Louvain run in this section.
gp2 = graph.project(node_types = ['Claimant'], edge_types = ['common_claim', 'isDuplicatedBy'])

In [None]:
%%time
# Look up claimant C_8900 through the projection (gp2) rather than the full graph.
query1="""
        MATCH (p:Claimant)
        where p.id = 'C_8900'
        RETURN p
        """
result = gp2.query(query1, contextualize=True)
result.view()

In [None]:
# Louvain clustering on the Claimant projection, output property 'lv3'.
# NOTE(review): is_symmetric=True presumably asserts the projection can be
# treated as undirected -- confirm against the analytics documentation.
louvain_clustering(gp2,'lv3',  is_symmetric=True)

In [None]:
from katana.remote.analytics import connected_components
# Connected components on this section's Claimant projection; results are
# read back below as p1.cc2.
# Fix: the call referenced `gp`, a name that is never defined in this
# notebook and so fails with NameError on a fresh kernel.
# NOTE(review): gp2 is used because it is the projection built in this
# section -- confirm that is the intended input.
connected_components(gp2, 'cc2')

In [None]:
%%time
# Look up claimant C_8900 on the full graph (plain result, no contextualize).
query1="""
        MATCH (p:Claimant)
        where p.id = 'C_8900'
        RETURN p
        """
result = graph.query(query1)
result.view()

In [None]:
%%time
# All adjacent node pairs (any label, any edge) where p1's cc2 value is 0.
query1="""
        MATCH (p1)-[]-(p2)
        where tointeger(p1.cc2) = 0
        return p1, p2
        """
result = graph.query(query1, contextualize = True)
result.view()

In [None]:

%%time
# Dump the graph schema via the graph.schema() procedure.
query1="""
        CALL graph.schema() return *;
        """
result = graph.query(query1)
result.view()

In [None]:
# Serialise the most recent query result as CSV; no path is given, so
# presumably the CSV text is returned (pandas-style) -- confirm.
result.to_csv()