In [1]:
# ---------------------------------------------------------------------------
# Setup variables -- everything a reader may need to tune lives in this cell.
# ---------------------------------------------------------------------------
# Number of partitions requested when the graph is created.
partitions = 1
# Cluster name; also used as the bucket component of the RDG output path.
cluster_name = "katana-internal31"
# GCS prefix under which the input files (nodes.txt / edges.txt) are looked up.
gs_prefix = "gs://anand-31-simple-files/aml-customer-extended"
# All input paths below are derived from this prefix.
platform_prefix = gs_prefix

In [2]:
import os                                                                        
import time                                                                      
import json
import pandas as pd
from timeit import default_timer as timer
from katana.remote import aio, analytics, import_data, trace

from katana import remote
from katana.remote import import_data

import gcsfs, pytest

# Address of the Katana server used by this notebook.
# NOTE(review): presumably read by remote.Client() -- keep this assignment
# ahead of the cell that creates the client.
os.environ["KATANA_SERVER_ADDRESS"] = "host.docker.internal:8080"

# Show every column and never truncate cell contents, so the node/edge dicts
# returned by queries are fully visible. The fully qualified option names are
# used on purpose: abbreviated keys such as "max_colwidth" only work through
# pandas' pattern matching and break if another option ever matches the same
# pattern.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Open a client connection to the Katana server and display the server
# version as a quick connectivity check.
rc = remote.Client()
rc.server_version

'0.1.102.dev+96.327.e672f3.0227ed.unmerged'

<h1> Part 1: Validate Pre-requisites and Import Data </h1>

In [4]:

# Create the graph (named "aml-1") that the CSV data is imported into below.
graph = rc.create_graph(name="aml-1", num_partitions=partitions)

In [5]:
# Display the id of the newly created graph; it is used below to build rdg_path.
graph.graph_id

'AoYtkBuEMN8ZdYTxgVtj9PAc1NpgPET9LfMcrFrstrbg'

In [6]:
# Input locations, all derived from the configured prefix.
input_path = platform_prefix
test_node_file = f"{platform_prefix}/nodes.txt"
test_edge_file = f"{platform_prefix}/edges.txt"
# Location checked after the import to confirm that output for this graph exists.
rdg_path = f"gs://{cluster_name}/graph/{graph.graph_id}"

In [7]:
# GCS filesystem handle used only for the existence checks in this notebook.
fs = gcsfs.GCSFileSystem(
    project="katana-internal31",
    token="anon",
)

In [8]:
# Fail fast if the inputs or the graph handle are missing, before starting the
# (slower) import.
assert fs.exists(test_node_file), "Node file does not exist"
assert fs.exists(test_edge_file), "Edge file does not exist"
assert len(graph.graph_id)>0, "Graph object is not available"

<b>Pre-requisites for the test are now complete</b>

In [9]:
# Time the CSV import end to end and report the wall-clock duration.
start = timer()
print("Importing graph from CSV files...")
# `have_headers` and `partition_policy` are not passed, so the library
# defaults apply for both.
import_data.csv(
    graph,
    input_dir=input_path,
    input_node_path=test_node_file,
    input_edge_path=test_edge_file,
    schema_delimiter=",",
    data_delimiter=",",
)
end = timer()
print(f"  import: {end - start:.1f} seconds")

Importing graph from CSV files...


          0/? [?op/s]

  import: 17.4 seconds


In [10]:
# After the import, the graph's output location must exist in the bucket.
assert fs.exists(rdg_path), "Output path for RDG does not exists"

In [11]:
# Sanity check: the import must have produced at least one node and one edge.
assert graph.num_nodes() > 0, "No Nodes exists in the graph"
assert graph.num_edges() > 0, "No Edges exists in the graph"

          0/? [?op/s]

          0/? [?op/s]

<b> 1 : CSV Import Functionality works </b>

<h1> Part 2: Review the Entities in the AML Domain </h1>

In [12]:
%%time
# Look up ACCOUNT nodes whose ACCOUNT_ID property is '926' and return them.
query1="""MATCH (r:ACCOUNT{ACCOUNT_ID:'926'}) 
       return r"""
result = graph.query(query1)
# Keep a DataFrame copy of the result for the assertion in the following cell.
df = result.to_pandas()

          0/? [?op/s]

CPU times: user 119 ms, sys: 36.9 ms, total: 156 ms
Wall time: 9.14 s


In [13]:
# Render the query result as an interactive table widget.
result.table()

GridBox(children=(HBox(children=(Text(value='', placeholder='Search....'), HBox(children=(Button(description='…

In [14]:
# Count rows, not cells: DataFrame.size is rows x columns, so it only equals
# the number of matches while the query happens to return a single column.
assert df.shape[0] == 1, "Exactly one Account node is not found"

<b> 2 : Lookup of an ACCOUNT by label works </b>

In [15]:
%%time
# Transactions sent by ACCOUNT '926' (TRANSACTION_SENDER) that are received by
# an ACCOUNT (TRANSACTION_RECEIVER); only the TRANSACTION node is returned.
query1="""MATCH (s:ACCOUNT{ACCOUNT_ID:'926'})-[ts:TRANSACTION_SENDER]->(t:TRANSACTION)-[tr:TRANSACTION_RECEIVER]->(r:ACCOUNT)
       return t"""
result = graph.query(query1)
df = result.to_pandas()
# Display the number of cells in the result (rows x columns).
df.size

          0/? [?op/s]

CPU times: user 95.5 ms, sys: 25.1 ms, total: 121 ms
Wall time: 8.76 s


20

In [16]:
# The check is "at least one transaction exists", so compare the row count
# against zero. The previous `df.size > 1` counted cells and would have
# failed on a single-row result even though a transaction was found.
assert df.shape[0] > 0, "A transaction linked to an account with id 926 is not found"

<b> 3 : Lookup of a TRANSACTION related to ACCOUNT (926) is successful </b>

In [17]:
%%time
# Customers linked to ACCOUNT '926' through a HAS_CUSTOMER edge.
query1="""MATCH (s:ACCOUNT{ACCOUNT_ID:'926'})-[ac:HAS_CUSTOMER]->(c:CUSTOMER)
       return c"""
result = graph.query(query1)
df = result.to_pandas()
# Display the returned customer rows.
df

          0/? [?op/s]

CPU times: user 86 ms, sys: 44.7 ms, total: 131 ms
Wall time: 8.84 s


Unnamed: 0,c
0,"{'id': 1926, 'labels': ['CUSTOMER'], 'properties': {'CUSTOMER_ID': 'C_926', 'DOB': '2005-10-12', 'FIRSTNAME': 'Tyrone', 'LASTNAME': 'Nelson', 'PEP': 'false'}, 'type': 'node'}"


In [18]:
# Count rows, not cells: DataFrame.size is rows x columns and would stop
# matching the number of customers as soon as the query returns more columns.
assert df.shape[0] == 1, "One and only one CUSTOMER is linked to an ACCOUNT"

<b> 4 : There is one and only one customer node linked to ACCOUNT (926) </b>

<h1> Part 3: AML Queries </h1>

In [19]:
%%time
# "Under the Radar" query: for every sender/receiver ACCOUNT pair, collect the
# transactions dated strictly between 2018-01-01 and 2022-01-01 with
# TX_AMOUNT below 10000, then keep only the pairs with more than 5 such
# transactions whose average amount exceeds 9000. Results are ordered by
# transaction count, highest first.
query1="""MATCH (a:ACCOUNT)-[:TRANSACTION_SENDER]->(t:TRANSACTION)-[:TRANSACTION_RECEIVER]->(b:ACCOUNT) 
            where t.dot > date('2018-01-01') and t.dot < date('2022-01-01')
              and t.TX_AMOUNT < 10000
        WITH  collect(t.TX_ID) as txs, avg(t.TX_AMOUNT) as average, count(t) as count, a.ACCOUNT_ID as sender,b.ACCOUNT_ID as receiver 
        where average > 9000 
        and count > 5  
        return  txs, average, count, sender, receiver order by count desc"""

result = graph.query(query1)

          0/? [?op/s]

CPU times: user 85.8 ms, sys: 24.4 ms, total: 110 ms
Wall time: 7.29 s


In [20]:
# Materialise the "Under the Radar" result and display how many rows it has.
df = result.to_pandas()
df.shape[0]

2

In [21]:
# The expected row count (2) is specific to the imported sample dataset.
assert df.shape[0] == 2, "'Under the Radar' Query should yield 2 records"

<b> 5 : 'Under the Radar' query yielded 2 records </b>

In [23]:
%%time
# Cycle query: follow a chain of five transactions a -> b -> c -> d -> e -> f
# in which each hop has the same TX_AMOUNT as the previous one, consecutive
# accounts differ, and f has the same ACCOUNT_ID as a (closing the cycle).
# Only chains in which account '753' takes part are returned.
query1="""MATCH (a)-[:TRANSACTION_SENDER]->(t1:TRANSACTION)-[:TRANSACTION_RECEIVER]->(b)-[:TRANSACTION_SENDER]->(t2:TRANSACTION)-[:TRANSACTION_RECEIVER]->(c)
        where t1.TX_AMOUNT = t2.TX_AMOUNT
          and NOT a.ACCOUNT_ID = b.ACCOUNT_ID
          and NOT b.ACCOUNT_ID = c.ACCOUNT_ID
        WITH a, b, c, t1,t2
        MATCH  (c)-[:TRANSACTION_SENDER]->(t3:TRANSACTION)-[:TRANSACTION_RECEIVER]->(d)
        where t3.TX_AMOUNT = t2.TX_AMOUNT
          and NOT d.ACCOUNT_ID = c.ACCOUNT_ID
        WITH a, b, c, d, t1,t2, t3
        MATCH  (d)-[:TRANSACTION_SENDER]->(t4:TRANSACTION)-[:TRANSACTION_RECEIVER]->(e)
        where t4.TX_AMOUNT = t3.TX_AMOUNT
          and NOT e.ACCOUNT_ID = d.ACCOUNT_ID
        WITH a, b, c, d, e, t1,t2, t3, t4
        MATCH  (e)-[:TRANSACTION_SENDER]->(t5:TRANSACTION)-[:TRANSACTION_RECEIVER]->(f)
        where t5.TX_AMOUNT = t4.TX_AMOUNT
          and NOT f.ACCOUNT_ID = e.ACCOUNT_ID
          and f.ACCOUNT_ID = a.ACCOUNT_ID
          and ( a.ACCOUNT_ID = '753' or b.ACCOUNT_ID = '753' or c.ACCOUNT_ID = '753' or d.ACCOUNT_ID = '753' or e.ACCOUNT_ID = '753' or f.ACCOUNT_ID = '753' )      
        return a, b, c, d, e, f, t1,t2, t3, t4, t5 
        """

# NOTE(review): contextualize=True presumably adds the connecting edges needed
# for visualisation -- confirm against the Katana client documentation.
result = graph.query(query1, contextualize = True)
# Keep the cycle rows in their own DataFrame, separate from `df`.
cycles=result.to_pandas()
# Render the matched sub-graph as an interactive widget.
result.visualize()

          0/? [?op/s]

          0/? [?op/s]

CPU times: user 255 ms, sys: 98.8 ms, total: 354 ms
Wall time: 26.9 s


GraphVizWidget(edges=[{'from': 119615, 'to': 98, 'label': 'TRANSACTION_RECEIVER', 'type': 'TRANSACTION_RECEIVE…

In [27]:
# Display the number of rows returned by the cycle query.
cycles.shape[0]

10

In [28]:
# Validate the cycle query's own result (`cycles`). The previous check looked
# at `df`, which still held the "Under the Radar" rows, so it passed whether
# or not any cycle was found.
assert cycles.shape[0] > 0, "Expected at least one cycle involving account 753"

<b>5 : Cycle query returned at least one cycle</b>

<h1> Part 4: Adding an Alert and Marking a Customer as Suspicious </h1>

In [29]:
%%time
# Attach a new ALERT node (ALERT_ID '10001') to the TRANSACTION with TX_ID
# '210049' through a HAS_ALERT edge that points from the alert to the
# transaction.
# NOTE(review): CREATE is not idempotent -- running this cell again against
# the same graph adds another ALERT node with the same ALERT_ID.
query1="""MATCH (t:TRANSACTION)  
        WHERE t.TX_ID = '210049'
        CREATE (t)<-[:HAS_ALERT]-(alert:ALERT{ALERT_ID:'10001', DESCRIPTION: 'High Value with cycle links'})
        return alert"""
result = graph.query(query1)

          0/? [?op/s]

CPU times: user 201 ms, sys: 63.8 ms, total: 265 ms
Wall time: 9.45 s


Unnamed: 0,alert
0,"{'id': 157927, 'labels': ['ALERT'], 'properties': {'ALERT_ID': '10001', 'DESCRIPTION': 'High Value with cycle links'}, 'type': 'node'}"


In [45]:
# Pull the created alert out of the first result row and display its ALERT_ID.
df = result.to_pandas()
d = df["alert"][0]
d["properties"]["ALERT_ID"]

'10001'

In [46]:
# The returned node must carry the ALERT_ID that the CREATE statement set.
assert d['properties']['ALERT_ID'] == '10001', "Alert with id 10001 has not been created"

<b>6 : An Alert was successfully inserted into the database</b>

In [47]:
%%time
# Set SUSPICIOUS = 'true' (stored as a string) on every CUSTOMER linked to
# ACCOUNT '256' through HAS_CUSTOMER, and return the updated customer nodes.
query1="""MATCH (a:ACCOUNT)-[:HAS_CUSTOMER]->(c:CUSTOMER)  
        WHERE a.ACCOUNT_ID = '256'
        SET c.SUSPICIOUS = 'true'
        return c"""
result = graph.query(query1)
# Keep the returned customer rows for the checks in the following cells.
c=result.to_pandas()

          0/? [?op/s]

CPU times: user 199 ms, sys: 66 ms, total: 265 ms
Wall time: 13 s


In [58]:
# Display the SUSPICIOUS property of the first returned customer.
c['c'][0]['properties']['SUSPICIOUS']

'true'

In [59]:
# The flag is compared against the string 'true', matching what the SET clause wrote.
assert c['c'][0]['properties']['SUSPICIOUS'] == 'true', "Suspicious flag for Customer 256 should be set to true"

<b>7 : The customer linked to ACCOUNT '256' was flagged as suspicious</b>