In [1]:
######## setup variables ##########
#gs_prefix="gs://anand-31-simple-files/aml-customer-extended"
gs_prefix = "gs://anand-31-simple-files/partition-test/"
partitions = 3
platform_prefix=gs_prefix

In [2]:
import os                                                                        
import time                                                                      
import json
import pandas as pd
from timeit import default_timer as timer
from katana.remote import aio, analytics, import_data, trace

from katana import remote
from katana.remote import import_data
from katana.remote import PartitionPolicy

os.environ["KATANA_SERVER_ADDRESS"] = "host.docker.internal:8080"

pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)

In [3]:
rc=remote.Client()
rc.server_version

'0.3.0.dev+7767.7767.08ac50.08ac50'

In [4]:

graph = rc.create_graph(
    num_partitions=partitions,
    name="partition_test"
)

In [5]:
graph.graph_id

'sfYV5QhTeo4eVR7acaXgZ9rJoxm4UXKWTV8ji9sCEb9'

In [6]:
test_node_file = platform_prefix + "nodes.txt"
test_edge_file = platform_prefix + "edges.txt"
input_path=platform_prefix 


In [7]:
start = timer()
print("Importing graph from CSV files...")
import_data.csv(
    graph,
    input_node_path=test_node_file,
    input_edge_path=test_edge_file,
    input_dir=input_path,
    partition_policy=PartitionPolicy.BLOCKED_OEC,
#    have_headers=True,
    data_delimiter=",",
    schema_delimiter=","
)
end = timer()
print(f"  import: {end - start:.1f} seconds") 

Importing graph from CSV files...


          0/? [?op/s]

  import: 5.7 seconds


In [102]:
%%time
graph.num_nodes()

          0/? [?op/s]

CPU times: user 96.1 ms, sys: 22.2 ms, total: 118 ms
Wall time: 4.54 s


1513

In [91]:
graph.num_edges()

          0/? [?op/s]

0

In [152]:
query = """
    MATCH(c:CUSTOMER)-[]->(x)
    where c.ID = 'c_299'
    return c, x
"""
result = graph.query(query, contextualize=True)
result.view()

          0/? [?op/s]

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [46]:
query = """
    MATCH(t:PROFILE_SERVER_TITLE)
    return t
"""
result = graph.query(query)
result.view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [11]:
query = """
    MATCH(c:CUSTOMER)-[:KNOWS]->(q:CUSTOMER)
    where c.ID = 'c_25'
    return c.ID, q.ID
"""
result = graph.query(query)
result.view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .result_set_tab_w…

<b> Import ends here: DATA LOADING COMPLETE </b>

In [8]:
def visit(graph, nodeLabel, listOfAttributes):
    output=[]
    id_property = graph.get_node_property('ID')
    with open('/tmp/' + 'node_blocked.txt', 'w') as f:
        for node in graph.master_nodes():
            output.append(str(id_property[node]))
            f.write(str(id_property[node])+ '#\n')
        for node in graph.mirror_nodes():
            f.write(str(id_property[node])+ '#m#\n')
    return output
# graph.run(lambda g: dist(g))
out = graph.run_result(lambda g: visit(g, 'PERSON', ['name']))

          0/? [?op/s]

In [12]:
out.value[0:4]

['c_0', 'c_1', 'c_2', 'c_3']

In [110]:
def check_address_length(graph, nodeLabel, listOfAttributes):
    output=[]
    id_property = graph.get_node_property('ID')
    address_property = graph.get_node_property('ADDRESS')
    with open('/tmp/' + 'address_length_error.txt', 'w') as f:
        for node in graph.master_nodes():
            if len(str(address_property[node])) < 5 and str(id_property[node]).startswith('a_') :
                f.write(str(id_property[node])+ ' has very short address:' + str(address_property[node]) + '\n' )
    return output
# graph.run(lambda g: dist(g))
out = graph.run_result(lambda g: check_address_length(g, 'PERSON', ['name']))

          0/? [?op/s]

In [144]:
def check_number_of_addresses(graph, nodeLabel, listOfAttributes):
    from katana.local_native import Node, OutEdgeHandle
    from katana_enterprise.distributed import single_host
    id_property = graph.get_node_property('ID')
    with open('/tmp/' + 'number_of_address_error.txt', 'w') as f:
        for node in graph.master_nodes():
            print('do nothing')
            edges = graph.out_edge_ids(Node(node))
            address_count = 0
            for edge_id in edges:
                destination_node = graph.out_edge_dst(edge_id).value
                if str(id_property[destination_node]).startswith('a_'):
                    address_count = address_count + 1
            if address_count > 2:
                f.write( str(id_property[node]) + ' has too many addresses' + '\n' )
                # f.write(str(id_property[node]) + ' has too many addresses:'+ str(len(edges))+'\n')
    return single_host('')
# graph.run(lambda g: dist(g))
out = graph.run_result(lambda g: check_number_of_addresses(g, 'PERSON', ['name']))

          0/? [?op/s]

In [146]:
out.value

''

In [147]:
def write_number_of_addresses_feature(graph, nodeLabel, listOfAttributes):
    from katana.local_native import Node, OutEdgeHandle
    from katana_enterprise.distributed import single_host
    id_property = graph.get_node_property('ID')
    with open('/tmp/' + 'customer_feature_number_of_address.txt', 'w') as f:
        for node in graph.master_nodes():
            print('do nothing')
            edges = graph.out_edge_ids(Node(node))
            address_count = 0
            for edge_id in edges:
                destination_node = graph.out_edge_dst(edge_id).value
                if str(id_property[destination_node]).startswith('a_'):
                    address_count = address_count + 1
            if str(id_property[node]).startswith('c_'):
                f.write( str(id_property[node]) + ',' + str(address_count) +'\n' )
                # f.write(str(id_property[node]) + ' has too many addresses:'+ str(len(edges))+'\n')
    return single_host('ok')
# graph.run(lambda g: dist(g))
out = graph.run_result(lambda g: write_number_of_addresses_feature(g, 'PERSON', ['name']))

          0/? [?op/s]

In [148]:
out.value

'ok'

In [149]:
def write_valid_single_address_for_customer(graph, nodeLabel, listOfAttributes):
    from katana.local_native import Node, OutEdgeHandle
    from katana_enterprise.distributed import single_host
    id_property = graph.get_node_property('ID')
    address_property = graph.get_node_property('ADDRESS')
    with open('/tmp/' + 'write_valid_single_address_for_customer.txt', 'w') as f:
        for node in graph.master_nodes():
            print('do nothing')
            edges = graph.out_edge_ids(Node(node))
            address_count = 0
            valid_address = ''
            for edge_id in edges:
                destination_node = graph.out_edge_dst(edge_id).value
                if str(id_property[destination_node]).startswith('a_'):
                    if len(str(address_property[destination_node])) > 5:
                        valid_address = str(address_property[destination_node])
            if str(id_property[node]).startswith('c_'):
                f.write( str(id_property[node]) + ',' + valid_address +'\n' )
                # f.write(str(id_property[node]) + ' has too many addresses:'+ str(len(edges))+'\n')
    return single_host('ok')
# graph.run(lambda g: dist(g))
out = graph.run_result(lambda g: write_valid_single_address_for_customer(g, 'PERSON', ['name']))

          0/? [?op/s]

In [150]:
out.value

'ok'

In [48]:
graph.schema().view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [80]:
def count_parallel(graph, nodeLabel, listOfAttributes):
    import threading
    output=[]
    id_property = graph.get_node_property('ID')
    chunked_data = []
    node_chunk = []
    for node in graph.master_nodes():
        node_chunk.append(node)
        if len(node_chunk) > 200 :
            chunked_data.append(node_chunk)
            node_chunk = []

    
    chunked_data.append(node_chunk)
    

    
    def do_some_work(chunk):
        with open('/tmp/' + 'logs.txt', 'a') as f:
            f.write(str(len(chunk)) + '\n')
    
    t1 = threading.Thread(target=do_some_work, args=(chunked_data[0],))
    t2 = threading.Thread(target=do_some_work, args=(chunked_data[3],))
    
    # starting thread 1
    t1.start()
    # starting thread 2
    t2.start()
  
    # wait until thread 1 is completely executed
    t1.join()
    # wait until thread 2 is completely executed
    t2.join()
    
    
    
    return output
# graph.run(lambda g: dist(g))
out = graph.run_result(lambda g: count_parallel(g, 'PERSON', ['name']))

          0/? [?op/s]

In [78]:
print(out.value)

ValueError: Hosts returned more than one value. Please have only one host return a value while other's return a None.
The returned results (index is the host # in list):
[[], []]

In [9]:
from katana.remote.analytics  import louvain_clustering
louvain_clustering(graph, 'lv')

          0/? [?op/s]

In [10]:
qry = """
Match (x)
where exists(x.lv)
return x limit 5
"""
result = graph.query(qry)
result.view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [23]:
qry = """
Match (x)
where tointeger(x.lv) = 5
return x 
"""
result = graph.query(qry, contextualize = True)
result.view()

          0/? [?op/s]

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [13]:
qry = """
Match (c:CUSTOMER)-[]-(a:ADDRESS)
where c.ID = 'c_1'
return c,a
"""
result = graph.query(qry, contextualize = True)
result.view()

          0/? [?op/s]

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [14]:
from katana.remote.analytics import cdlp
cdlp(graph, 'gl')

          0/? [?op/s]

In [20]:
qry = """
Match (x)
where exists(x.cc)
return x limit 5
"""
result = graph.query(qry)
result.view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [18]:
from katana.remote.analytics import connected_components
connected_components(graph, 'cc')

          0/? [?op/s]