In [1]:
import os
# MODIN_ENGINE is set here, before the katana imports below are executed.
# NOTE(review): presumably this selects Modin's pure-Python engine and has to
# be in place before Modin is first imported -- confirm.
os.environ['MODIN_ENGINE']='python'

# NOTE(review): `analytics` and `get_rdg_dataset` are imported but not used in
# the cells visible here.
from katana.remote import Client, import_data, analytics
from katana.example_data import get_rdg_dataset

# Create the remote client with default arguments.
c = Client()
# Absolute path to the ldbc_003 RDG test dataset (storage format version 6).
# NOTE(review): hard-coded path; it only resolves on a machine with this exact
# directory layout. `get_rdg_dataset` (imported above) may be the intended way
# to locate it -- confirm.
test_graph = '/source/external/katana/external/test-datasets/rdg_datasets/ldbc_003/storage_format_version_6'
# Create a remote graph split into 2 partitions, then import the RDG into it.
# `remote_graph` is the handle every later cell runs its functions against.
remote_graph = c.create_graph(num_partitions=2)
import_data.rdg(remote_graph, test_graph)

VERBOSE: /source/external/katana/libgalois/include/katana/AsyncPool.h:152: 1440: async pool tids: 139798240261888, 139798238160640, 139798236059392, 139798233958144, 139798231856896, 139798229755648, 139798227654400, 139798225553152, 139798223451904, 139798221350656, 139798219249408, 139798217148160, 139798215046912, 139798212945664
VERBOSE: /source/external/katana/libgalois/include/katana/AsyncPool.h:152: 1440: async pool tids: 139798210844416, 139798206646016, 139798202447616, 139798198249216, 139798194050816, 139798189852416, 139798185654016, 139798181455616
VERBOSE: /source/external/katana/libgalois/include/katana/AsyncPool.h:152: 1440: async pool tids: 139798177257216, 139798168864512, 139798160471808, 139798152079104
VERBOSE: /source/external/katana/libgalois/include/katana/AsyncPool.h:152: 1440: async pool tids: 139798143686400, 139798126905088
VERBOSE: /source/external/katana/libgalois/include/katana/AsyncPool.h:152: 1440: async pool tids: 139798110123776, 139798108022528


          0/? [?op/s]

In [2]:
from typing import Sequence

from katana.distributed.graph_api2 import Graph, InEdge, OutEdge, Node, NodeView, OutEdgeView, InEdgeView
from katana.distributed.graph_api2 import KeyedColumnNode, KeyedColumnEdge
import numpy.random
import numpy as np

# Some examples of Distributed API2

The examples are executed using the remote API `graph.run` method (`run_api2` during the transition period).
The demo code then runs on a `dev` "cluster" using 2 partitions.
This code is using LDBC003.

May the demo Gods be kind to us.

**Feel free to ask questions.**

In [3]:
def graph_api2_city_count(graph: Graph):
    """Print the number of City nodes returned by ``graph.nodes`` on this host.

    ``graph.nodes`` is an entity view on nodes. It has many of the same
    methods as ``graph.in_edges`` / ``graph.out_edges``, which are the views on
    edges. Entity views also provide access to property and topology data.
    """
    # nodes() accepts restrictions (a label here); called with no arguments it
    # returns all nodes. Old API1: no equivalent available.
    cities: Sequence[Node] = graph.nodes(labels="City")
    n_cities = len(cities)
    print("City Node Count", n_cities)


remote_graph.run_api2(graph_api2_city_count)

          0/? [?op/s]


Host 0 output:
City Node Count 1343

Host 1 output:
City Node Count 45


In [4]:
def graph_api2_cities(graph: Graph):
    """Print the distinct labels of the source nodes of all in-edges of Cities."""
    # How do I find all the in-edges for my cities?
    # For each city, walk its in-edges, resolve every edge to its source node
    # and collect that node's labels; the set keeps one copy of each.
    # Old API1 spelled the source lookup graph.in_edge_src(in_edge).
    source_labels = {
        graph.nodes.labels(graph.in_edges.src(edge))
        for city_node in graph.nodes(labels="City")
        for edge in graph.in_edges(city_node)
    }

    print(source_labels)


remote_graph.run_api2(graph_api2_cities)

          0/? [?op/s]


Host 0 output:
{Organisation & University}

Host 1 output:
{Person}


In [5]:
def graph_api2_global(graph: Graph):
    """Print the global out-degree of the first City node seen on this host.

    Nothing is printed when the host has no City nodes.
    """
    city_sequence: Sequence[Node] = graph.nodes(labels="City")

    # Take only the first city from the iterator. Star-unpacking
    # (``city, *_ = city_sequence``) would copy every remaining city into a
    # throwaway list, and a separate truthiness check on the view is not needed.
    city = next(iter(city_sequence), None)
    if city is not None:
        # Get the global out degree of a city.
        # NOTE(review): the original note named graph.global_in_degree(city) as
        # the old API1 equivalent; presumably graph.global_out_degree(city) was
        # meant -- confirm.
        print("out_degree:", graph.nodes.global_out_degree(city))
        # Also available for in-edges.


remote_graph.run_api2(graph_api2_global)

          0/? [?op/s]


Host 0 output:
out_degree: 1

Host 1 output:
out_degree: 0


In [6]:
def graph_api2_countries(graph: Graph):
    """Print the set of ``name`` values of the Country nodes on this host."""
    # What countries are in the graph?
    countries: Sequence[Node] = graph.nodes(labels="Country")
    # Old API1: graph.get_node_property("name")
    names = graph.nodes.get_property("name")
    # Index the property with the whole node sequence (same as old API1) and
    # collapse the result into a set.
    country_names = set(names[countries])
    print("Countries:", country_names)


remote_graph.run_api2(graph_api2_countries)

          0/? [?op/s]


Host 0 output:
Countries: {'Estonia', 'Romania', 'Poland', 'Moldova', 'El_Salvador', 'Papua_New_Guinea', 'Greece', 'Denmark', 'Hong_Kong', 'Puerto_Rico', 'Spain', 'Honduras', 'Algeria', 'Colombia', 'Czech_Republic', 'Swaziland', 'Russia', 'Tajikistan', 'Australia', 'Italy', 'Morocco', 'Lithuania', 'Austria', 'Portugal', 'Argentina', 'India', 'United_Kingdom', 'Peru', 'South_Africa', 'New_Zealand', 'Wales', 'Malaysia', 'Tunisia', 'Afghanistan', 'Hungary', 'Vietnam', 'Senegal', 'Turkey', 'Kazakhstan', 'Cameroon', 'Slovakia', 'Jordan', 'Yemen', 'Pakistan', 'Finland', 'Israel', 'Indonesia', 'Laos', 'Taiwan', 'Mauritania', 'Nicaragua', 'Tanzania', 'Burma', 'Canada', 'Libya', 'Singapore', 'Belarus', 'Bosnia_and_Herzegovina', 'Egypt', 'Bolivia', 'Mauritius', 'Chad', 'Panama', 'Namibia', 'Japan', 'Mexico', 'Rwanda', 'France', 'South_Korea', 'Kenya', 'Germany', 'Madagascar', 'Angola', 'Thailand', 'Bulgaria', 'Belgium', 'Sri_Lanka', 'Switzerland', 'Scotland', 'Niger', 'Latvia', 'Republic_of_Mac

In [7]:
def graph_api2_cities_in_countries(graph: Graph):
    """Group City names by the Country they are part of and print a summary.

    Prints the number of countries that have at least one city on this host
    and, when present, the set of city names for "United_States".
    """
    name_prop = graph.nodes.get_property("name")  # Old API1: graph.get_node_property("name")

    # What cities are located in each country?
    city_sequence: Sequence[Node] = graph.nodes(labels="City")
    # Maps country name -> set of city names.
    country_cities = {}

    is_part_of_label = graph.out_edges.label_manager["IS_PART_OF"]

    for city in city_sequence:
        # Iterate the out-edges of city.
        # NOTE(review): the original carried a commented-out
        # `labels=is_part_of_label` hint on this call; filtering inside
        # out_edges() may be possible -- confirm before relying on it.
        for out_edge in graph.out_edges(node=city):
            # Only edges labelled IS_PART_OF lead from a city to its country.
            e_label = graph.out_edges.labels(out_edge)
            if e_label == is_part_of_label:
                country_node: Node = graph.out_edges.dst(edge=out_edge)
                # Add the city name to the set for the appropriate country.
                country_cities.setdefault(name_prop[country_node], set()).add(name_prop[city])

    print("Number of Countries:", len(country_cities))
    if "United_States" in country_cities:
        print("\nCities in United States:")
        print(country_cities["United_States"])


remote_graph.run_api2(graph_api2_cities_in_countries)

          0/? [?op/s]


Host 0 output:
Number of Countries: 111

Cities in United States:
{'San_Antonio', 'Ann_Arbor', 'Gainesville', 'Notre_Damena', 'Cleveland', 'New_York_City', 'Columbia', 'Indianapolis', 'Atlanta', 'Minneapolis', 'Baltimore', 'Nashville', 'San_Diego', 'Cambridge', 'Philadelphia', 'Washington', 'Phoenix', 'Tallahassee', 'Houston', 'New_York', 'Berkeley', 'Fayetteville', 'New_Orleans', 'St._Louis', 'Boston', 'Lubbock', 'Cincinnati', 'Columbus', 'Milwaukee', 'Austin', 'San_Francisco', 'Richmond', 'Chicago', 'Portland', 'Jacksonville', 'Orlando', 'New_Haven', 'Los_Angeles', 'Ithaca', 'Pittsburgh'}

Host 1 output:
Number of Countries: 0


In [8]:
def graph_api2_chrome(graph: Graph):
    """Print how many Message nodes on this host have browserUsed equal to Chrome."""
    browsers = graph.nodes.get_property("browserUsed")
    messages = graph.nodes(labels="Message")

    # Find all the message nodes that used Chrome.
    chrome_messages = {msg for msg in messages if browsers[msg] == "Chrome"}
    chrome_count = len(chrome_messages)
    print("Number of messages on chrome: ", chrome_count)


remote_graph.run_api2(graph_api2_chrome)

          0/? [?op/s]


Host 0 output:
Number of messages on chrome:  0

Host 1 output:
Number of messages on chrome:  1159


In [9]:
def graph_api2_is_chrome_property(graph: Graph):
    """Upsert a boolean ``is_chrome`` node property that is True for Chrome messages.

    Currently, adding or updating a node property must supply a value for
    EVERY node, including mirrors. Soon there will be a method to set the
    correct values for mirrors. This is not a problem if mirror nodes are only
    used to access the original data that was imported.
    """
    browsers = graph.nodes.get_property("browserUsed")
    messages = graph.nodes(labels="Message")
    chrome_messages = {msg for msg in messages if browsers[msg] == "Chrome"}

    node_keys = browsers.keys()  # Series of nodes
    # Build the new property data: one flag per entry of the existing
    # property, False unless the key is one of the Chrome messages.
    # This is messy; it will get better for GA with the addition of
    # UpdateableKeyedColumns.
    flags = np.zeros(len(browsers), dtype=bool)
    for position, node in enumerate(node_keys):
        if node in chrome_messages:
            flags[position] = True

    # Wrapping the data this way is bad. We will fix it too.
    is_chrome = KeyedColumnNode(flags, browsers, "is_chrome")
    # upsert_property creates or replaces the property.
    #   Old API1: graph.add_node_property(is_chrome=is_chrome_data)
    # add_property, by contrast, will only create the property if it doesn't
    # exist.
    graph.nodes.upsert_property(is_chrome)


remote_graph.run_api2(graph_api2_is_chrome_property)

# Are you sure the is_chrome property stayed on the graph?
remote_graph.query("MATCH (a:Message) WHERE a.is_chrome RETURN a.browserUsed, a.is_chrome;")

          0/? [?op/s]

          0/? [?op/s]

Unnamed: 0,a.browserUsed,a.is_chrome
0,Chrome,True
1,Chrome,True
2,Chrome,True
3,Chrome,True
4,Chrome,True
...,...,...
1154,Chrome,True
1155,Chrome,True
1156,Chrome,True
1157,Chrome,True


In [10]:
def graph_api2_both_property_fun(graph: Graph):
    """Show that the same property code works on both edge and node views.

    Upserts a random feature property on the out-edges and on the nodes,
    prints the length and contents of each, and finally prints the feature
    vector of the first node (skipped when this host has no nodes).
    """
    def init_feature_property(view, name):
        """Upsert a random property called ``name`` with one 4-vector per key of ``view``."""
        keys = view.get_property_keys()
        # NOTE(review): unseeded, so the values differ on every run and host.
        data = np.random.rand(len(keys), 4)  # The second dimension can be any length
        view.upsert_property(view.KeyedColumn.steal_array(data, keys, name))

    print("Use with edges:")
    init_feature_property(graph.out_edges, "edge_feature")
    edge_feature = graph.out_edges.get_property("edge_feature")
    print(len(edge_feature))
    print(edge_feature.to_numpy())

    print("\nUse with nodes:")
    init_feature_property(graph.nodes, "node_feature")
    node_feature = graph.nodes.get_property("node_feature")
    print(len(node_feature))
    print(node_feature.to_numpy())

    print("\nGet a single vector for a node:")
    # Take only the first node from the iterator. Star-unpacking
    # (``n, *_ = graph.nodes()``) would copy every node on the partition into
    # a throwaway list and raise ValueError when there are none.
    first_node = next(iter(graph.nodes()), None)
    if first_node is not None:
        print(node_feature[first_node])


remote_graph.run_api2(graph_api2_both_property_fun)

          0/? [?op/s]


Host 0 output:
Use with edges:
21537
[[0.46853607 0.94392769 0.07607089 0.06903818]
 [0.65052083 0.66083361 0.09412256 0.18538935]
 [0.49366764 0.38965967 0.14354563 0.26691357]
 ...
 [0.94491187 0.04155583 0.77853188 0.161085  ]
 [0.97553909 0.46749192 0.54291921 0.62727866]
 [0.17501276 0.98116426 0.26909804 0.31590733]]

Use with nodes:
21544
[[0.65571602 0.47619039 0.10935389 0.14019325]
 [0.19789881 0.06233738 0.22322068 0.16083035]
 [0.35886235 0.27626297 0.37642493 0.23769945]
 ...
 [0.58183161 0.87046574 0.21998683 0.23706556]
 [0.01409559 0.35764551 0.47429415 0.05167617]
 [0.85728161 0.41827269 0.1735386  0.61027267]]

Get a single vector for a node:
[0.65571602 0.47619039 0.10935389 0.14019325]

Host 1 output:
Use with edges:
21535
[[0.53971644 0.95857578 0.63250519 0.67233496]
 [0.62356725 0.02790739 0.37716918 0.69783615]
 [0.73537238 0.38061137 0.05049038 0.29092173]
 ...
 [0.30974486 0.96467745 0.93795646 0.60828998]
 [0.61380667 0.8322237  0.68639292 0.64238705]
 [0.61

You can also do other things that are not shown in this demo (or are shown only implicitly, so you may not have noticed):

* remove properties
* graph properties (as opposed to node or edge properties)
* nodes, edges, and labels are hashable (and can be used as dict keys or the values in sets)

and probably other things I have forgotten

# Addendum

Using distributed reducers.


In [11]:
from katana.distributed import ReduceMax, ReduceMin, ReduceSum, single_host

def graph_api2_chrome_reduce(graph: Graph):
    """Count the Country master nodes on each host and sum the counts across hosts.

    Prints the host-local count and returns the reduced sum wrapped in
    ``single_host``. A host with no Country nodes contributes 0.
    """
    # First we need a workaround for limitations of LabelSet (there is no way
    # to get sets of size > 1): borrow the label value from a sample Country
    # node. Only the first node is pulled from the iterator; star-unpacking
    # would copy the whole sequence and raise ValueError on a host that has no
    # Country nodes, before that host ever reached the reduction below.
    sample_country = next(iter(graph.nodes(labels="Country")), None)

    if sample_country is None:
        n_countries = 0
    else:
        country_label = graph.nodes.labels(sample_country)
        # Now we can count the number of masters with the label Country.
        # We cannot do graph.nodes.masters(labels="Country") since that is not
        # supported yet (and may not actually be implemented in C++ yet).
        n_countries = sum(1 for n in graph.nodes.masters() if graph.nodes.labels(n) == country_label)

    # Print out our host-local result.
    print("Owned country nodes:", n_countries)

    # Now, reduce and return the values.
    # NOTE(review): presumably reduce() is a collective call that every host
    # has to make, which is why the empty case above still falls through to
    # here -- confirm.
    reducer = ReduceSum[int]()
    reducer.update(n_countries)
    return single_host(reducer.reduce())


print("\nReduced result:", remote_graph.run_api2(graph_api2_chrome_reduce))

          0/? [?op/s]


Host 0 output:
Owned country nodes: 111

Host 1 output:
Owned country nodes: 0

Reduced result: 111


I could not find an example that had values > 0 on both hosts. The reducers do not support vector values directly. However, it is possible to create a list of reducers, one for each element of the vector (so on the order of 32).

The available reducers are sum, max, and min. They work on most of the types we use: 8/16/32/64-bit ints, 32/64-bit floats, and (I think) booleans.