#  Setup: Display options

In [37]:

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate

print("--")


--


In [38]:

#  Used to report timings
#

from timeit import default_timer

print("--")



--


# Setup: Connectivity

In [39]:

from katana import remote
from katana.remote import import_data

my_client = remote.Client()

print(my_client)


<katana.remote.sync_wrappers.Client object at 0x7fafe2a8a0d0>


In [40]:

#  NUM_PARTITIONS  = 3
NUM_PARTITIONS  = 5                  #  Justin has 5
   #
DB_NAME         = "my_db"
GRAPH_NAME      = "my_graph"

print("--")


--


In [41]:

#  CONNECT TO GRAPH

my_graph, *_ = my_client.get_database(name=DB_NAME).find_graphs_by_name(GRAPH_NAME)

print(my_graph)


<_Graph my_graph, 9DhtNcMG4ML2mPjnbNECWbPdgQsa7144z9KsRBr7DuKj, 1>


In [None]:

#  Now that we share servers, get a better list of databases and graphs ..
#

for l_each_d in my_client.databases():
   print("Database Name: %-28s   Id: %s" % (l_each_d.name, l_each_d.database_id))
      #
   for l_each_g in my_client.get_database(name=l_each_d.name).graphs_in_database(): 
      print("   Graph Name: %-28s      Version: %-8d   Id: %s" % (l_each_g.name, l_each_g.version, l_each_g.graph_id))


print(""  )
print("--")


#  Step 00: Diagnostics

In [42]:

#  List all node types with node counts

l_query  = """

   MATCH (n) 
   WITH DISTINCT LABELS(n) AS labels, COUNT(n) AS cnt
   UNWIND labels AS label
   RETURN label, SUM(cnt) AS cnt
   ORDER BY label
   
   """.format()

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))


print()


#  List all edge types with counts

l_query  = """

   MATCH (m)-[r]->(n) 
   WITH DISTINCT TYPE(r) AS type, COUNT(r) AS cnt
   RETURN type, cnt
   ORDER BY type
   
   """.format()

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))


#  Sample output,
#
#     +--------------+-----------+
#     | label        |       cnt |
#     |--------------+-----------|
#     | City         |      1343 |
#     | Comment      | 127297840 |
#     | Company      |      1575 |
#     | Continent    |         6 |
#     | Country      |       111 |
#     | Forum        |   4103391 |
#     | Message      | 184685839 |
#     | Organisation |      7955 |
#     | Person       |    449165 |
#     | Place        |      1460 |
#     | Post         |  57387999 |
#     | Tag          |     16080 |
#     | TagClass     |        71 |
#     | University   |      6380 |
#     +--------------+-----------+
#     
#     +----------------+-----------+
#     | type           |       cnt |
#     |----------------+-----------|
#     | CONTAINER_OF   |  57387999 |
#     | HAS_CREATOR    | 184685839 |
#     | HAS_INTEREST   |  10500064 |
#     | HAS_MEMBER     | 251011681 |
#     | HAS_MODERATOR  |   4103391 |
#     | HAS_TAG        | 255309416 |
#     | HAS_TYPE       |     16080 |
#     | IS_LOCATED_IN  | 185142959 |
#     | IS_PART_OF     |      1454 |
#     | IS_SUBCLASS_OF |        70 |
#     | KNOWS          |  13678716 |
#     | LIKES          | 206684874 |
#     | REPLY_OF       | 127297840 |
#     | STUDY_AT       |    359562 |
#     | WORK_AT        |    977502 |
#     +----------------+-----------+



          0/? [?op/s]

+--------------+-----------+
| label        |       cnt |
|--------------+-----------|
| City         |      1343 |
| Comment      | 127297840 |
| Company      |      1575 |
| Continent    |         6 |
| Country      |       111 |
| Forum        |   4103391 |
| Message      | 184685839 |
| Organisation |      7955 |
| Person       |    449165 |
| Place        |      1460 |
| Post         |  57387999 |
| Tag          |     16080 |
| TagClass     |        71 |
| University   |      6380 |
+--------------+-----------+



          0/? [?op/s]

+----------------+-----------+
| type           |       cnt |
|----------------+-----------|
| CONTAINER_OF   |  57387999 |
| HAS_CREATOR    | 184685839 |
| HAS_INTEREST   |  10500064 |
| HAS_MEMBER     | 251011681 |
| HAS_MODERATOR  |   4103391 |
| HAS_TAG        | 255309416 |
| HAS_TYPE       |     16080 |
| IS_LOCATED_IN  | 185142959 |
| IS_PART_OF     |      1454 |
| IS_SUBCLASS_OF |        70 |
| KNOWS          |  13678716 |
| LIKES          | 206684874 |
| REPLY_OF       | 127297840 |
| STUDY_AT       |    359562 |
| WORK_AT        |    977502 |
+----------------+-----------+


In [43]:

#  Return properties/keys from all vertices

l_query  = """
   MATCH (n)
   RETURN DISTINCT LABELS(n) AS label, KEYS(n) AS my_keys, COUNT(n) AS cnt
   """.format()

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))


#  Return properties/keys from all edges
#  

l_query  = """
   MATCH (n) - [r] -> (m)
   RETURN DISTINCT TYPE(r) AS my_type, KEYS(r) AS my_keys, COUNT(r) AS cnt
   """.format()

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))


#  Sample output,
#
#     +--------------------------------+-----------------------------------------------------------------------------------------------------------------------+-----------+
#     | label                          | my_keys                                                                                                               |       cnt |
#     |--------------------------------+-----------------------------------------------------------------------------------------------------------------------+-----------|
#     | ['Person']                     | ['birthday', 'browserUsed', 'creationDate', 'email', 'firstName', 'gender', 'id', 'lastName', 'locationIP', 'speaks'] |    449165 |
#     | ['Place', 'Continent']         | ['id', 'name', 'url']                                                                                                 |         6 |
#     | ['TagClass']                   | ['id', 'name', 'url']                                                                                                 |        71 |
#     | ['Tag']                        | ['id', 'name', 'url']                                                                                                 |     16080 |
#     | ['Forum']                      | ['creationDate', 'id', 'title']                                                                                       |   4103391 |
#     | ['Comment', 'Message']         | ['browserUsed', 'content', 'creationDate', 'id', 'length', 'locationIP']                                              | 127297840 |
#     | ['City', 'Place']              | ['id', 'name', 'url']                                                                                                 |      1343 |
#     | ['Organisation', 'University'] | ['id', 'name', 'url']                                                                                                 |      6380 |
#     | ['Company', 'Organisation']    | ['id', 'name', 'url']                                                                                                 |      1575 |
#     | ['Message', 'Post']            | ['browserUsed', 'content', 'creationDate', 'id', 'imageFile', 'language', 'length', 'locationIP']                     |  57387999 |
#     | ['Place', 'Country']           | ['id', 'name', 'url']                                                                                                 |       111 |
#     +--------------------------------+-----------------------------------------------------------------------------------------------------------------------+-----------+
#     
#     +----------------+------------------+-----------+
#     | my_type        | my_keys          |       cnt |
#     |----------------+------------------+-----------|
#     | IS_SUBCLASS_OF | []               |        70 |
#     | STUDY_AT       | ['classYear']    |    359562 |
#     | IS_PART_OF     | []               |      1454 |
#     | HAS_TAG        | []               | 255309416 |
#     | LIKES          | ['creationDate'] | 206684874 |
#     | HAS_INTEREST   | []               |  10500064 |
#     | HAS_CREATOR    | []               | 184685839 |
#     | HAS_TYPE       | []               |     16080 |
#     | HAS_MODERATOR  | []               |   4103391 |
#     | KNOWS          | ['creationDate'] |  13678716 |
#     | IS_LOCATED_IN  | []               | 185142959 |
#     | CONTAINER_OF   | []               |  57387999 |
#     | HAS_MEMBER     | ['creationDate'] | 251011681 |
#     | REPLY_OF       | []               | 127297840 |
#     | WORK_AT        | ['workFrom']     |    977502 |
#     +----------------+------------------+-----------+



          0/? [?op/s]

+--------------------------------+-----------------------------------------------------------------------------------------------------------------------+-----------+
| label                          | my_keys                                                                                                               |       cnt |
|--------------------------------+-----------------------------------------------------------------------------------------------------------------------+-----------|
| ['Person']                     | ['birthday', 'browserUsed', 'creationDate', 'email', 'firstName', 'gender', 'id', 'lastName', 'locationIP', 'speaks'] |    449165 |
| ['Place', 'Continent']         | ['id', 'name', 'url']                                                                                                 |         6 |
| ['TagClass']                   | ['id', 'name', 'url']                                                                                                 |        71 

          0/? [?op/s]

+----------------+------------------+-----------+
| my_type        | my_keys          |       cnt |
|----------------+------------------+-----------|
| IS_SUBCLASS_OF | []               |        70 |
| STUDY_AT       | ['classYear']    |    359562 |
| IS_PART_OF     | []               |      1454 |
| HAS_TAG        | []               | 255309416 |
| LIKES          | ['creationDate'] | 206684874 |
| HAS_INTEREST   | []               |  10500064 |
| HAS_CREATOR    | []               | 184685839 |
| HAS_TYPE       | []               |     16080 |
| HAS_MODERATOR  | []               |   4103391 |
| KNOWS          | ['creationDate'] |  13678716 |
| IS_LOCATED_IN  | []               | 185142959 |
| CONTAINER_OF   | []               |  57387999 |
| HAS_MEMBER     | ['creationDate'] | 251011681 |
| REPLY_OF       | []               | 127297840 |
| WORK_AT        | ['workFrom']     |    977502 |
+----------------+------------------+-----------+


#  Step 01: Actual Routines

In [44]:


from katana.remote.analytics import pagerank
from katana.remote.analytics import louvain_clustering
from katana.remote.analytics import betweenness_centrality


#  Generate a unique prop name so we can run this over and over without edit ..
#
l_cntr = 0

print("--")


--


In [45]:

#  Page Rank obviously ..
#

#  Generate a unique prop name so we can run this over and over without edit ..
#
l_cntr += 1
   #
l_pr_propname = "pr_" + str(l_cntr).zfill(4)
l_time            = default_timer()


pagerank(my_graph, result_property_name = l_pr_propname, max_iterations = 2, alpha = 0.15)

print("Time in seconds: %d" % (default_timer() - l_time))
   #
print("--")


#  Sample output,
#
#  SF: 003
#     Time in seconds: 6
#
#  SF: 100
#     Time in seconds: 88
#


          0/? [?op/s]

Time in seconds: 88
--


In [46]:

#  Return Page Rank value ..
#

l_query  = """
   MATCH (n)
   RETURN n, n.page_rank
   ORDER BY n.{0} DESC
   LIMIT 10

   """.format(l_pr_propname)

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))

print("--")


#  Sample output,   SF: 100
#
#     +------------------+------------------------+--------+-------------+------------------+-----------------------------------------+--------------+---------------+
#     |            id(n) | labels(n)              |   n.id | n.name      |        n.pr_0001 | n.url                                   | n datatype   | n.page_rank   |
#     |------------------+------------------------+--------+-------------+------------------+-----------------------------------------+--------------+---------------|
#     |  562949953422903 | ['Place', 'Country']   |      0 | India       |      1.10979e+06 | http://dbpedia.org/resource/India       | node         |               |
#     | 1125899906844254 | ['Place', 'Country']   |      1 | China       |      1.10597e+06 | http://dbpedia.org/resource/China       | node         |               |
#     |  281474976712490 | ['Place', 'Continent'] |   1454 | Asia        | 598063           | http://dbpedia.org/resource/Asia        | node         |               |
#     | 1125899906844528 | ['Place', 'Continent'] |   1456 | Europe      | 223494           | http://dbpedia.org/resource/Europe      | node         |               |
#     |             1652 | ['Place', 'Country']   |     50 | Germany     | 218038           | http://dbpedia.org/resource/Germany     | node         |               |
#     |  844424930133531 | ['Place', 'Country']   |     53 | Mexico      | 214180           | http://dbpedia.org/resource/Mexico      | node         |               |
#     |             1651 | ['Place', 'Country']   |     49 | Brazil      | 213752           | http://dbpedia.org/resource/Brazil      | node         |               |
#     |  562949953422911 | ['Place', 'Country']   |     55 | Philippines | 211883           | http://dbpedia.org/resource/Philippines | node         |               |
#     |  844424930133532 | ['Place', 'Country']   |     54 | Pakistan    | 211776           | http://dbpedia.org/resource/Pakistan    | node         |               |
#     |  281474976712211 | ['Place', 'Country']   |     58 | Vietnam     | 210748           | http://dbpedia.org/resource/Vietnam     | node         |               |
#     +------------------+------------------------+--------+-------------+------------------+-----------------------------------------+--------------+---------------+


          0/? [?op/s]

+------------------+------------------------+--------+-------------+------------------+-----------------------------------------+--------------+---------------+
|            id(n) | labels(n)              |   n.id | n.name      |        n.pr_0001 | n.url                                   | n datatype   | n.page_rank   |
|------------------+------------------------+--------+-------------+------------------+-----------------------------------------+--------------+---------------|
|  562949953422903 | ['Place', 'Country']   |      0 | India       |      1.10979e+06 | http://dbpedia.org/resource/India       | node         |               |
| 1125899906844254 | ['Place', 'Country']   |      1 | China       |      1.10597e+06 | http://dbpedia.org/resource/China       | node         |               |
|  281474976712490 | ['Place', 'Continent'] |   1454 | Asia        | 598063           | http://dbpedia.org/resource/Asia        | node         |               |
| 1125899906844528 | ['Place', 'Co

In [47]:

#  Louvain obviously ..
#

l_nodetype = "Tag"
   #
my_graph_l = my_graph.project(node_types = [ l_nodetype ] )

#  Generate a unique prop name so we can run this over and over without edit ..
#
l_cntr += 1
   #
l_lv_propname = "lv_" + str(l_cntr).zfill(4)
l_time            = default_timer()


louvain_clustering(
    my_graph_l,
    result_property_name = l_lv_propname,
    is_symmetric=True,
)

print("Time in seconds: %d" % (default_timer() - l_time))
   #
print("--")


#  Sample output,
#
#  SF: 003
#     Time in seconds: 7
#
#  SF: 100
#     Time in seconds: 62
#


          0/? [?op/s]

Time in seconds: 62
--


In [48]:

#  Return Louvain value ..
#

l_query  = """
   MATCH (n: {0})
   RETURN n, n.{1}
   ORDER BY n.{1} DESC
   LIMIT 10

   """.format(l_nodetype, l_lv_propname)

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))

print("--")


#  Sample output,   SF: 100
#
#     +------------------+-------------+--------+------------------+-----------------------------------------------------+-------------+---------------------------------------------------------------------------------+--------------+------------------+
#     |            id(n) | labels(n)   |   n.id |        n.lv_0002 | n.name                                              |   n.pr_0001 | n.url                                                                           | n datatype   |        n.lv_0002 |
#     |------------------+-------------+--------+------------------+-----------------------------------------------------+-------------+---------------------------------------------------------------------------------+--------------+------------------|
#     | 1125899906847720 | ['Tag']     |  16077 | 1125899906847720 | Sea_to_Sea:_I_See_the_Cross                         |     1.69947 | http://dbpedia.org/resource/Sea_to_Sea:_I_See_the_Cross                         | node         | 1125899906847720 |
#     | 1125899906847719 | ['Tag']     |  16075 | 1125899906847719 | R&B_Transmogrification                              |     1.46029 | http://dbpedia.org/resource/R&B_Transmogrification                              | node         | 1125899906847719 |
#     | 1125899906847718 | ['Tag']     |  16070 | 1125899906847718 | Borrón_y_Cuenta_Nueva                               |     2.42998 | http://dbpedia.org/resource/Borrón_y_Cuenta_Nueva                               | node         | 1125899906847718 |
#     | 1125899906847717 | ['Tag']     |  16067 | 1125899906847717 | Rokuko                                              |     2.94539 | http://dbpedia.org/resource/Rokuko                                              | node         | 1125899906847717 |
#     | 1125899906847716 | ['Tag']     |  16066 | 1125899906847716 | Harmacy                                             |     1.96812 | http://dbpedia.org/resource/Harmacy                                             | node         | 1125899906847716 |
#     | 1125899906847715 | ['Tag']     |  16063 | 1125899906847715 | Süper_Star                                          |     4.19051 | http://dbpedia.org/resource/Süper_Star                                          | node         | 1125899906847715 |
#     | 1125899906847714 | ['Tag']     |  16062 | 1125899906847714 | Wood/Water                                          |     1.5587  | http://dbpedia.org/resource/Wood/Water                                          | node         | 1125899906847714 |
#     | 1125899906847713 | ['Tag']     |  16059 | 1125899906847713 | RemixeS                                             |    17.6068  | http://dbpedia.org/resource/RemixeS                                             | node         | 1125899906847713 |
#     | 1125899906847712 | ['Tag']     |  16051 | 1125899906847712 | Kaelakee_hääl                                       |     1.64234 | http://dbpedia.org/resource/Kaelakee_hääl                                       | node         | 1125899906847712 |
#     | 1125899906847711 | ['Tag']     |  16045 | 1125899906847711 | Heaven_&_Hell_–_A_Tribute_to_The_Velvet_Underground |     6.44432 | http://dbpedia.org/resource/Heaven_&_Hell_–_A_Tribute_to_The_Velvet_Underground | node         | 1125899906847711 |
#     +------------------+-------------+--------+------------------+-----------------------------------------------------+-------------+---------------------------------------------------------------------------------+--------------+------------------+


          0/? [?op/s]

+------------------+-------------+--------+------------------+-----------------------------------------------------+-------------+---------------------------------------------------------------------------------+--------------+------------------+
|            id(n) | labels(n)   |   n.id |        n.lv_0002 | n.name                                              |   n.pr_0001 | n.url                                                                           | n datatype   |        n.lv_0002 |
|------------------+-------------+--------+------------------+-----------------------------------------------------+-------------+---------------------------------------------------------------------------------+--------------+------------------|
| 1125899906847720 | ['Tag']     |  16077 | 1125899906847720 | Sea_to_Sea:_I_See_the_Cross                         |     1.69947 | http://dbpedia.org/resource/Sea_to_Sea:_I_See_the_Cross                         | node         | 1125899906847720 |
| 1125899906

In [49]:

#  Betweeness Centrality ..
#

#  Generate a unique prop name so we can run this over and over without edit ..
#
l_cntr += 1
   #
l_bc_propname = "bc_" + str(l_cntr).zfill(4)
l_time            = default_timer()


betweenness_centrality(
   my_graph, result_property_name = l_bc_propname, num_sources = 100
   )

print("Time in seconds: %d" % (default_timer() - l_time))
   #
print("--")


#  Sample output,
#
#  SF: 003
#     Time in seconds: 85
#
#  SF: 100
#    Time in seconds: 3063
#



          0/? [?op/s]

Time in seconds: 3063
--


In [None]:

#  Return Betweenness Centrality value ..
#

l_query  = """
   MATCH (n)
   RETURN n, n.{0}
   ORDER BY n.{0} DESC
   LIMIT 10

   """.format(l_bc_propname)

l_result = my_graph.query_unpaginated(l_query)
   #
print(tabulate(l_result, headers = "keys", tablefmt = "psql", showindex = False))

print("--")


#  Sample output,   SF: 100
#



          0/? [?op/s]