In [2]:
import os
from getpass import getpass

from py2neo import Graph

In [3]:
# Credentials are read from the environment (NEO4J_USER / NEO4J_PASSWORD),
# with an interactive prompt as a fallback for the password, so that no
# secret is stored in the notebook source or its version history.
graph = Graph(
    auth=(
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: '),
    )
)

In [4]:
# Echo the connection object; its repr (see the cell output) shows the bolt URI in use.
graph

Graph('bolt://neo4j@localhost:7687')

In [5]:
# Seed keyword: construct_query uses it as the `keyword` value of the starting k1 node.
phrase = "myers battery backup sump pump manual"

In [6]:
def construct_query(phrase, min_n_keywords = 3, n_common_urls = 4):
    """
    Build a Cypher query that grows a chain of keywords outward from `phrase`.

    The query starts at the Keyword node whose `keyword` property equals
    `phrase` (k1) and matches a neighbor k2 over an ASSOCIATED_WITH
    relationship r1 whose `urls` list has more than `n_common_urls` entries.
    It then repeatedly matches k_{n+1} as a neighbor of k_n that is not
    already part of the chain. After each extension the `urls` lists of all
    relationships matched so far (r1..rn) are intersected with
    `apoc.coll.intersection`, and the row is kept only if more than
    `n_common_urls` urls remain.

    Note that only consecutive keywords (k_n, k_{n+1}) are required to be
    connected, so the result is a chain rather than a full clique.

    Parameters
    ----------
    phrase : str
        Value of the `keyword` property of the starting node. Backslashes
        and double quotes are escaped before being embedded in the query.
    min_n_keywords : int
        Number of keywords (k1..k_min_n_keywords) in each returned group.
        Must be at least 2, because the query always matches k1 and k2.
    n_common_urls : int
        Threshold that the number of shared urls must exceed (strictly).

    Returns
    -------
    str
        The Cypher query text. Each result row has two columns: `group`
        (the list of keyword strings) and `commonUrls`.

    Raises
    ------
    ValueError
        If `min_n_keywords` is smaller than 2.
    """
    if min_n_keywords < 2:
        raise ValueError(
            'min_n_keywords must be at least 2, got {!r}'.format(min_n_keywords)
        )

    # The phrase is embedded in a double-quoted Cypher string literal, so
    # escape backslashes first and then double quotes; otherwise a phrase
    # containing either character would break (or alter) the query.
    escaped_phrase = str(phrase).replace('\\', '\\\\').replace('"', '\\"')

    # Find keyword
    match_phrase = (
        'MATCH (k1:Keyword {{keyword: "{phrase}"}})-[r1:ASSOCIATED_WITH]-(k2:Keyword)\n'
        'WHERE size(r1.urls) > {n_common_urls}\n'
        'WITH *\n'
    ).format(phrase=escaped_phrase, n_common_urls=n_common_urls)

    # One extension step: find k_n+1 among k_n's neighbors such that the urls
    # shared by r1..rn still number more than n_common_urls.
    # (The threshold used to be hard-coded to 4 here, ignoring the argument.)
    extension_template = (
        'MATCH (k{n})-[r{n}:ASSOCIATED_WITH]-(k{nplus1})\n'
        'WHERE NOT k{nplus1} in [{keywords_up_to_n}]\n'
        'WITH *,reduce(intersect = r1.urls, r IN [{relationships_up_to_n}] | apoc.coll.intersection(intersect, r.urls)) AS commonUrls\n'
        'WHERE size(commonUrls) > {n_common_urls}\n'
    )
    extension_steps = []
    for n in range(2, min_n_keywords):
        keywords_up_to_n = ', '.join('k' + str(i) for i in range(1, n + 1))
        relationships_up_to_n = ', '.join('r' + str(i) for i in range(1, n + 1))
        extension_steps.append(
            extension_template.format(
                n=n,
                nplus1=n + 1,
                keywords_up_to_n=keywords_up_to_n,
                relationships_up_to_n=relationships_up_to_n,
                n_common_urls=n_common_urls,
            )
        )
    find_next_keyword = ''.join(extension_steps)

    # With exactly two keywords no extension step runs, so `commonUrls` was
    # never bound; in that case the shared urls are simply those of r1.
    if min_n_keywords == 2:
        common_urls_column = 'r1.urls AS commonUrls'
    else:
        common_urls_column = 'commonUrls'

    return_keywords = ', '.join('k' + str(i) for i in range(1, min_n_keywords + 1))
    return_query = (
        'RETURN [k in [{return_keywords}] | k.keyword] AS group, {common_urls_column}\n'
    ).format(return_keywords=return_keywords, common_urls_column=common_urls_column)

    return match_phrase + find_next_keyword + return_query

In [8]:
%%time
# Build the 5-keyword query, run it, and pull every result row into a table.
foo = graph.run(construct_query(phrase, min_n_keywords = 5)).to_table()

CPU times: user 16.4 s, sys: 155 ms, total: 16.6 s
Wall time: 16.7 s


In [9]:
# Number of rows (keyword groups) returned by the query above.
len(foo)

329136

In [10]:
# Peek at the first row: a (group, commonUrls) pair, as named in the query's RETURN clause.
foo[0]

(['myers battery backup sump pump manual',
  'rain barrel sump pump discharge',
  'sump pump use basement',
  'deko sump pump manual',
  'sump pump on switch'],
 [21219, 27267, 4357, 27162, 3085])

In [11]:
%%time
# Same query with 6 keywords per group. Note: this rebinds `foo`, discarding the 5-keyword results.
foo = graph.run(construct_query(phrase, min_n_keywords = 6)).to_table()

CPU times: user 5min 28s, sys: 3.01 s, total: 5min 31s
Wall time: 5min 33s
