In [1]:
%load_ext autoreload
%autoreload 2

In [228]:
import json
import random
import numpy as np

from interfaces.db import DB

from utils.progress_bar import ProgressBar
from utils.text.io import save_json

In [135]:
db = DB()

In [23]:
query = """
SELECT DISTINCT SourcePageID AS PageID FROM piper_wikipedia.Page_Links_Random_Walk
UNION
SELECT DISTINCT TargetPageID AS PageID FROM piper_wikipedia.Page_Links_Random_Walk
"""

page_ids = db.execute_query(query)
page_ids = [page_id for (page_id,) in page_ids]

In [139]:
query = f"""
SELECT SourcePageID, TargetPageID
FROM piper_wikipedia.Page_Links_Random_Walk
"""

adjacencies = db.execute_query(query)

In [149]:
successors = {s: [] for s in page_ids}

for s, t in adjacencies:
    successors[s].append(t)

predecessors = {t: [] for t in page_ids}

for s, t in adjacencies:
    predecessors[t].append(s)

In [150]:
def get_random_pairs(n=100):
    sources = random.sample(page_ids, n)
    targets = random.sample(page_ids, n)

    return list(zip(sources, targets))

In [151]:
pairs = get_random_pairs()

In [193]:
def B(u, n):
    if n == 0:
        return {u}
    
    if n == 1:
        return {u} | set(successors[u])
    
    if n == -1:
        return {u} | set(predecessors[u])
    
    if n >= 2:
        B_prev = B(u, n - 1)
        S_prev = B_prev - B(u, n - 2)
        
        ball = B_prev
        for v in S_prev:
            ball |= B(v, 1)
        
        return ball
    
    if n <= -2:
        B_prev = B(u, n + 1)
        S_prev = B_prev - B(u, n + 2)
        
        ball = B_prev
        for v in S_prev:
            ball |= B(v, -1)

        return ball

def S(u, n):
    if n == 0:
        return {u}
    
    if n >= 1:
        return B(u, n) - B(u, n - 1)
    
    if n <= -1:
        return B(u, n) - B(u, n + 1)
    
    
def min_dist(s, t):    
    n = 0
    s_out_prev = set()
    t_in_prev = set()
    s_stable = False
    t_stable = False
    while True:
        if not s_stable:
            s_out = B(s, n)

            if s_out == s_out_prev:
                s_stable = True
            else:
                s_out_prev = s_out

        if not t_stable:
            t_in = B(t, -n)

            if t_in == t_in_prev:
                t_stable = True
            else:
                t_in_prev = t_in
        
        if s_out & t_in:
            return n
        
        if s_stable or t_stable:
            return None

        n += 1

In [170]:
min_dists = []

pb = ProgressBar(len(pairs))
for s, t in pairs:
    min_dists.append(min_dist(s, t))
    pb.update()

[##################################################] 100.00%


In [174]:
np.mean([d for d in min_dists if d is not None])

2.463917525773196

In [164]:
are_6_connected = []

pb = ProgressBar(len(pairs))
for s, t in pairs:
    are_6_connected.append(len(B(s, 3) & B(t, -3)) > 0)
    pb.update()

[##################################################] 100.00%


In [165]:
are_6_connected.count(True), are_6_connected.count(False)

(96, 4)

In [166]:
sources = []
sinks = []
isolated = []

for u in successors:
    if not successors[u]:
        sinks.append(u)
        
    if not predecessors[u]:
        sources.append(u)
    
    if not successors[u] and not predecessors[u]:
        isolated.append(u)

In [167]:
len(successors), len(predecessors), len(sources), len(sinks), len(isolated)

(864699, 864699, 37261, 8668, 0)

In [230]:
from definitions import DATA_DIR

save_json(successors, f'{DATA_DIR}/db_predecessors.json')
save_json(predecessors, f'{DATA_DIR}/db_predecessors.json')