In [1]:
cd ..

/home/acmiyaguchi_gmail_com/wikipedia-retention


In [10]:
from src.data.snap_import_user_projection import UnimodalUserProjection
from pyspark.sql import SparkSession, functions as F, types as T

spark = SparkSession.builder.getOrCreate()

input_path = "data/processed/enwiki-meta-compact"
model = UnimodalUserProjection(spark).extract(input_path).transform()

In [4]:
block_list = spark.sql("""
with block_list as (
    select
        user_id,
        collect_set(article_id) as article_set
    from bipartite
    group by 1
)
select 
    user_id,
    size(article_set) as n_articles,
    article_set
from block_list
""")

block_list.cache()
%time block_list.count()

CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 1min 15s


1525755

In [5]:
block_list.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- n_articles: integer (nullable = false)
 |-- article_set: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [6]:
block_list.selectExpr("max(n_articles)").show()

+---------------+
|max(n_articles)|
+---------------+
|         723404|
+---------------+



In [7]:
# calculate markov bounds for cliques of size 1-n based on variables
from scipy.optimize import fsolve
import numpy as np

n = 723404
epsilon = 0.01

# loop over all n using previous value as seed
any_bound = {}
all_bound = {}
p_one = 1
p_all = 1
for k in range(2,n+1):
    func_one = lambda p: ((1-p) ** (k-1)) / epsilon - 1
    func_any = lambda p: (1 - ((1- ((1-p) ** (k-1))) ** k)) / epsilon - 1
    p_one = fsolve(func_one,p_one)[0]
    p_all = fsolve(func_any,p_all)[0]
    any_bound[k] = p_one
    all_bound[k] = p_all

  improvement from the last ten iterations.


In [8]:
any_bound

{2: 0.99,
 3: 0.9,
 4: 0.7845565309968165,
 5: 0.6837722339831622,
 6: 0.6018928294465039,
 7: 0.5358411166387044,
 8: 0.48205253207687876,
 9: 0.4376586748096509,
 10: 0.40051574968105896,
 11: 0.36904265551982696,
 12: 0.3420667753424343,
 13: 0.31870793094203903,
 14: 0.29829617132961733,
 15: 0.280314326998848,
 16: 0.2643577455403587,
 17: 0.2501057906675441,
 18: 0.2373014140976556,
 19: 0.22573631731887295,
 20: 0.21524002964853872,
 21: 0.20567176527571857,
 22: 0.19691427786084856,
 23: 0.18886916921031285,
 24: 0.1814532692931085,
 25: 0.17459581473198776,
 26: 0.16823622889733245,
 27: 0.1623223599317101,
 28: 0.1568090707133754,
 29: 0.15165710175592864,
 30: 0.1468321475827196,
 31: 0.14230410140910613,
 32: 0.138046433524697,
 33: 0.13403567663993476,
 34: 0.13025099738221677,
 35: 0.1266738376171567,
 36: 0.12328761270313177,
 37: 0.120077456430893,
 38: 0.11703000445059106,
 39: 0.11413320958991738,
 40: 0.11137618372565966,
 41: 0.10874906186625444,
 42: 0.106242884894

In [11]:
@F.udf(T.ArrayType(T.ArrayType(T.IntegerType())))
def all_edges(user_set):
    return list(combinations(sorted(user_set), 2))

block_list.selectExpr("n_articles*(n_articles-1)/2 as n_edges").selectExpr("sum(n_edges)").show()

+---------------+
|   sum(n_edges)|
+---------------+
|9.2484011771E10|
+---------------+



In [None]:
from pyspark.sql import types as T
from itertools import combinations
from random import random

bounds = any_bound

@F.udf(T.ArrayType(T.ArrayType(T.IntegerType())))
def sample_edges(user_set):
    k = len(user_set)
    if k < 2:
        return []
    p = bounds[k]
    edges = [c for c in combinations(sorted(user_set), 2) if random() < p]
    return edges

edgelist = (
    block_list
    .select(F.explode(sample_edges("article_set")).alias("edges"))
    .select(F.col("edges").getItem(0).alias("e1"), F.col("edges").getItem(1).alias("e2"))
    .groupby("e1", "e2")
    .agg(F.expr("count(*) as weight"))
)

edgelist.coalesce(10).write.parquet("data/processed/article-network-v3")

In [None]:
edgelist = spark.read.parquet("data/processed/article-network-v3")
edgelist.count()