In [1]:
cd ..

/home/amiyaguchi/cs224w/wikipedia-retention


In [32]:
# Peek at the first lines of the saved bounds table; its header names the
# columns k, p_one and p_any.
# NOTE(review): no cell visible in this notebook writes this file, and the
# header is comma-separated while the rows shown are space-separated --
# confirm how the file is produced before parsing it as CSV.
!head data/processed/markov_bounds.csv

k,p_one,p_any
2 0.99000000 0.99498744
3 0.90000000 0.94216829
4 0.78455653 0.86410869
5 0.68377223 0.78831332
6 0.60189283 0.72155904
7 0.53584112 0.66416306
8 0.48205253 0.61492493
9 0.43765867 0.57247457
10 0.40051575 0.53560803


In [6]:
from src.data.snap_import_user_projection import UnimodalUserProjection
from pyspark.sql import SparkSession, functions as F

# Attach to the active Spark session, or start one if none is running.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Run the project's user-projection step on the compact enwiki metadata dump.
input_path = "data/processed/enwiki-meta-compact"
projection = UnimodalUserProjection(spark)
model = projection.extract(input_path).transform()

In [82]:
# Build one "block" per (article, year-quarter): the set of distinct users that
# edited the article in that quarter (`user_set`) and the size of that set
# (`n_users`).
# Reads the `bipartite` table/view, which must already be registered with the
# Spark session (presumably by the UnimodalUserProjection step -- TODO confirm).
# `group by 1,2` groups by the first two select-list items, i.e. article_id and
# the "<year>-<quarter>" string.
# NOTE(review): the quarter key is aliased `edit_date` here, but the recorded
# printSchema output further down shows `edit_quarter` -- confirm which name is
# current and re-run so code and output agree.
block_list = spark.sql("""
with block_list as (
    select
        article_id,
        concat(year(edit_date), '-', quarter(edit_date)) as edit_date,
        collect_set(user_id) as user_set
    from bipartite
    group by 1,2
)
select 
    article_id,
    edit_date,
    size(user_set) as n_users,
    user_set
from block_list
""")

# Mark the result for caching; count() is an action, so it triggers the actual
# computation, and %time reports how long that takes.
block_list.cache()
%time block_list.count()

CPU times: user 12.2 ms, sys: 7.06 ms, total: 19.3 ms
Wall time: 1min 40s


13713183

In [83]:
# Show the column names and types of the cached block list.
# NOTE(review): the recorded output lists `edit_quarter`, while the query that
# builds block_list aliases that column as `edit_date` -- the output looks
# stale; re-run to confirm.
block_list.printSchema()

root
 |-- article_id: integer (nullable = true)
 |-- edit_quarter: string (nullable = true)
 |-- n_users: integer (nullable = false)
 |-- user_set: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [84]:
# Largest block size: the most distinct users found in any single
# (article, quarter) group. The bounds cell that follows hard-codes this value
# as `n`, so update it there if this number changes.
block_list.selectExpr("max(size(user_set))").show()

+-------------------+
|max(size(user_set))|
+-------------------+
|               1732|
+-------------------+



In [85]:
# Calculate the edge-sampling probabilities ("markov bounds") for cliques of
# size 2..n. Each bound is the value of p (probability of keeping one edge)
# that solves one of these equations:
#
#   (1 - p)**(k - 1) == epsilon                 -> stored in any_bound[k] ("p_one")
#   1 - (1 - (1 - p)**(k - 1))**k == epsilon    -> stored in all_bound[k] ("p_any")
#
# Both equations have a closed-form solution, so they are solved directly
# instead of with a seeded fsolve() root search whose convergence status was
# never checked.
#
# NOTE: the dictionary names are kept because later cells use them, but mind
# the mapping: `any_bound` holds the "p_one" solution and `all_bound` holds the
# "p_any" solution.
import math

from scipy.optimize import fsolve  # no longer used here; kept in case other cells rely on it
import numpy as np

n = 1732  # largest block size, taken from the max(size(user_set)) output above
epsilon = 0.01


def markov_bounds(k, epsilon):
    """Return ``(p_one, p_any)`` for a clique of ``k`` nodes.

    With each edge kept independently with probability ``p``:

    * ``p_one`` solves ``(1 - p)**(k - 1) == epsilon`` -- the chance that one
      given node loses all of its ``k - 1`` edges equals ``epsilon``.
    * ``p_any`` solves ``1 - (1 - (1 - p)**(k - 1))**k == epsilon`` -- the same
      per-node event combined over all ``k`` nodes as if they were independent.

    Parameters
    ----------
    k : int
        Clique size, at least 2.
    epsilon : float
        Target probability, strictly between 0 and 1.

    Returns
    -------
    tuple of float
        ``(p_one, p_any)``, both in the open interval (0, 1).

    Raises
    ------
    ValueError
        If ``k < 2`` or ``epsilon`` is not strictly between 0 and 1.
    """
    if k < 2:
        raise ValueError("k must be at least 2")
    if not 0 < epsilon < 1:
        raise ValueError("epsilon must be strictly between 0 and 1")

    # 1 - x**(1/m) is evaluated as -expm1(log(x)/m): for large m, x**(1/m) is
    # very close to 1 and the naive subtraction loses precision.
    p_one = -math.expm1(math.log(epsilon) / (k - 1))

    # Value that (1 - p)**(k - 1) must take for the second equation to hold:
    # 1 - (1 - epsilon)**(1/k).
    per_node = -math.expm1(math.log1p(-epsilon) / k)
    p_any = -math.expm1(math.log(per_node) / (k - 1))
    return p_one, p_any


any_bound = {}
all_bound = {}
for k in range(2, n + 1):
    p_one, p_all = markov_bounds(k, epsilon)
    any_bound[k] = p_one
    all_bound[k] = p_all

In [92]:
# Imported in this cell because `T` and `combinations` are used here but are
# otherwise only imported by a cell further down; without these lines a
# fresh-kernel, top-to-bottom run fails with NameError.
from itertools import combinations

from pyspark.sql import types as T


@F.udf(T.ArrayType(T.ArrayType(T.IntegerType())))
def all_edges(user_set):
    """Return every unordered pair of users in one block.

    The users are sorted first, so each pair is emitted once with the smaller
    id in the first position. Declared to Spark as an array of integer arrays.
    """
    return list(combinations(sorted(user_set), 2))


# Number of edges if every block were expanded into a full clique:
# the sum over blocks of n_users * (n_users - 1) / 2.
block_list.selectExpr("n_users*(n_users-1)/2 as n_edges").selectExpr("sum(n_edges)").show()

+------------+
|sum(n_edges)|
+------------+
|2.82368293E8|
+------------+



In [94]:
from pyspark.sql import types as T
from itertools import combinations
from random import random

# Per-block-size keep probability, looked up by the number of users in a block.
bounds = any_bound


@F.udf(T.ArrayType(T.ArrayType(T.IntegerType())))
def sample_edges(user_set):
    """Randomly keep a subset of the user pairs within one block.

    Every unordered pair of the sorted users is kept independently with the
    probability stored in ``bounds`` for this block's size; blocks with fewer
    than two users produce no pairs.

    NOTE: ``random()`` is not seeded in this cell, so the sampled edge list
    differs from run to run.
    """
    block_size = len(user_set)
    if block_size >= 2:
        keep_probability = bounds[block_size]
        kept_pairs = []
        for pair in combinations(sorted(user_set), 2):
            if random() < keep_probability:
                kept_pairs.append(pair)
        return kept_pairs
    return []


# One row per sampled pair.
sampled_pairs = block_list.select(F.explode(sample_edges("user_set")).alias("edges"))

# Split each pair into its two endpoint columns.
endpoints = sampled_pairs.select(
    F.col("edges").getItem(0).alias("e1"),
    F.col("edges").getItem(1).alias("e2"),
)

# Edge weight = number of blocks in which the pair was sampled.
edgelist = endpoints.groupby("e1", "e2").agg(F.expr("count(*) as weight"))

# Written as a single partition. No save mode is given, so Spark's default
# applies (expected to refuse an existing path -- confirm before re-running).
edgelist.repartition(1).write.parquet("data/processed/user-network-v3")

In [95]:
# Reload the edge list from the parquet output so later work reads the stored
# sample, then preview it (show() prints the first 20 rows by default).
network_path = "data/processed/user-network-v3"
edgelist = spark.read.parquet(network_path)
edgelist.show()

+-------+-------+------+
|     e1|     e2|weight|
+-------+-------+------+
| 571653| 587577|     1|
| 422965|1467795|     2|
| 612852|1725149|     7|
|   8029|  29856|    19|
|1218374|3447299|     1|
|  35314|  57658|     8|
| 271058|2128469|     3|
| 743015| 899701|    11|
| 109883| 234523|     1|
|1215485|2407864|     1|
|  14010|2336102|     1|
| 801279|1538132|     1|
| 959742|1251026|     2|
| 293907|1122589|     1|
| 304736| 603177|     2|
|2090843|2654847|     4|
| 521374| 642738|     1|
|  28107|  87543|     4|
|  81704|1587622|     1|
| 266416|1575512|    49|
+-------+-------+------+
only showing top 20 rows

