In [3]:
cd ..

/Users/amiyaguchi/wikipedia-retention


In [4]:
from src.data.snap_import_user_projection import UnimodalUserProjection
from pyspark.sql import SparkSession, functions as F

# Attach to the running Spark session, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

# Run the user-projection pipeline over the compacted enwiki metadata.
# NOTE(review): later cells read a "bipartite" table, presumably registered
# by this pipeline -- confirm in src/data/snap_import_user_projection.
input_path = "data/processed/enwiki-meta-compact"
model = (
    UnimodalUserProjection(spark)
    .extract(input_path)
    .transform()
)

In [5]:
# Mark the bipartite table as cached and materialize it by counting its rows;
# the count is the cell's displayed value.
spark.table("bipartite").cache().count()

53595946

In [6]:
# Load the role assignments: tab-separated (user_id, role_id) pairs.
# Lines starting with '-' are treated as comments and leading whitespace
# in values is ignored.
role_df = (
    spark.read
    .schema("user_id INT, role_id INT")
    .option("sep", "\t")
    .option("comment", "-")
    .option("ignoreLeadingWhiteSpace", True)
    .csv("data/processed/rolx-roles")
)
role_df.show(5)

+-------+-------+
|user_id|role_id|
+-------+-------+
|  44750|      1|
|2118749|      1|
|  84417|      1|
| 921428|      1|
| 282514|      1|
+-------+-------+
only showing top 5 rows



In [7]:
# Load the (user_id, username) mapping of admin accounts.
admin_df = (
    spark.read
    .schema("user_id INT, username STRING")
    .csv("data/processed/admin_mapping.csv")
)

In [8]:
# Flag every role-assigned user as admin (1) or not (0): a user is an admin
# exactly when the left join found a username in the admin mapping.
role_admin = (
    role_df
    .join(admin_df, on="user_id", how="left")
    .select(
        "user_id",
        "role_id",
        F.col("username").isNotNull().cast("int").alias("is_admin"),
    )
)

role_admin.show(5)

+-------+-------+--------+
|user_id|role_id|is_admin|
+-------+-------+--------+
|  44750|      1|       0|
|2118749|      1|       0|
|  84417|      1|       0|
| 921428|      1|       0|
| 282514|      1|       0|
+-------+-------+--------+
only showing top 5 rows



In [9]:
# Peek at the five smallest distinct role ids (sorted on the driver).
sorted(role_admin.select("role_id").dropDuplicates().collect())[:5]

[Row(role_id=0),
 Row(role_id=1),
 Row(role_id=2),
 Row(role_id=3),
 Row(role_id=4)]

In [30]:
# Per role: number of distinct users and number of admin rows, largest roles first.
(
    role_admin
    .groupBy("role_id")
    .agg(
        F.countDistinct("user_id").alias("n_users"),
        F.sum("is_admin").alias("n_admins"),
    )
    .orderBy("n_users", ascending=False)
).show()

+-------+-------+--------+
|role_id|n_users|n_admins|
+-------+-------+--------+
|      1|  57823|     712|
|     15|  55728|       2|
|     21|  51549|       0|
|     14|  51364|       2|
|      6|  49790|       3|
|     24|  47307|       0|
|     26|  44357|       6|
|      3|  40123|       0|
|     23|  39089|       0|
|      0|  37421|       2|
|     18|  35279|       1|
|      9|  33855|       0|
|     10|  32043|       1|
|     20|  31615|       2|
|     29|  29690|       4|
|     25|  28259|       0|
|     17|  27643|       4|
|     27|  26358|       3|
|     19|  23922|       0|
|      7|  23751|       0|
+-------+-------+--------+
only showing top 20 rows



In [118]:
import numpy as np
from pyspark.ml.feature import VectorAssembler

# article_roles = (
#     spark.table("bipartite").limit(1000)
#     .join(role_admin, on="user_id", how="left")
#     .na.fill({"role_id": -1, "is_admin": 0})
#     # count roles per block
#     .groupby("article_id", "edit_date")
#     .pivot("role_id")
#     .agg(F.expr("count(distinct user_id) as n_users"))
#     .fillna(0)
# )


# Debug-sized sample of the edit table. The per-role counts and the per-block
# totals below are both derived from this one frame, so each share is computed
# over the same set of edits (previously the counts came from 1000 sampled
# rows while the totals came from the full table). The frame is cached because
# `limit` does not guarantee which rows it returns and it is consumed twice.
bipartite_sample = spark.table("bipartite").limit(1000).cache()

# Distinct users per (article, day, role); users without a role get role_id -1.
article_roles = (
    bipartite_sample
    .join(role_admin, on="user_id", how="left")
    .na.fill({"role_id": -1, "is_admin": 0})
    # count roles per block
    .groupby("article_id", "edit_date", "role_id")
    .agg(F.expr("count(distinct user_id) as n_users"))
)

# Distinct users per (article, day) block, over the same sampled edits.
article_total = (
    bipartite_sample
    .groupby("article_id", "edit_date")
    .agg(F.expr("count(distinct user_id) as total_users"))
)

# Replace the raw per-role count with the share of the block's users in that role.
normalized_article_roles = (
    article_roles
    .join(article_total, on=["article_id", "edit_date"])
    .withColumn("n_users", F.expr("n_users/total_users"))
)

+----------+----------+---+---+---+---+---+---+---+
|article_id| edit_date| -1|  0|  1| 14| 18| 26|  x|
+----------+----------+---+---+---+---+---+---+---+
|     32208|2002-12-14|  0|  0|  1|  0|  0|  0| 58|
|    235429|2004-09-10|  0|  0|  1|  0|  0|  0| 58|
|     63681|2002-01-20|  0|  0|  0|  1|  0|  0| 58|
|   1943938|2007-02-27|  0|  0|  1|  0|  0|  0| 58|
|     53348|2004-06-08|  0|  0|  1|  0|  0|  0| 58|
|  11133659|2007-12-08|  0|  0|  1|  0|  0|  0| 58|
|     34648|2003-05-17|  0|  0|  1|  0|  0|  0| 58|
|   1073920|2006-09-10|  0|  0|  1|  0|  0|  0| 58|
|  14610731|2003-05-29|  0|  0|  1|  0|  0|  0| 58|
|   4535938|2006-05-15|  0|  0|  1|  0|  0|  0| 58|
|    618672|2005-09-16|  0|  0|  1|  0|  0|  0| 58|
|    260914|2003-10-10|  0|  0|  1|  0|  0|  0| 58|
|    593926|2005-09-19|  0|  0|  1|  0|  0|  0| 58|
|     43507|2003-08-29|  0|  0|  1|  0|  0|  0| 58|
|    347833|2004-07-04|  0|  0|  1|  0|  0|  0| 58|
|     58893|2003-03-29|  0|  0|  1|  0|  0|  0| 58|
|    842430|

In [15]:
# Range of role ids present in the role assignments (used to size the role vector).
role_df.agg(F.min("role_id"), F.max("role_id")).show()

+------------+------------+
|min(role_id)|max(role_id)|
+------------+------------+
|           0|          30|
+------------+------------+



In [90]:
import numpy as np

# Attach role and admin flag to every edit row. Users that have no role
# assignment are kept (left join) and marked with role_id -1 / is_admin 0.
bipartite = (
    spark.table("bipartite")
    .join(role_admin, on="user_id", how="left")
    .fillna({"role_id": -1, "is_admin": 0})
)

# Number of edit rows per (article, day, role) block.
article_roles = (
    bipartite
    .groupBy("article_id", "edit_date", "role_id")
    .agg(F.count("user_id").alias("n_users"))
)


def seq_func(acc, data):
    """Fold one (role_id, n_users) record into the accumulator.

    `acc` is indexed by `data[0]` and incremented in place by `data[1]`;
    the same (mutated) accumulator object is returned. Indexing follows the
    accumulator's own rules, so with a sequence-like accumulator a role_id
    of -1 addresses its last slot.
    """
    role_id, n_users = data[0], data[1]
    acc[role_id] += n_users
    return acc

def comb_func(v1, v2):
    """Merge two partial accumulators by adding them with `+`."""
    merged = v1 + v2
    return merged

n_roles = 31  # max+1 of the dataframe role_id (known roles are 0..n_roles-1)

# Collapse the per-role counts of every (article_id, edit_date) block into one
# dense vector. Edits by users without a role carry role_id -1, and seq_func
# indexes the accumulator with role_id directly, so -1 addresses the LAST slot.
# The accumulator therefore gets one extra trailing slot reserved for those
# users; with only n_roles slots their counts would be added to role n_roles-1.
# Resulting layout: indices 0..n_roles-1 are real roles, index n_roles is "no role".
article_role_vec = (
    article_roles
    .rdd
    .map(lambda r: ((r.article_id, r.edit_date), (r.role_id, r.n_users)))
    .aggregateByKey(
        np.zeros(n_roles + 1),
        seq_func,
        comb_func
    )
)

# Per-user feature vector: the mean, over all of a user's edit rows, of the
# role distribution of the (article, day) block the edit belongs to.
# TODO: this is ugly, use a pandas df instead
features = (
    bipartite
    .select("user_id", "article_id", "edit_date")
    .rdd
    # key every edit by its block so it can be matched with the block's role vector
    .map(lambda row: ((row.article_id, row.edit_date), row.user_id))
    .join(article_role_vec)
    # drop the block key, leaving (user_id, role_vector)
    .values()
    # normalize the vector to shares and carry a row count of 1
    .mapValues(lambda vec: (vec / np.sum(vec), 1))
    # sum the shares and the counts per user
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    # divide to get the mean and flatten into (user_id, f0, f1, ...)
    .map(lambda kv: (kv[0], *(kv[1][0] / kv[1][1]).tolist()))
    .toDF()
)

# Persist the per-user features as four parquet part files.
(
    features
    .repartition(4)
    .write
    .parquet("data/processed/avg_role_features")
)