In [1]:
cd ..

/home/amiyaguchi/cs224w/wikipedia-retention


In [2]:
from src.data.snap_import_user_projection import UnimodalUserProjection
from pyspark.sql import SparkSession, functions as F

# Reuse the active Spark session when one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Run the project-local projection: extract from the input path, then transform.
# NOTE(review): presumably this registers the "bipartite" table read by later
# cells -- confirm in src/data/snap_import_user_projection.
input_path = "data/processed/enwiki-meta-compact"
projection = UnimodalUserProjection(spark)
model = projection.extract(input_path).transform()

In [3]:
# Mark the "bipartite" table for caching, then count its rows; the count is
# an action, so it also materializes the cache.
bipartite_edges = spark.table("bipartite")
bipartite_edges.cache()
bipartite_edges.count()

53595946

In [4]:
# Load the role assignments: tab-separated (user_id, role_id) pairs.
# Lines beginning with '-' are treated as comments and skipped, and leading
# whitespace in each field is stripped before parsing.
role_df = (
    spark.read
    .schema("user_id INT, role_id INT")
    .options(sep='\t', comment='-', ignoreLeadingWhiteSpace=True)
    .csv("data/processed/rolx-roles")
)
role_df.show(n=5)

+-------+-------+
|user_id|role_id|
+-------+-------+
|  44750|      1|
|2118749|      1|
|  84417|      1|
| 921428|      1|
| 282514|      1|
+-------+-------+
only showing top 5 rows



In [5]:
# Sanity check: smallest and largest role id present in the file.
role_df.select(F.min("role_id"), F.max("role_id")).show()

+------------+------------+
|min(role_id)|max(role_id)|
+------------+------------+
|           0|          30|
+------------+------------+



In [6]:
# Load the (user_id, username) mapping; the join in the next cell treats
# every user listed here as an admin.
admin_df = (
    spark.read
    .schema("user_id INT, username STRING")
    .csv("data/processed/admin_mapping.csv")
)

In [7]:
# Attach an admin flag to every role assignment. The left join keeps all
# rows of role_df; a user is flagged (is_admin = 1) exactly when a matching
# row with a non-null username exists in admin_df.
is_admin_flag = F.col("username").isNotNull().cast("int").alias("is_admin")
role_admin = (
    role_df
    .join(admin_df, on="user_id", how='left')
    .select("user_id", "role_id", is_admin_flag)
)

role_admin.show(n=5)

+-------+-------+--------+
|user_id|role_id|is_admin|
+-------+-------+--------+
|  44750|      1|       0|
|2118749|      1|       0|
|  84417|      1|       0|
| 921428|      1|       0|
| 282514|      1|       0|
+-------+-------+--------+
only showing top 5 rows



In [8]:
# Collect the distinct role ids to the driver and show the five smallest.
distinct_role_rows = role_admin.select("role_id").distinct().collect()
sorted(distinct_role_rows)[:5]

[Row(role_id=0),
 Row(role_id=1),
 Row(role_id=2),
 Row(role_id=3),
 Row(role_id=4)]

In [9]:
# Per-role summary: number of distinct users and total of the admin flags,
# listing the most populous roles first.
role_summary = role_admin.groupBy("role_id").agg(
    F.countDistinct("user_id").alias("n_users"),
    F.sum("is_admin").alias("n_admins"),
)
role_summary.orderBy(F.col("n_users").desc()).show(n=5)

+-------+-------+--------+
|role_id|n_users|n_admins|
+-------+-------+--------+
|      1|  57823|     712|
|     15|  55728|       2|
|     21|  51549|       0|
|     14|  51364|       2|
|      6|  49790|       3|
+-------+-------+--------+
only showing top 5 rows



In [10]:
# Row cap used while prototyping. Set to None to run on the full edge list.
# With the cap in place, every downstream result describes only this sample.
SAMPLE_ROWS = 1000

# Label every row of the "bipartite" table with the editing user's role and
# admin flag. Users without a role assignment get role_id = -1 and
# is_admin = 0 so they survive the later pivot as their own bucket.
bipartite_labeled = (
    spark.table("bipartite")
    .join(role_admin, on="user_id", how="left")
    .na.fill({"role_id": -1, "is_admin": 0})
)

if SAMPLE_ROWS is None:
    bipartite = bipartite_labeled
else:
    # limit() without an ordering does not guarantee which rows come back,
    # and `bipartite` is evaluated several times by the cells below. Caching
    # the small sample keeps those evaluations on the same set of rows.
    bipartite = bipartite_labeled.limit(SAMPLE_ROWS).cache()

In [11]:
# For every (article_id, edit_date) pair, count the rows contributed by each
# role, producing one column per distinct role_id. Pairs with no rows for a
# role get 0 instead of null.
role_counts_by_article_day = (
    bipartite
    .groupBy("article_id", "edit_date")
    .pivot("role_id")
    .agg(F.count("user_id").alias("n_users"))
)
article_roles = role_counts_by_article_day.na.fill(0)

# Total number of rows (with a non-null user_id) per (article_id, edit_date)
# pair; used as the denominator when normalizing the per-role counts.
totals = bipartite.groupBy("article_id", "edit_date").agg(
    F.count("user_id").alias("deg")
)

# Turn the per-role counts into fractions of each (article_id, edit_date)
# total. The first two columns of article_roles are the grouping keys;
# every remaining column is one pivoted role id.
article_day_keys = ["article_id", "edit_date"]
pivoted_role_ids = article_roles.columns[2:]
role_fractions = [
    (F.col(role_id) / F.col("deg")).alias(f"role_{role_id}")
    for role_id in pivoted_role_ids
]

normalized = (
    article_roles
    .join(totals, on=article_day_keys)
    .select(*article_day_keys, *role_fractions)
)

# Attach the role fractions of each (article_id, edit_date) pair to every row
# of that pair, then sum them per user: one row per user_id, one column per
# role_* fraction.
role_fraction_columns = normalized.columns[2:]
user_roles = (
    bipartite
    .join(normalized, on=["article_id", "edit_date"])
    .groupBy("user_id")
    .agg(*(F.sum(name).alias(name) for name in role_fraction_columns))
)

# Disabled export of the result, kept for reference:
# user_roles.repartition(1).write.parquet("data/processed/rolx_article_distribution")

In [12]:
# Pull the per-user role totals onto the driver as a pandas DataFrame.
# `x` is kept as the name because a later cell reads it.
user_roles_pdf = user_roles.toPandas()
x = user_roles_pdf

In [15]:
# Display every column except the first one (user_id).
role_totals_only = x.iloc[:, 1:]
role_totals_only

Unnamed: 0,role_1,role_6,role_26
0,195.0,0.0,0.0
1,0.0,0.0,266.0
2,0.0,78.0,0.0
3,461.0,0.0,0.0
