In [1]:
# Step up one directory so the relative "data/..." paths used by later cells resolve.
# NOTE(review): this is not idempotent - re-running the cell moves up one more level.
cd ..

/home/amiyaguchi/cs224w/wikipedia-retention


In [17]:
# List what already exists under data/processed/ (shell escape, output only).
! ls data/processed/

2007-1-enwiki-projection-user.csv	 kcore-2007-1.csv
2007-1-enwiki-projection-user-roles.csv  louvain_level5_modularity.tsv
2007-1-user-network-v3.csv		 markov_bounds.csv
2007-2-enwiki-projection-user.csv	 role-features
2007-2-enwiki-projection-user-roles.csv  role-nmf-G-v1.csv
2007-3-enwiki-projection-user.csv	 role-nmf-v1.csv
2007-3-enwiki-projection-user-roles.csv  roles
2007-4-enwiki-projection-user.csv	 rolx_article_distribution
2007-4-enwiki-projection-user-roles.csv  rolx-roles
aa_full_headers.csv			 rolx-v
admin_mapping.csv			 user-network-v3
all_article_features.csv		 user-network-v3.csv
all_user_features.csv			 user-network-v3-mappings
base_features_reg.csv			 user-network-v3-v
community_norm_features.csv		 uu_full.csv
enwiki-meta-compact


In [2]:
from pyspark.sql import SparkSession, functions as F, types as T

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Load the interim parquet dataset; later cells read its array column "main".
enmain = spark.read.parquet("data/interim/enwiki-meta-main")
# Register the frame under the name "enmain" so it can be queried from spark.sql.
enmain.createOrReplaceTempView("enmain")

In [16]:
from pyspark.sql.window import Window

# Revisions of one article, ordered by rev_id; lag() over this window reads the
# row immediately before the current revision within the same article.
previous_revision = Window.partitionBy("article_id").orderBy("rev_id")


def transform(enmain):
    """Annotate each revision with how its "main" array changed since the previous one.

    Args:
        enmain: DataFrame with the columns selected below, including the array
            column "main".

    Returns:
        DataFrame with the identifying columns plus:
          * link_diff   - size of "main" minus the size of the previous
                          revision's "main";
          * link_remove - elements in the previous "main" but not the current one;
          * link_add    - elements in the current "main" but not the previous one.
        Rows are ordered by article_title, then rev_id.

    lag() has no previous row for the first revision in a partition and yields
    null there, so the derived columns are null for that revision.
    """
    links_now = F.col("main")
    links_before = F.lag("main").over(previous_revision)
    size_before = F.lag(F.size("main")).over(previous_revision)

    derived = [
        (F.size("main") - size_before).alias("link_diff"),
        F.array_except(links_before, links_now).alias("link_remove"),
        F.array_except(links_now, links_before).alias("link_add"),
    ]
    carried = [
        "user_id",
        "username",
        "article_id",
        "article_title",
        "textdata",
        "rev_id",
        "timestamp",
    ]

    annotated = enmain.select(*carried, *derived)
    return annotated.orderBy("article_title", "rev_id")


rev_diffs = transform(enmain)
rev_diffs.createOrReplaceTempView("rev_diffs")

In [20]:
# Materialise the per-revision diffs to parquet, replacing any previous output at this path.
rev_diffs.write.parquet("data/processed/enwiki-meta-main-compact/v1", mode="overwrite")

In [21]:
# Rebind rev_diffs to the parquet copy just written, so later cells read the stored
# result rather than the transform() query plan.
rev_diffs = spark.read.parquet("data/processed/enwiki-meta-main-compact/v1")

In [22]:
# Row count of the stored revision diffs, timed.
%time rev_diffs.count()

CPU times: user 2.37 ms, sys: 2.26 ms, total: 4.63 ms
Wall time: 543 ms


116590880

In [24]:
# Build the link-lifetime table.
#   link_from: one row per (revision, link added in that revision), link in "src".
#   link_to:   one row per (revision, link removed in that revision), link in "dst".
# explode() emits no row for a null or empty array, so revisions that add
# (or remove) nothing simply do not appear on that side.
link_from = rev_diffs.withColumn("src", F.explode("link_add"))
link_to = rev_diffs.withColumn("dst", F.explode("link_remove"))

link_from.createOrReplaceTempView("link_from")
link_to.createOrReplaceTempView("link_to")

edges = spark.sql("""
-- One LEFT OUTER JOIN on the real match condition.
-- Every addition is kept. A later removal of the same link in the same
-- article fills in removed_ts / span_rev / span_seconds / dst; an addition
-- with no later removal keeps those four columns NULL.
--
-- The previous version unioned an inner join with a LEFT OUTER JOIN whose ON
-- clause required t.dst IS NULL. t.dst comes from explode(), so that clause
-- did not match and every addition got a NULL row, including additions
-- that were later removed.
--
-- Several never-removed links added in one revision produce identical rows
-- here, because dst is NULL for all of them; each row is one addition.
--
-- NOTE(review): an addition is paired with every later removal of the same
-- link, not only the next one. Confirm that is intended for re-added links.

SELECT
    s.user_id,
    s.username,
    s.article_id,
    s.rev_id,
    s.timestamp as added_ts,
    t.timestamp as removed_ts,
    t.rev_id - s.rev_id as span_rev,
    (unix_timestamp(t.timestamp) - unix_timestamp(s.timestamp)) as span_seconds,
    s.article_title as src,
    t.dst
FROM link_from AS s
LEFT OUTER JOIN link_to AS t
ON s.article_title = t.article_title
    AND s.src = t.dst
    AND s.rev_id < t.rev_id
    AND s.timestamp < t.timestamp
""")

# Persist ordered by source article and revision, replacing any previous output.
edges.orderBy("src", "rev_id").write.parquet("data/processed/rev_history/v1", mode="overwrite")

In [25]:
# Rebind edges to the stored parquet copy and preview the first rows.
edges = spark.read.parquet("data/processed/rev_history/v1")
edges.show()

+-------+--------------+----------+--------+-------------------+-------------------+--------+------------+--------------------+--------------------+
|user_id|      username|article_id|  rev_id|           added_ts|         removed_ts|span_rev|span_seconds|                 src|                 dst|
+-------+--------------+----------+--------+-------------------+-------------------+--------+------------+--------------------+--------------------+
|1328606|        Krm500|    166570|92570903|2006-12-06 15:28:33|2006-12-08 08:42:49|  380237|      148456|List_of_tallest_s...|          Gothenburg|
|1328606|        Krm500|    166570|92570903|2006-12-06 15:28:33|               null|    null|        null|List_of_tallest_s...|                null|
|1382933|           PKT|    166570|92924884|2006-12-08 06:07:28|2006-12-08 08:42:49|   26256|        9321|List_of_tallest_s...|    Toronto,_Ontario|
|1382933|           PKT|    166570|92924884|2006-12-08 06:07:28|               null|    null|        null|

In [None]:
# Total number of rows in the edge table, timed.
%time edges.count()

In [None]:
# Count the rows whose removal side is empty. The edge table has no "to"
# column; the null-able column written by the edge query is "dst".
edges.where("dst is null").count()