In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Spark Preprocessing

In [1]:
from pyspark.sql import SparkSession, functions as F

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Load the parquet dataset and register it as the temp view "enwiki" so the
# SQL queries below can refer to it by name.
df = spark.read.parquet("../data/processed/enwiki-meta-compact")
df.createOrReplaceTempView("enwiki")

In [2]:
def get_edges_blocked(period=None):
    """Build the weighted user-article edge list, bucketed by day of year.

    Reads the ``enwiki`` temp view, optionally restricts it to one period, and
    aggregates the edits per ``(user_id, article_id, doy)``.

    Parameters
    ----------
    period : str, optional
        Period label matched against ``concat(year, '-', quarter)``, e.g.
        ``"2007-1"``. When ``None`` (the default) every row of ``enwiki`` is
        used, which is what the previous zero-argument version did.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``user_id``, ``article_id``, ``doy``, ``word_count``
        (sum of ``textdata``) and ``num_edits`` (row count). Rows whose
        ``user_id`` is null after the cast to int are dropped.

    Notes
    -----
    ``doy`` is ``dayofyear(timestamp)`` only. Without a ``period`` filter,
    edits made on the same day of the year in different years land in the
    same bucket.
    """
    query = """
    SELECT
        concat(year, '-', quarter) as period,
        dayofyear(timestamp) as doy,
        article_id,
        cast(user_id as int) as user_id,
        textdata
    FROM
        enwiki
    """
    edits = spark.sql(query)

    # Filter through the DataFrame API rather than formatting the value into
    # the SQL text, so the label never needs quoting or escaping.
    if period is not None:
        edits = edits.where(F.col("period") == str(period))

    # collect the weighted edge-list
    return (
        edits.where("user_id is not null")
        .groupBy("user_id", "article_id", "doy")
        .agg(
            F.sum("textdata").alias("word_count"),
            F.count("*").alias("num_edits"),
        )
    )


def project_common_neighbors_by_day(edges, threshold):
    """Project the user-article edge list onto user-user pairs.

    ``edges`` is registered as the temp view ``edges`` (replacing any view of
    that name) and self-joined on ``article_id`` and ``doy``. Each unordered
    user pair is emitted once, with the smaller ``user_id`` as ``e1``, along
    with the number of joined rows as ``shared_articles``.

    Parameters
    ----------
    edges : DataFrame
        Must provide ``user_id``, ``article_id`` and ``doy`` columns.
    threshold : number
        Only pairs with ``shared_articles`` strictly greater than this value
        are kept. The value is formatted directly into the SQL text.

    Returns
    -------
    DataFrame with columns ``e1``, ``e2``, ``shared_articles``.
    """
    # The SQL below refers to the input by name, so expose it as a view first.
    edges.createOrReplaceTempView("edges")

    sql_template = """
    with unimodal_projection as (
        SELECT
            t1.user_id as e1,
            t2.user_id as e2,
            count(*) as shared_articles
        FROM edges t1
        JOIN edges t2 
        ON t1.article_id = t2.article_id AND t1.doy = t2.doy
        WHERE t1.user_id < t2.user_id
        GROUP BY 1, 2
    )

    SELECT e1, e2, shared_articles
    FROM unimodal_projection
    WHERE shared_articles > {}
    """

    projection_sql = sql_template.format(threshold)
    user_pairs = spark.sql(projection_sql)
    return user_pairs
    

# Build the user-article-day edge list, then project it onto user-user pairs.
edge_blocked = get_edges_blocked("2007-1")
# The projection keeps pairs with shared_articles > threshold, so 0 keeps every pair.
elist_day = project_common_neighbors_by_day(edge_blocked, 0)
# Mark the projection for caching; count() runs the job and its value is the
# cell's displayed output.
elist_day.cache()
elist_day.count()

1825594

In [13]:
# Preview how `timestamp` renders as a calendar date (yyyy-MM-dd); show() prints the first 20 rows.
df.selectExpr("date_format(timestamp, 'yyyy-MM-dd')").show()

+----------------------------------+
|date_format(timestamp, yyyy-MM-dd)|
+----------------------------------+
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
|                        2007-07-19|
+----------------------------------+
only showing top 20 rows



In [15]:
# Inline user-user projection over the whole `enwiki` view, bucketed by calendar
# date (yyyy-MM-dd):
#   subset              -> one row per edit, with user_id cast to int
#   bipartite           -> one row per (user_id, article_id, edit_date), null users dropped
#   unimodal_projection -> user pairs (e1 < e2) and the number of joined
#                          (article_id, edit_date) rows they share
# NOTE(review): word_count and num_edits are computed in `bipartite` but never
# read by the projection or the final SELECT.
query = """
with subset as (
    SELECT
        date_format(timestamp, 'yyyy-MM-dd') as edit_date,
        article_id,
        cast(user_id as int) as user_id,
        textdata
    FROM
        enwiki
),
-- collect the weighted edge-list
bipartite as (
    SELECT
        user_id,
        article_id,
        edit_date,
        sum(log(textdata)) as word_count,
        count(*) as num_edits 
    FROM
        subset
    WHERE
        subset.user_id is not null
    GROUP BY 1, 2, 3
),
unimodal_projection as (
    SELECT
        t1.user_id as e1,
        t2.user_id as e2,
        count(*) as shared_articles
    FROM bipartite t1
    JOIN bipartite t2 
    ON t1.article_id = t2.article_id AND t1.edit_date = t2.edit_date
    WHERE t1.user_id < t2.user_id
    GROUP BY 1, 2
)

SELECT e1, e2, shared_articles
FROM unimodal_projection
"""

# count() executes the full self-join; the number of user pairs is the cell's output.
spark.sql(query).count()

12632948

In [18]:
# Count the distinct users (non-null integer user_id) who edited each article on
# each calendar day, busiest article-days first.
#
# The distinct count is taken directly on the filtered edit rows. A separate
# per-(user, article, day) aggregation with sum(log(textdata)) and count(*) is
# not needed here: neither value is read, and count(distinct user_id) per
# (article_id, edit_date) is the same with or without it.
#
# NOTE(review): an earlier form of this cell died with
# java.lang.OutOfMemoryError (Java heap space). If the lighter query still
# exhausts the heap in local mode, raise spark.driver.memory before the
# session is created.
query = """
with subset as (
    SELECT
        date_format(timestamp, 'yyyy-MM-dd') as edit_date,
        article_id,
        cast(user_id as int) as user_id
    FROM
        enwiki
)

select
    article_id,
    edit_date,
    count(distinct user_id) as num_users
from subset
where user_id is not null
group by 1, 2
order by num_users desc
"""
spark.sql(query).show()

Py4JJavaError: An error occurred while calling o187.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 10 in stage 30.0 failed 1 times, most recent failure: Lost task 10.0 in stage 30.0 (TID 2415, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.rdd.RDD$$anonfun$takeOrdered$1.apply(RDD.scala:1439)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1426)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:136)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3384)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2759)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:255)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:292)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Java heap space
