In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("../data/processed/enwiki-meta-compact")
df.createOrReplaceTempView("enwiki")

In [39]:
def get_edges(period):
    query = """
    with subset as (
        SELECT
            concat(year, '-', quarter) as period,
            article_id,
            cast(user_id as int) as user_id,
            textdata
        FROM
            enwiki
    ),
    
    -- remove all nodes where the degree is < 2
    degree as (
        SELECT
            user_id,
            count(distinct article_id) as degree
        FROM subset
        GROUP BY 1
    )
    
    -- collect the weighted edge-list
    SELECT
        subset.user_id,
        article_id,
        sum(textdata) as word_count,
        count(*) as num_edits 
    FROM
        subset
    INNER JOIN degree ON subset.user_id = degree.user_id
    WHERE
        degree > 1 AND
        subset.user_id is not null AND
        period = '{}'
    GROUP BY 1, 2
    """.format(period)

    res = spark.sql(query)
    return res

get_edges('2002-1').show(n=5)

+-------+----------+----------+---------+
|user_id|article_id|word_count|num_edits|
+-------+----------+----------+---------+
|    148|     37238|        58|        1|
|    148|      3390|      2489|        1|
|    148|     37009|        47|        1|
|    148|     37010|       235|        1|
|    148|     36816|       148|        1|
+-------+----------+----------+---------+
only showing top 5 rows



In [40]:
edges = get_edges('2007-1')
edges.cache()
edges.createOrReplaceTempView("edges")
print(edges.count())
edges.show(n=5)

4565574
+-------+----------+----------+---------+
|user_id|article_id|word_count|num_edits|
+-------+----------+----------+---------+
|    148|     48819|      9274|        2|
|    148|   1507500|       269|        1|
|   1591|     38301|     14288|        1|
|   1591|   1381764|      2744|        1|
|   1591|   7346623|        80|        1|
+-------+----------+----------+---------+
only showing top 5 rows



In [41]:
edges.select("user_id").distinct().count()

225895

In [42]:
edges.where("num_edits > 1").select("user_id").distinct().count()

144544

In [43]:
edges.select("article_id").distinct().count()

1285417

Remove all nodes with `degree = 1` and where the number of shared articles is greater than 1.

In [55]:
query = """
-- TODO: jaccard index instead of common neighbors
with unimodal_projection as (
    SELECT
        t1.user_id as e1,
        t2.user_id as e2,
        count(*) as shared_articles
    FROM edges t1
    JOIN edges t2 ON t1.article_id = t2.article_id
    GROUP BY 1, 2
)

SELECT e1, e2, shared_articles
FROM unimodal_projection
WHERE shared_articles > 1
"""

unimodal = spark.sql(query)
print(unimodal.count())
unimodal.show()

7723278
+-------+-------+---------------+
|     e1|     e2|shared_articles|
+-------+-------+---------------+
|  33566|3828001|              3|
| 218757| 433328|              6|
|2201405|1257855|              3|
| 387151|  97951|              3|
| 190760|3076305|              4|
|  56299| 190760|              3|
|3128744| 881395|              2|
| 569777|1991207|              3|
|2946902|  37986|             30|
|  69412|3620496|              2|
| 579287|1583318|              4|
|1091760|  69412|              7|
|2690623|1546024|             39|
| 228773|1634354|              4|
|1232340|  28438|              4|
|   2954|3620496|              2|
|1991207|2707468|             17|
|3171782| 222638|              8|
| 222638|1574574|             12|
|1634354|1392310|             16|
+-------+-------+---------------+
only showing top 20 rows



In [56]:
unimodal.orderBy('e1').coalesce(1).write.csv('../data/processed/2007-1-enwiki-user-projection', sep='\t')

In [58]:
! tree ../data/processed/

[38;5;33m../data/processed/[0m
├── [38;5;33m2007-1-enwiki-user-projection[0m
│   ├── part-00000-fd318c59-0e28-4d6c-8d17-91c34aeb93b5-c000.csv
│   └── _SUCCESS
└── [38;5;33menwiki-meta-compact[0m
    ├── part-00000-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00001-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00002-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00003-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00004-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00005-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00006-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00007-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00008-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00009-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    └── _SUCCESS

2 di

In [59]:
! mv ../data/processed/2007-1-enwiki-user-projection/*.csv ../data/processed/2007-1-enwiki-user-projection.csv

In [60]:
! tree ../data/processed/

[38;5;33m../data/processed/[0m
├── [38;5;33m2007-1-enwiki-user-projection[0m
│   └── _SUCCESS
├── 2007-1-enwiki-user-projection.csv
└── [38;5;33menwiki-meta-compact[0m
    ├── part-00000-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00001-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00002-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00003-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00004-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00005-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00006-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00007-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00008-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    ├── part-00009-b9d9476b-cc88-44c4-8b82-f39efb715f54-c000.snappy.parquet
    └── _SUCCESS

2 directories, 13 files


In [61]:
! rm -r ../data/processed/2007-1-enwiki-user-projection/