In [52]:
import bz2
import os
import re

import mwparserfromhell
import numpy as np
import pandas as pd

import findspark
findspark.init('/usr/lib/spark2')

from pyspark.sql import SparkSession

In [3]:
spark = (
    SparkSession.builder
    .appName('Pyspark notebook (isaacj -- pagerank)')
    .master('yarn')
    .config(
        'spark.driver.extraJavaOptions',
        ' '.join('-D{}={}'.format(k, v) for k, v in {
            'http.proxyHost': 'webproxy.eqiad.wmnet',
            'http.proxyPort': '8080',
            'https.proxyHost': 'webproxy.eqiad.wmnet',
            'https.proxyPort': '8080',
        }.items()))
    .config('spark.jars.packages', 'graphframes:graphframes:0.6.0-spark2.3-s_2.11')
    .config("spark.driver.memory", "4g")
    .config('spark.dynamicAllocation.maxExecutors', 128)
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", 4)
    .config("spark.sql.shuffle.partitions", 512)
    .getOrCreate()
)
spark

In [5]:
snapshot = '2020-07'  # data will be current to this date -- e.g., 2020-07 means data is up to 30 June 2020 (at least)
wiki = 'enwiki'  # wikidb you want to run pagerank for

In [31]:
def getLinks(wikitext):
    """Extract list of links from wikitext for an article."""
    try:
        wt = mwparserfromhell.parse(wikitext)
        return [str(l.title).partition('#')[0].replace(' ', '_') for l in wt.filter_wikilinks()]
    except Exception:
        return None
    
spark.udf.register('getLinks', getLinks, 'Array<String>')

<function __main__.getLinks(wikitext)>

In [32]:
"""
Explanation of CTEs:
* title_to_id: mapping of page title to page ID, which is a more stable identifier
* redirects: resolve redirects so the network is much denser (~6M nodes instead of ~11M nodes)
* pagelinks: extract links from wikitext and explode so each row has one link
* pagelinks_reformatted: map link page titles to link page IDs
* final: resolve redirects and rename columns to match pagerank library expectations
"""

print_for_hive = False
do_execute = True

query = """
WITH title_to_id AS (
    SELECT page_id,
           page_title
      FROM wmf_raw.mediawiki_page
     WHERE snapshot = '{0}'
           AND wiki_db = '{1}'
           AND page_namespace = 0
),
redirects AS (
    SELECT mr.rd_from AS rd_from,
           tti.page_id AS rd_to
      FROM wmf_raw.mediawiki_redirect mr
     INNER JOIN title_to_id tti
           ON (mr.rd_title = tti.page_title)
     WHERE mr.snapshot = '{0}'
           AND mr.wiki_db = '{1}'
           AND mr.rd_namespace = 0
),
pagelinks AS (
    SELECT wt.page_id AS pl_from,
           explode(getLinks(wt.revision_text)) AS pl_title_to
      FROM wmf.mediawiki_wikitext_current wt
      LEFT ANTI JOIN redirects r
           ON (wt.page_id = r.rd_from)
     WHERE wt.snapshot = '{0}'
           AND wt.wiki_db = '{1}'
           AND wt.page_namespace = 0
),
pagelinks_reformatted AS (
    SELECT pl.pl_from AS pl_from,
           tti.page_id AS pl_to
      FROM pagelinks pl
     INNER JOIN title_to_id tti
           ON (pl.pl_title_to = tti.page_title)
)
    SELECT DISTINCT pl.pl_from AS src,
           COALESCE(r.rd_to, pl.pl_to) AS dst
      FROM pagelinks_reformatted pl
      LEFT JOIN redirects r
           ON (pl.pl_to = r.rd_from)
""".format(snapshot, wiki)

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    src_dst = spark.sql(query)
    src_dst.createOrReplaceTempView("src_dst")


WITH title_to_id AS (
    SELECT page_id,
           page_title
      FROM wmf_raw.mediawiki_page
     WHERE snapshot = '2020-07'
           AND wiki_db = 'enwiki'
           AND page_namespace = 0
),
redirects AS (
    SELECT mr.rd_from AS rd_from,
           tti.page_id AS rd_to
      FROM wmf_raw.mediawiki_redirect mr
     INNER JOIN title_to_id tti
           ON (mr.rd_title = tti.page_title)
     WHERE mr.snapshot = '2020-07'
           AND mr.wiki_db = 'enwiki'
           AND mr.rd_namespace = 0
),
pageLinks AS (
    SELECT wt.page_id AS pl_from,
           explode(getLinks(wt.revision_text)) AS pl_title_to
      FROM wmf.mediawiki_wikitext_current wt
      LEFT ANTI JOIN redirects r
           ON (wt.page_id = r.rd_from)
     WHERE wt.snapshot = '2020-07'
           AND wt.wiki_db = 'enwiki'
           AND wt.page_namespace = 0
),
pagelinks_reformatted AS (
    SELECT pl.pl_from AS pl_from,
           tti.page_id AS pl_to
      FROM pageLinks pl
     INNER JOIN title_to_id tti


In [33]:
src_dst.show(n=10)

+--------+----+
|     src| dst|
+--------+----+
|  532476|2510|
| 7007430|3052|
|11826583|3052|
|  386140|3052|
| 4370492|6466|
|45534636|6466|
|22307781|6466|
|38939808|6466|
|28583057|6466|
| 9277635|6466|
+--------+----+
only showing top 10 rows



In [34]:
# en.wikipedia.org/wiki/?curid=21349232
spark.sql('SELECT * FROM src_dst WHERE src = 21349232').show(100, False)

+--------+--------+
|src     |dst     |
+--------+--------+
|21349232|76508   |
|21349232|1037934 |
|21349232|17450530|
|21349232|76762   |
|21349232|44361   |
|21349232|2789889 |
|21349232|19599929|
|21349232|52710   |
|21349232|45773   |
|21349232|12304303|
|21349232|17851278|
|21349232|30864733|
|21349232|12319285|
|21349232|17766207|
+--------+--------+



In [35]:
"""
Explanation of CTEs:
* all_pageids: get set of all page IDs that show up in src or dst columns
* pageid_to_title: gather titles for each page ID because its easier to interpret
* final: join the two together
"""

print_for_hive = False
do_execute = True

query = """
WITH all_pageids AS (
    SELECT DISTINCT(page_id)
      FROM (
        SELECT src as page_id
          FROM src_dst
         UNION ALL
        SELECT dst as page_id
          FROM src_dst
          ) p
),
pageid_to_title AS (
    SELECT page_id,
           page_title
      FROM wmf_raw.mediawiki_page mp
     WHERE snapshot = '{0}'
           AND wiki_db = '{1}'
           AND page_namespace = 0
)
SELECT p.page_id as id,
       t.page_title as page_title
  FROM all_pageids p
  LEFT JOIN pageid_to_title t
            ON (p.page_id = t.page_id)
""".format(snapshot, wiki)

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    nodes = spark.sql(query)
    nodes.createOrReplaceTempView("nodes")


WITH all_pageids AS (
    SELECT DISTINCT(page_id)
      FROM (
        SELECT src as page_id
          FROM src_dst
         UNION ALL
        SELECT dst as page_id
          FROM src_dst
          ) p
),
pageid_to_title AS (
    SELECT page_id,
           page_title
      FROM wmf_raw.mediawiki_page mp
     WHERE snapshot = '2020-07'
           AND wiki_db = 'enwiki'
           AND page_namespace = 0
)
SELECT p.page_id as id,
       t.page_title as page_title
  FROM all_pageids p
  LEFT JOIN pageid_to_title t
            ON (p.page_id = t.page_id)



In [36]:
nodes.show(n=10)

+-----+--------------------+
|   id|          page_title|
+-----+--------------------+
| 1365|             Ammonia|
| 1990|            August_5|
| 2835|        Afghan_Hound|
| 2851|Abraham_Joshua_He...|
| 7312|          Chauvinism|
| 9762|  Ecumenical_council|
| 9890|   Electron_counting|
|10696|Military_of_the_F...|
|14392|            Howitzer|
|15392| Imperial_Conference|
+-----+--------------------+
only showing top 10 rows



## Run PageRank

In [None]:
from graphframes import *

In [37]:
## create graph object
g = GraphFrame(nodes, src_dst)

In [38]:
g.inDegrees.show(n=10)

+--------+--------+
|      id|inDegree|
+--------+--------+
| 5659330|     299|
|18915364|      45|
|  262135|     193|
|   37299|   11652|
|21224559|     223|
|  273285|   37410|
|  770909|    2021|
|30928502|      32|
|35106871|    1509|
| 5610601|       5|
+--------+--------+
only showing top 10 rows



In [39]:
# See: https://graphframes.github.io/graphframes/docs/_site/api/python/graphframes.html#graphframes.GraphFrame.pageRank
# Hyperparameters:
#   - resetProbability (inverse of damping factor: https://en.wikipedia.org/wiki/PageRank#Damping_factor)
#     - most sources suggest it should be 0.15
#   - maxIter is set to 40 here as that is the parameter used in: https://www.aifb.kit.edu/images/e/e5/Wikipedia_pagerank1.pdf
#     - you could also set the tolerance to 0.01 but I don't know how long that takes to converge for enwiki
# This shouldn't take more than 20-30 minutes for English Wikipedia
# There will be k jobs you can track at https://yarn.wikimedia.org/cluster/scheduler where k is the number of iterations
pr = g.pageRank(resetProbability=0.15, maxIter=40)
result = pr.vertices.sort('pagerank', ascending=False)
result.createOrReplaceTempView('pagerank')

In [40]:
# write pagerank results to TSV
query = """
SELECT pr.id as page_id,
       pr.pagerank as pagerank,
       n.page_title as page_title
  FROM pagerank pr
  LEFT JOIN nodes n
       ON (pr.id = n.id)
"""
results = spark.sql(query)
# this will write to 512 bzipped TSVs -- they can be easily compiled into 1 via Python or just use .coalesce(1) here
# to pull onto stat machines: stat100x$ hdfs dfs -copyToLocal /user/isaacj/pagerank-enwiki/part* .
results.write.csv(path="/user/isaacj/pagerank-{0}".format(wiki), compression="bzip2", header=True, sep="\t")


In [45]:
!hdfs dfs -copyToLocal pagerank-enwiki/part* file_parts/

In [49]:
file_parts_dir = './file_parts/'
fns = [fn for fn in os.listdir(file_parts_dir) if fn.endswith('.csv.bz2')]
history_combined = 'enwiki_pagerank_notemplates.tsv'
print_every = 1
history_length = {}
skipped = 0
processed = 0
output_header = ['page_id', 'pagerank', 'page_title']
with open(history_combined, 'w') as fout:
    fout.write('\t'.join(output_header) + '\n')
    for i, fn in enumerate(fns, start=1):
        with bz2.open(os.path.join(file_parts_dir, fn), 'rt') as fin:
            # the quote symbol " is somehow a valid username character...
            header = next(fin).strip().split('\t')
            assert header == output_header
            for line_no, line_str in enumerate(fin, start=1):
                line = line_str.strip().split('\t')
                assert len(line) == len(output_header)
                pid = line[0]
                pagerank = line[1]
                page_title = line[2]
                try:
                    int(pid)
                except ValueError:
                    print("PID:", line_str)
                    skipped += 1
                    continue
                try:
                    float(pagerank)
                except ValueError:
                    print("PR:", line_str)
                    skipped += 1
                    continue
                processed += 1
                fout.write(line_str)
        if i % print_every == 0:
            print("{0} / {1} files processed.".format(i, len(fns)))
            print_every = print_every * 2
print("{0} pages processed. {1} skipped.".format(processed, skipped))

1 / 512 files processed.
2 / 512 files processed.
4 / 512 files processed.
8 / 512 files processed.
16 / 512 files processed.
32 / 512 files processed.
64 / 512 files processed.
128 / 512 files processed.
256 / 512 files processed.
512 / 512 files processed.
6134966 pages processed. 0 skipped.


In [55]:
df = pd.read_csv('./enwiki_pagerank_notemplates.tsv', sep='\t')
df.sort_values('pagerank', ascending=False).head(50).set_index('page_title')['pagerank']

page_title
United_States                           12560.054117
World_War_II                             6399.805946
France                                   6016.280689
The_New_York_Times                       5477.629028
United_Kingdom                           5381.480503
List_of_sovereign_states                 5326.901882
Germany                                  5153.956441
India                                    4560.663176
New_York_City                            4386.221642
London                                   3828.579171
Catholic_Church                          3675.257020
Russia                                   3648.675568
Italy                                    3620.371527
English_language                         3530.305263
Australia                                3498.592060
Canada                                   3478.763155
World_War_I                              3444.544972
Japan                                    3417.589380
China                              