## PageRank on Wikipedia

### Prerequisites

1. Download a dump of Wikipedia's articles, named `enwiki-{date_string}-pages-articles-multistream.xml.bz2`
2. Download the `enwiki-{date_string}-pages-articles-multistream-index.txt.bz2` file
3. Move those files into the same folder, removing the `enwiki-{date_string}` prefix
4. Process the `xml.bz2` file into a Parquet file using `wikiplain.load_bz2`

In [1]:
import asyncio
import glob
import gzip
import io
import itertools
import json
import math
import operator
import os
import pickle
import random
import re
import shutil
import socket
import struct
import subprocess
import sys
import tarfile
import time
from collections import ChainMap, defaultdict, deque
from contextlib import asynccontextmanager
from dataclasses import dataclass
from datetime import datetime
from enum import Enum, auto
from functools import lru_cache, partial
from urllib.parse import urlencode, urlsplit, quote as urlquote, unquote as urlunquote
from typing import Any, Awaitable, Callable, Literal, TypeVar

import cbor2
import cytoolz
import numpy as np
import pypocketmap as pkm
import pyarrow.parquet as pq
import polars as pl
import sqlalchemy as sa
import scipy.sparse
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from ipywidgets import interact
from sqlalchemy import create_engine
from sqlalchemy.sql import select, text as sqltext
from tqdm.auto import tqdm
from arsenal.datastructures.unionfind import UnionFind
from arsenal.datastructures.heap import MinMaxHeap

import wikiplain
from wikiplain import Node, NodeKind, Token, TokenKind as TK
from nbhelpers.polars import pager, searcher
from special_cases import SECOND_LEVEL_DOMAINS

In [2]:
load_dotenv()

True

In [3]:
pl.Config.set_fmt_str_lengths(160)

polars.config.Config

In [4]:
class PageRankFiles:
    def __init__(self, date_string):
        self.date_string = date_string
        self.enwiki_dir = f"{os.environ['ENWIKI_DIR']}/{date_string}"
        self.parquet_dir = os.environ.get('ENWIKI_PARQUET_DIR', self.enwiki_dir)
        try:
            os.mkdir(f"{self.enwiki_dir}/pagerank")
        except FileExistsError:
            pass
    
    @property
    def enwiki_parquet_filename(self):
        return f"{self.parquet_dir}/enwiki_{self.date_string}.parquet"
    
    @property
    def pagerank_parquet_filename(self):
        return f"{self.parquet_dir}/enwiki_{self.date_string}_pagerank.parquet"

    @property
    def nub_filename(self):
        return f"{self.enwiki_dir}/pagerank/nub.bin"
    
    @property
    def id_map_filename(self):
        return f"{self.enwiki_dir}/pagerank/id_map.bin"
    
    @property
    def id_map2_filename(self):
        return f"{self.enwiki_dir}/pagerank/id_map2.bin"
    
    @property
    def dense_id_arr_filename(self):
        return f"{self.enwiki_dir}/pagerank/dense_id_arr.npy"
    
    @property
    def edge_filename_pattern(self):
        return f"{self.enwiki_dir}/pagerank/edges_*.npz"
    
    def edge_filenames(self, num_partitions):
        return [
            f"{self.enwiki_dir}/pagerank/edges_{i}.npz"
            for i in range(num_partitions)
        ]

    @property
    def in_degree_filename(self):
        return f"{self.enwiki_dir}/pagerank/in_degree.npy"
    
    @property
    def out_degree_filename(self):
        return f"{self.enwiki_dir}/pagerank/out_degree.npy"
    
    def adjacency_filename(self, partition):
        return f"{self.enwiki_dir}/pagerank/adjacency_{partition}.npz"
    
    def adjacency_filenames(self, num_partitions):
        return [self.adjacency_filename(i) for i in range(num_partitions)]

    @property
    def disambig_arr_filename(self):
        return f"{self.enwiki_dir}/pagerank/disambig_arr.npy"
    
    @property
    def moved_article_set_filename(self):
        return f"{self.enwiki_dir}/pagerank/moved_article_set.bin"
    
    @property
    def top_cite_domains_filename(self):
        return f"{self.enwiki_dir}/pagerank/top_cite_domains.bin"

    @property
    def infobox_title_arr_filename(self):
        return f"{self.enwiki_dir}/pagerank/infobox_titles.bin"

    @property
    def short_description_arr_filename(self):
        return f"{self.enwiki_dir}/pagerank/short_descriptions.bin"

    @property
    def h2_heading_arr_filename(self):
        return f"{self.enwiki_dir}/pagerank/h2_headings.bin"

    @property
    def hatedge_filename_pattern(self):
        return f"{self.enwiki_dir}/pagerank/hatedges_*.npy"
    
    def hatedge_filenames(self, num_partitions):
        return [
            f"{self.enwiki_dir}/pagerank/hatedges_{i}.npy"
            for i in range(num_partitions)
        ]
    
    @property
    def hatcheck_arr_filename(self):
        return f"{self.enwiki_dir}/pagerank/hatcheck_arr.npy"

In [5]:
files = PageRankFiles("20240620")

### Find title collisions

1. There are some pages with the same title - I think this is caused by pages deleted and recreated while the snapshot is in progress

In [6]:
pqf = pq.ParquetFile(files.enwiki_parquet_filename)

In [7]:
def get_overwritten():
    overwritten = set()
    timestamp_map = {}
    article_ids = {}
    pqf_size = 0
    for batch in tqdm(pqf.iter_batches(batch_size=100), total=pqf.num_row_groups):
        for aid, ns, ttl, tm in zip(batch["id"].to_numpy(), batch["ns"].to_numpy(), batch["title"].to_pylist(), batch["timestamp"].to_pylist()):
            pqf_size += 1
            if ns != 0:
                continue
            tm = np.datetime64(tm)
            other_id = article_ids.setdefault(ttl, aid)
            if other_id != aid:
                if (timestamp_map[ttl], other_id) < (tm, aid):
                    print(f"{ttl!r}: {aid} > {other_id}")
                    overwritten.add(other_id)
                    article_ids[ttl] = aid
                    timestamp_map[ttl] = tm
                else:
                    print(f"{ttl!r}: {other_id} > {aid}")
                    overwritten.add(aid)
            else:
                timestamp_map[ttl] = tm
    return overwritten, pqf_size

try:
    with open(files.nub_filename, "rb") as fp:
        overwritten, pqf_size = cbor2.load(fp)
except Exception:
    overwritten, pqf_size = get_overwritten()
    overwritten = {int(e) for e in overwritten}
    with open(files.nub_filename, "wb") as fp:
        cbor2.dump((overwritten, pqf_size), fp)

### Build representation of articles/links as a graph

1. Create `id_map` from non-redirecting article titles to node number, and `id_map2` from redirecting article titles to node number
2. Use `wikiplain` to extract link titles, and use above maps to convert to (src_id, dest_id) pairs

In [8]:
class Vec:
    def __init__(self, dtype):
        self.array = np.ndarray((1024,), dtype=dtype)
        self.length = 0
    
    @property
    def capacity(self):
        return self.array.shape[0]

    def append(self, v):
        idx = self.length
        if idx >= self.capacity:
            addsz = max(2, self.capacity)
            self.array = np.hstack((self.array, np.zeros((addsz,), dtype=self.array.dtype)))
        self.array[idx] = v
        self.length += 1

In [9]:
class PairVec:
    def __init__(self, dtype):
        self.array = np.ndarray((1024, 2), dtype=dtype)
        self.length = 0
    
    @property
    def capacity(self):
        return self.array.shape[0]

    def append(self, v1, v2):
        idx = self.length
        if idx >= self.capacity:
            addsz = max(2, self.capacity)
            self.array = np.vstack((self.array, np.zeros((addsz, 2), dtype=self.array.dtype)))
        self.array[idx] = [v1, v2]
        self.length += 1

In [10]:
def chunk(iterable, size):
    """Split an iterable into list chunks of size `n`.
    
    The last chunk can be fewer than `n` elements long, but it won't be empty.
    """
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, size))
        if chunk:
            yield chunk
        else:
            return

def lazy_chunk(iterable, n):
    """Split an iterable into iterable chunks of size `n`.
    
    The last chunk can be fewer than `n` elements long, but it won't be empty.
    """
    iterator = iter(iterable)
    while True:
        try:
            first = next(iterator)
        except StopIteration:
            return
        yield itertools.chain([first], itertools.islice(iterator, n - 1))

In [11]:
def iterdecode(f):
    decoder = cbor2.CBORDecoder(f)
    while True:
        try:
            yield decoder.decode()
        except EOFError:
            return

In [12]:
def get_id_maps():
    redirect_group_map = UnionFind()
    id_map = pkm.create(str, int)
    redirect_lst = []
    dense_ids = Vec(dtype=np.int64)
    for batch in tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100)):
        for aid, ns, ttl, redir in zip(batch["id"].to_numpy(), batch["ns"].to_numpy(), batch["title"].to_pylist(), batch["redirect"].to_pylist()):
            if ns != 0 or aid in overwritten:
                continue
            if redir is not None:
                redirect_group_map.union(ttl, redir)
                redirect_lst.append(ttl)
            else:
                assert ttl not in id_map, f"Expected unique titles, got second instance of {ttl}"
                dense_ids.append(aid)
                id_map[ttl] = len(id_map)
    id_map2 = pkm.create(str, int)
    for group in redirect_group_map.classes():
        centers = [ttl for ttl in group if ttl in id_map]
        if len(centers) == 0:
            continue
        assert len(centers) == 1, str(centers)
        for ttl in group:
            if ttl != centers[0]:
                id_map2[ttl] = id_map[centers[0]]
    return id_map, id_map2, dense_ids.array[:dense_ids.length]

try:
    with open(files.id_map_filename, "rb") as fp:
        id_map = pkm.create(str, int)
        for k, v in iterdecode(fp):
            id_map[k] = v
    with open(files.id_map2_filename, "rb") as fp:
        id_map2 = pkm.create(str, int)
        for k, v in iterdecode(fp):
            id_map2[k] = v
    with open(files.dense_id_arr_filename, "rb") as fp:
        dense_id_arr = np.load(fp)
except Exception:
    id_map, id_map2, dense_id_arr = get_id_maps()
    with open(files.id_map_filename, "wb") as fp:
        for k, v in id_map.items():
            cbor2.dump((k, v), fp)
    with open(files.id_map2_filename, "wb") as fp:
        for k, v in id_map2.items():
            cbor2.dump((k, v), fp)
    with open(files.dense_id_arr_filename, "wb") as fp:
        np.save(fp, dense_id_arr)

In [13]:
len(id_map), len(id_map2)

(6839104, 10987529)

In [14]:
list(itertools.islice(iter(id_map.keys()), 10))

['Martín Vázquez',
 'Rouverol',
 'Ron Hansell',
 'Jim Wolf (musician)',
 'McAllen Miller International Airport',
 '1937 Albanian National Championship',
 'Nerka Lake',
 'Manilius (crater)',
 'Metallacarboxylic acid',
 'Nicolas Appert']

In [15]:
list(itertools.islice(iter(id_map2.keys()), 10))

['Office for fair access',
 'The Catalyst (newspaper)',
 'Scarecrow Press historical dictionary series',
 'Boissieri',
 'Edward Northey (disambiguation)',
 'Herzogschloss Zweibrücken',
 'Trinity Square (disambiguation)',
 'Urban cowboy',
 'K 424a',
 'Nawaz Shareef']

In [16]:
# def get_template_id_maps():
#     redirect_group_map = UnionFind()
#     template_id_map = {}
#     for batch in tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100)):
#         for aid, ns, ttl, redir in zip(batch["id"].to_numpy(), batch["ns"].to_numpy(), batch["title"].to_pylist(), batch["redirect"].to_pylist()):
#             if ns != 10 or aid in overwritten:
#                 continue
#             if redir is not None:
#                 redirect_group_map.union(ttl, redir)
#             else:
#                 assert ttl not in template_id_map, f"Expected unique titles, got second instance of {ttl}"
#                 template_id_map[ttl] = aid
#     template_id_map2 = {}
#     for group in redirect_group_map.classes():
#         centers = [ttl for ttl in group if ttl in template_id_map]
#         if len(centers) == 0:
#             continue
#         assert len(centers) == 1, str(centers)
#         for ttl in group:
#             if ttl != centers[0]:
#                 template_id_map2[ttl] = template_id_map[centers[0]]
#     return template_id_map, template_id_map2

# try:
#     with open(files.template_id_maps_filename, "rb") as fp:
#         template_id_map, template_id_map2 = pickle.load(fp)
# except FileNotFoundError:
#     template_id_map, template_id_map2 = get_template_id_maps()
#     with open(files.template_id_maps_filename, "wb") as fp:
#         pickle.dump((template_id_map, template_id_map2), fp)

### Wikipedia article stats

1. Create `disambig_arr`, a simple boolean array recording whether each article is a disambiguation or set-index page.
2. Create `moved_article_set`, a set of article titles which redirect because their article content was moved.
3. Create `top_cite_domains`, the 1024 most commonly cited websites across all articles.
4. Create `infobox_title_arr`, the title of the first infobox on each article.

In [17]:
N = len(id_map)

In [18]:
def get_disambig_arr():
    iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
    iterator = map(
        lambda b: zip(
            b["id"].to_numpy(),
            b["ns"].to_numpy(),
            map(operator.attrgetter("is_valid"), b["redirect"]),
            b["text"].to_pylist()
        ),
        iterator
    )
    iterator = itertools.chain.from_iterable(iterator)
    iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
    iterator = enumerate(map(operator.itemgetter(3), iterator))
    disambig_arr = np.zeros(N, dtype=np.bool_)
    for node_id, text in iterator:
        disambig_arr[node_id] = wikiplain.is_disambiguation_page(text)
    return disambig_arr

try:
    with open(files.disambig_arr_filename, "rb") as fp:
        disambig_arr = np.load(fp)
except Exception:
    disambig_arr = get_disambig_arr()
    with open(files.disambig_arr_filename, "wb") as fp:
        np.save(fp, disambig_arr)

In [19]:
def get_moved_article_set():
    iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
    iterator = map(
        lambda b: zip(
            b["id"].to_numpy(),
            b["ns"].to_numpy(),
            map(operator.attrgetter("is_valid"), b["redirect"]),
            b["title"].to_pylist(),
            map(b["text"].__getitem__, itertools.count()),
        ),
        iterator
    )
    iterator = itertools.chain.from_iterable(iterator)
    iterator = filter(lambda e: e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
    iterator = map(lambda e: (e[3], e[4]), iterator)
    moved_article_set = set()
    for title, text in iterator:
        if '{{R from move' in text.as_py():
            moved_article_set.add(title)
    return moved_article_set

try:
    with open(files.moved_article_set_filename, 'rb') as fp:
        moved_article_set = cbor2.load(fp)
except FileNotFoundError:
    moved_article_set = get_moved_article_set()
    with open(files.moved_article_set_filename, 'wb') as fp:
        cbor2.dump(moved_article_set, fp)

In [20]:
def get_top_cite_domains():
    iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
    iterator = map(
        lambda b: zip(
            b["id"].to_numpy(),
            b["ns"].to_numpy(),
            map(operator.attrgetter("is_valid"), b["redirect"]),
            b["text"].to_pylist()
        ),
        iterator
    )
    iterator = itertools.chain.from_iterable(iterator)
    iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
    iterator = enumerate(map(operator.itemgetter(3), iterator))
    heap = MinMaxHeap()
    heap_phases = 9
    heap_limits = [
        (N * i // heap_phases, 1024 << (heap_phases-i-1))
        for i in range(heap_phases)
    ]
    heap_phase = 0
    heap_limit = heap_limits[heap_phase][1]
    for node_id, text in iterator:
        if heap_phase+1 < heap_phases:
            transition_node_id, lim = heap_limits[heap_phase+1]
            if node_id >= transition_node_id:
                heap_phase += 1
                heap_limit = lim
                while len(heap) > lim:
                    heap.popmin()
        page = defaultdict(int)
        for url in wikiplain.get_cite_urls(text):
            full_domain = re.sub(r"[:/].*", "", url)
            parts = full_domain.split('.')
            if len(parts) >= 2:
                site_domain = parts[-2] + '.' + parts[-1]
                if site_domain in SECOND_LEVEL_DOMAINS:
                    if len(parts) >= 3:
                        site_domain = parts[-3] + '.' + site_domain
                    else:
                        continue
                page[site_domain] += 1
        for k, v in page.items():
            if k in heap:
                heap[k] = heap.max[k] + v
            elif len(heap) < heap_limit:
                heap[k] = v
            elif v > heap.peekmin()[1]:
                heap.popmin()
                heap[k] = v
    top_cite_domains = []
    while len(heap) > 0:
        top_cite_domains.append(heap.popmax())
    return top_cite_domains

try:
    with open(files.top_cite_domains_filename, "rb") as fp:
        top_cite_domains = cbor2.load(fp)
except Exception:
    top_cite_domains = get_top_cite_domains()
    with open(files.top_cite_domains_filename, "wb") as fp:
        cbor2.dump(top_cite_domains, fp)

In [21]:
def get_infobox_title_arr():
    iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
    iterator = map(
        lambda b: zip(
            b["id"].to_numpy(),
            b["ns"].to_numpy(),
            map(operator.attrgetter("is_valid"), b["redirect"]),
            b["text"].to_pylist(),
        ),
        iterator
    )
    iterator = itertools.chain.from_iterable(iterator)
    iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
    iterator = map(operator.itemgetter(3), iterator)
    for text in iterator:
        yield wikiplain.get_first_infobox_title(text)

try:
    with open(files.infobox_title_arr_filename, 'rb') as fp:
        summary = cytoolz.countby(bool, iterdecode(fp))
        print(summary)
        total = summary.get(False, 0) + summary.get(True, 0)
        assert total == N
except (FileNotFoundError, AssertionError):
    with open(files.infobox_title_arr_filename, 'wb') as fp:
        for e in get_infobox_title_arr():
            cbor2.dump(e, fp)

{False: 2556920, True: 4282184}


In [18]:
def split_token(tc, c):
    return [Token(kind=TK.Content, data=piece) for piece in tc.split(c)]

def split_template(line):
    finished = []
    tdepth = 0
    ldepth = 0
    tokens = wikiplain.tokenize(line)
    acc = []
    for tok in tokens:
        match tok:
            case Token(kind=TK.Content, data=tc):
                if tdepth > 0 or ldepth > 0:
                    acc.append(tok)
                elif tc != '':
                    parts = split_token(tc, '|')
                    if parts[0].data != '':
                        acc.append(parts[0])
                    if len(parts) > 1:
                        finished.append(acc)
                        if parts[-1].data != '':
                            acc = [parts.pop()]
                        for p in parts[1:]:
                            finished.append([p])
            case Token(kind=TK.LinkStart):
                ldepth += 1
                acc.append(tok)
            case Token(kind=TK.LinkEnd):
                ldepth -= 1
                acc.append(tok)
            case Token(kind=TK.TemplateStart):
                tdepth += 1
                acc.append(tok)
            case Token(kind=TK.TemplateEnd):
                tdepth -= 1
                acc.append(tok)
    if acc:
        finished.append(acc)
    return finished

def tokenlist_string(tl):
    if len(tl) != 1 or tl[0].kind != TK.Content:
        return None
    return tl[0].data

def token_fmt(tok):
    match tok:
        case Token(kind=TK.Content, data=tc):
            return tc
        case Token(kind=TK.LinkStart):
            return '[['
        case Token(kind=TK.LinkEnd):
            return ']]'
        case Token(kind=TK.TemplateStart):
            return '{{'
        case Token(kind=TK.TemplateEnd):
            return '}}'

def tokenlist_gettext(tl):
    return "".join(map(token_fmt, tl))
    
def tokenlist_startswith(tl, s):
    if len(tl) == 0 or tl[0].kind != TK.Content:
        return False
    return tl[0].data.startswith(s)

def tokenlist_links(tl):
    accs = []
    for tok in tl:
        match tok:
            case Token(kind=TK.Content, data=tc):
                if accs:
                    accs[-1].append(tc)
            case Token(kind=TK.LinkStart):
                accs.append([])
            case Token(kind=TK.LinkEnd):
                if accs:
                    yield "".join(accs.pop())

In [24]:
def parse_short_desc(line):
    lst = split_template(line)
    if len(lst) < 2:
        return None
    desc_toks = lst[1]
    desc = tokenlist_string(desc_toks)
    if desc is None or (len(desc) == 4 and desc.lower() == "none"):
        return None
    # https://en.wikipedia.org/wiki/Template:Escape_template_list
    desc = desc.replace("{{!}}", "|")
    return desc

def get_short_description_arr():
    iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
    iterator = map(
        lambda b: zip(
            b["id"].to_numpy(),
            b["ns"].to_numpy(),
            map(operator.attrgetter("is_valid"), b["redirect"]),
            b["text"].to_pylist(),
        ),
        iterator
    )
    iterator = itertools.chain.from_iterable(iterator)
    iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
    iterator = map(operator.itemgetter(3), iterator)
    for text in iterator:
        yield next(map(parse_short_desc, wikiplain.get_templates_by_name(text, ["Short description"])), None)

try:
    with open(files.short_description_arr_filename, 'rb') as fp:
        summary = cytoolz.countby(bool, iterdecode(fp))
        print(summary)
        total = summary.get(False, 0) + summary.get(True, 0)
        assert total == N
except (FileNotFoundError, AssertionError):
    with open(files.short_description_arr_filename, 'wb') as fp:
        for e in get_short_description_arr():
            cbor2.dump(e, fp)

{True: 3877101, False: 2962003}


In [23]:
def get_h2_heading_arr():
    iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
    iterator = map(
        lambda b: zip(
            b["id"].to_numpy(),
            b["ns"].to_numpy(),
            map(operator.attrgetter("is_valid"), b["redirect"]),
            b["text"].to_pylist(),
        ),
        iterator
    )
    iterator = itertools.chain.from_iterable(iterator)
    iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
    iterator = map(operator.itemgetter(3), iterator)
    for text in iterator:
        result = []
        tokens = wikiplain.tokenize(text)
        i = 0
        while i < len(tokens):
            if tokens[i].kind == TK.ElementStart and tokens[i].data == "h2":
                j = i + 1
                while j < len(tokens) and tokens[j].kind != TK.ElementEnd:
                    j += 1
                if j < len(tokens) and tokens[j].data == "h2":
                    if (htxt := tokenlist_string(tokens[i+1:j])):
                        result.append(htxt)
                i = j + 1
            else:
                i += 1
        yield (result if result else None)

try:
    with open(files.h2_heading_arr_filename, 'rb') as fp:
        summary = cytoolz.countby(bool, iterdecode(fp))
        print(summary)
        total = summary.get(False, 0) + summary.get(True, 0)
        assert total == N
except (FileNotFoundError, AssertionError):
    with open(files.h2_heading_arr_filename, 'wb') as fp:
        for e in get_h2_heading_arr():
            cbor2.dump(e, fp)

{True: 6550703, False: 288401}


#### parse_tokens_dbg

In [21]:
iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
iterator = map(
    lambda b: zip(
        b["id"].to_numpy(),
        b["ns"].to_numpy(),
        map(operator.attrgetter("is_valid"), b["redirect"]),
        b["text"].to_pylist(),
    ),
    iterator
)
iterator = itertools.chain.from_iterable(iterator)
iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
iterator = enumerate(map(operator.itemgetter(3), iterator))
lst = []
for node_id, text in iterator:
    lst.append(text)
    if node_id == 146:
        break
with open("/tmp/wikitext.json", "w") as fp:
    json.dump(lst, fp)

  0%|          | 0/238029 [00:00<?, ?it/s]

In [3]:
with open("/tmp/wikitext.json", "r") as fp:
    lst = json.load(fp)

In [12]:
wtokens = wikiplain.tokenize(lst[-1])

In [13]:
wdoc = wikiplain.parse_tokens(wtokens)

In [6]:
class Lookahead:
    def __init__(self, inner, n):
        self.inner = iter(inner)
        self.bufsize = n
        self.buffer = [None for _ in range(n)]
        self.closed = False
        self.bufhead = 0
        self.head = 0
        
    def peek(self, n):
        if n > self.bufsize:
            raise ValueError()
        return self.buffer[(self.head + n - 1) % self.bufsize]

    def __iter__(self):
        return self
    
    def __next__(self):
        if self.closed:
            if self.head >= self.bufhead:
                raise StopIteration()
            val = self.buffer[self.head % self.bufsize]
            self.buffer[self.head % self.bufsize] = None
            self.head += 1
            return val
        while self.bufhead - self.head < self.bufsize:
            try:
                self.buffer[self.bufhead % self.bufsize] = next(self.inner)
                self.bufhead += 1
            except StopIteration:
                self.buffer[self.bufhead % self.bufsize] = None
                self.closed = True
        val = self.buffer[self.head % self.bufsize]
        self.head += 1
        try:
            self.buffer[self.bufhead % self.bufsize] = next(self.inner)
            self.bufhead += 1
        except StopIteration:
            self.buffer[self.bufhead % self.bufsize] = None
            self.closed = True
        return val

In [7]:
class Event(Enum):
    BEGIN = auto()
    END = auto()

def tree_events(root):
    stack = deque([(Event.END, root), (Event.BEGIN, root)])
    while len(stack) > 0:
        ek, curr = stack.pop()
        yield ek, curr
        if ek == Event.BEGIN:
            for child in reversed(curr.children):
                stack.append((Event.END, child))
                stack.append((Event.BEGIN, child))

In [10]:
name_p = lambda name: lambda elem: hasattr(elem, "name") and name == elem.name
def node_to_document(parsed, mapper):
    stack = deque()
    soup = BeautifulSoup(features="xml")
    curr = soup
    iterator = Lookahead(tree_events(parsed), 4)
    for ek, n in iterator:
        if ek == Event.BEGIN:
            if n.kind == NodeKind.Document:
                nxt = soup.new_tag("document")
                stack.append(curr)
                curr = nxt
            elif n.kind == NodeKind.Element:
                nxt = soup.new_tag(n.data)
                stack.append(curr)
                curr = nxt
            elif n.kind == NodeKind.Template:
                nxt = soup.new_tag("wmt")
                stack.append(curr)
                curr = nxt
                match (iterator.peek(1), iterator.peek(2), iterator.peek(3), iterator.peek(4)):
                    case ((Event.BEGIN, Node(kind=NodeKind.Argument)),
                          (Event.BEGIN, Node(kind=NodeKind.Content, data=title)),
                          (Event.END, Node(kind=NodeKind.Content)),
                          (Event.END, Node(kind=NodeKind.Argument)),
                    ):
                        title = title.strip()
                        curr["title"] = title[:1].lower() + title[1:]
                        for _ in range(4):
                            next(iterator)
            elif n.kind == NodeKind.Link:
                nxt = soup.new_tag("wml")
                stack.append(curr)
                curr = nxt
            elif n.kind == NodeKind.Argument:
                nxt = soup.new_tag("arg")
                stack.append(curr)
                curr = nxt
                match (iterator.peek(1), iterator.peek(2)):
                    case ((Event.BEGIN, Node(kind=NodeKind.Content, data=data)),
                          (Event.END, Node(kind=NodeKind.Content)),
                    ) if data.startswith('|'):
                        m = re.match(r"\|\s*([a-zA-Z][a-zA-Z0-9_]*)\s*=\s*", data)
                        if m is not None:
                            curr["name"] = m.group(1)
                            txt = re.sub(r"[']{2,3}", "", data[m.end():])
                            curr.append(txt)
                        else:
                            txt = re.sub(r"[']{2,3}", "", data[1:])
                            curr.append(txt)
                        for _ in range(2):
                            next(iterator)
            elif n.kind == NodeKind.Content:
                txt = re.sub(r"[']{2,3}", "", n.data)
                curr.append(txt)
        else:
            if n.kind != NodeKind.Content:
                prev = stack.pop()
                for el in mapper(curr):
                    prev.append(el)
                curr = prev
    return soup

In [16]:
wdoc

Document([Template([Argument([Content("Short description")]), Argument([Content("|American rapper")])]), Content("\n"), Template([Argument([Content("Infobox musical artist\n")]), Argument([Content("| name              = Arabian Prince\n")]), Argument([Content("| image             = The_Arabian_Prince_aka_Professor_X_(N.W.A.).jpg\n")]), Argument([Content("| caption           = Arabian Prince in 2018\n")]), Argument([Content("| image_size        = \n")]), Argument([Content("| birth_name        = Kim Renard Nazel\n")]), Argument([Content("| alias             = Professor X\n")]), Argument([Content("| birth_date        = "), Template([Argument([Content("birth date and age")]), Argument([Content("|1965")]), Argument([Content("|6")]), Argument([Content("|17")])]), Content("\n")]), Argument([Content("| birth_place       = "), Link([Argument([Content("Compton, California")])]), Content(", U.S.\n")]), Argument([Content("| instruments       = "), Template([Argument([Content("hlist")]), Argument([

In [20]:
print(node_to_document(wdoc, lambda el: [] if el.name == "ref" else [el]).prettify())

<?xml version="1.0" encoding="utf-8"?>
<document>
 <wmt title="short description">
  <arg>
   American rapper
  </arg>
 </wmt>
 <wmt title="infobox musical artist">
  <arg name="name">
   Arabian Prince
  </arg>
  <arg name="image">
   The_Arabian_Prince_aka_Professor_X_(N.W.A.).jpg
  </arg>
  <arg name="caption">
   Arabian Prince in 2018
  </arg>
  <arg name="image_size">
  </arg>
  <arg name="birth_name">
   Kim Renard Nazel
  </arg>
  <arg name="alias">
   Professor X
  </arg>
  <arg name="birth_date">
   <wmt title="birth date and age">
    <arg>
     1965
    </arg>
    <arg>
     6
    </arg>
    <arg>
     17
    </arg>
   </wmt>
  </arg>
  <arg name="birth_place">
   <wml>
    <arg>
     Compton, California
    </arg>
   </wml>
   , U.S.
  </arg>
  <arg name="instruments">
   <wmt title="hlist">
    <arg>
     Vocals
    </arg>
    <arg>
     synthesizer
    </arg>
    <arg>
     keyboards
    </arg>
    <arg>
     turntables
    </arg>
    <arg>
     drum machine
    </arg>
 

## hatedge_arr

Find links to highly related pages, displayed at the top using one of several Wikitext templates. This is used by RedditRank.ipynb, however it's not feasible to compute it there because opening the enwiki parquet takes too much memory (needed more by other computations in that notebook).

In [25]:
# linking the reader to other articles with similar titles or concepts
# that they may have been seeking instead
HATNOTE_ABOUT = 1
# placed at the top of the article or section that is the primary topic
# of a redirect, and links to other topics that are ambiguous with the
# name of that redirect
HATNOTE_REDIRECT = 2
# concise about / other uses
HATNOTE_FOR = 3
# "not to be confused with"
HATNOTE_DISTINGUISH = 4
def parse_hatnote(line):
    lst = split_template(line)
    if len(lst) == 0:
        return
    tmpl_toks, *args = lst
    tmpl = tokenlist_string(tmpl_toks)
    if not tmpl:
        return
    tmpl = tmpl[0].upper() + tmpl[1:]
    if tmpl == 'About':
        # the Latin letter|the similar Greek letter|Alpha|the similar Cyrillic letter|A (Cyrillic)|other uses
        for dest_ttl in map(tokenlist_gettext, args[2::2]):
            yield HATNOTE_ABOUT, dest_ttl
    elif tmpl == 'Redirect':
        # Achilleus|the Roman usurper with this name|Aurelius Achilleus|other uses|Achilles (disambiguation)
        for dest_ttl in map(tokenlist_gettext, args[2::2]):
            yield HATNOTE_REDIRECT, dest_ttl
    elif tmpl == 'For':
        # the racehorse|Ambiorix (horse)
        for dest_ttl in map(tokenlist_gettext, args[2::2]):
            yield HATNOTE_FOR, dest_ttl
    elif tmpl == 'Redirect-synonym':
        # Wild cranberry|[[Arctostaphylos uva-ursi]]
        for dest_ttl in map(tokenlist_gettext, args[2::2]):
            yield HATNOTE_REDIRECT, dest_ttl
    elif tmpl == 'About-distinguish':
        if len(args) > 1:
            yield HATNOTE_DISTINGUISH, tokenlist_gettext(args[1])
    elif tmpl == 'Redirect2':
        # Anarchist|Anarchists|other uses|Anarchist (disambiguation)
        for dest_ttl in map(tokenlist_gettext, args[2::2]):
            yield HATNOTE_REDIRECT, dest_ttl
    elif tmpl == 'Redirect-multi':
        # Redirect-multi|2|Oscars|The Oscar|other uses|Oscar (disambiguation)
        try:
            skip = int(tokenlist_string(args[1]))
        except (TypeError, ValueError):
            return
        for dest_ttl in map(tokenlist_gettext, args[2::2]):
            yield HATNOTE_REDIRECT, dest_ttl
    elif tmpl == 'Distinguish' or tmpl == 'Redirect-distinguish-text':
        # distinguish|text=[[Lucius Appuleius Saturninus]], a Roman demagogue, or others with the name Apuleius or [[Appuleia (gens)|Appuleius]]
        for tl in args:
            if (
                tmpl == 'Redirect-distinguish-text'
                or tokenlist_startswith(tl, 'text=')
                or tokenlist_startswith(tl, 'Text=')
            ):
                for link_inner in tokenlist_links(tl):
                    dest = link_inner.split('|', maxsplit=1)[0]
                    yield HATNOTE_DISTINGUISH, dest
            else:
                yield HATNOTE_DISTINGUISH, tokenlist_gettext(tl)
    elif tmpl == 'Redirect-distinguish':
        # ethyne|ethane|ethene
        for dest_ttl in map(tokenlist_gettext, args[1:]):
            yield HATNOTE_DISTINGUISH, dest_ttl
    elif tmpl == 'Redirect-distinguish-for':
        # Phoebus|Phobos (mythology)|other uses|Phoebus (disambiguation)
        if len(args) > 1:
            yield HATNOTE_DISTINGUISH, tokenlist_gettext(args[1])
            for dest_ttl in map(tokenlist_gettext, args[3::2]):
                yield HATNOTE_FOR, dest_ttl
    elif tmpl == 'About-distinguish':
        if len(args) > 1:
            yield HATNOTE_DISTINGUISH, tokenlist_gettext(args[1])
    elif tmpl == 'About-distinguish-text':
        # the sub-group of the Semitic languages native to Mesopotamia and the Levant|[[Amharic]], the Semitic language spoken in [[Ethiopia]]
        for tl in args[1:]:
            for link_inner in tokenlist_links(tl):
                dest = link_inner.split('|', maxsplit=1)[0]
                yield HATNOTE_DISTINGUISH, dest
    elif tmpl == 'For-text':
        for tl in args[1:]:
            for link_inner in tokenlist_links(tl):
                dest = link_inner.split('|', maxsplit=1)[0]
                yield HATNOTE_DISTINGUISH, dest
    elif tmpl in ('Other uses', 'Other people', 'About other people', 'Hatnote', 'Technical reasons'):
        return
    elif tmpl in ('Redirect-several',):
        return  # too complicated

In [None]:
LOG_PARTITION_SIZE = 16
PARTITION_SIZE = 1 << LOG_PARTITION_SIZE
NUM_PARTITIONS = math.ceil(N / PARTITION_SIZE)

In [26]:
def get_hatedges():
    iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
    iterator = map(
        lambda b: zip(
            b["id"].to_numpy(),
            b["ns"].to_numpy(),
            map(operator.attrgetter("is_valid"), b["redirect"]),
            b["text"].to_pylist()
        ),
        iterator
    )
    iterator = itertools.chain.from_iterable(iterator)
    iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
    iterator = enumerate(map(operator.itemgetter(3), iterator))
    filenames = files.hatedge_filenames(NUM_PARTITIONS)
    hatcheck_arr = np.zeros(N, dtype=np.bool_)
    for part_idx, subitr in enumerate(lazy_chunk(iterator, PARTITION_SIZE)):
        edges = PairVec('int32')
        for src_id, text in subitr:
            for line in wikiplain.get_distinguish_hatnotes(text):
                for tag, dest_ttl in parse_hatnote(line):
                    dest_ttl = dest_ttl.split('{{!', maxsplit=1)[0]  # for {{!}}
                    if dest_ttl == "" or '#' in dest_ttl:
                        continue
                    dest_ttl = dest_ttl[0].upper() + dest_ttl[1:]
                    if (dest_id := id_map.get(dest_ttl) or id_map2.get(dest_ttl)) is not None:
                        edges.append(src_id, (tag << 28) | dest_id)
                        hatcheck_arr[src_id] = True
        with open(filenames[part_idx], "wb") as fp:
            np.save(fp, edges.array[:edges.length])
    return hatcheck_arr

In [29]:
hatedge_fnames = glob.glob(files.hatedge_filename_pattern)
try:
    assert set(hatedge_fnames) == set(files.hatedge_filenames(NUM_PARTITIONS))
    with open(files.hatcheck_arr_filename, "rb") as fp:
        hatcheck_arr = np.load(fp)
except Exception as exc:
    hatcheck_arr = get_hatedges()
    with open(files.hatcheck_arr_filename, "wb") as fp:
        np.save(fp, hatcheck_arr)
    hatedge_fnames = glob.glob(files.hatedge_filename_pattern)

  0%|          | 0/238029 [00:00<?, ?it/s]

## Links

In [30]:
def parse_wiki_link(line):
    dest_ttl = line.strip()
    if len(dest_ttl) == 0:
        return None
    dest_ttl = dest_ttl[0].upper() + dest_ttl[1:]
    dest_ttl = dest_ttl.split('|', maxsplit=1)[0]
    dest_ttl = dest_ttl.split('#', maxsplit=1)[0]
    return dest_ttl

In [32]:
combined_id_map = ChainMap(id_map, id_map2)

In [18]:
N

6806227

### Edge format

- `edges_{n}.npz` stores the outgoing links from `PARITION_SIZE*n ..< PARTITION_SIZE*(n+1)`
- These are stored in a list where element `i` contains the links out to `PARITION_SIZE*i ..< PARTITION_SIZE*(i+1)`

In [33]:
def chunk(iterable, size):
    """Split an iterable into list chunks of size `n`.
    
    The last chunk can be fewer than `n` elements long, but it won't be empty.
    """
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, size))
        if chunk:
            yield chunk
        else:
            return

def lazy_chunk(iterable, n):
    """Split an iterable into iterable chunks of size `n`.
    
    The last chunk can be fewer than `n` elements long, but it won't be empty.
    """
    iterator = iter(iterable)
    while True:
        try:
            first = next(iterator)
        except StopIteration:
            return
        yield itertools.chain([first], itertools.islice(iterator, n - 1))

In [34]:
ORDER_TAG_BITS = 3
NUM_ORDER_TAGS = 1 << ORDER_TAG_BITS
DEST_ID_BITS = 31 - ORDER_TAG_BITS
DEST_ID_MASK = (1 << DEST_ID_BITS) - 1
def get_edges():
    in_degree = np.zeros(N, dtype=np.int32)
    out_degree = np.zeros(N, dtype=np.int32)
    with tqdm(position=1) as progress:
        iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
        iterator = map(
            lambda b: zip(
                b["id"].to_numpy(),
                b["ns"].to_numpy(),
                map(operator.attrgetter("is_valid"), b["redirect"]),
                b["text"].to_pylist()
            ),
            iterator
        )
        iterator = itertools.chain.from_iterable(iterator)
        iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
        iterator = enumerate(map(operator.itemgetter(3), iterator))
        filenames = files.edge_filenames(NUM_PARTITIONS)
        order_tags = np.arange(NUM_ORDER_TAGS, dtype=np.int32)[::-1] << DEST_ID_BITS
        for part_idx, subitr in enumerate(lazy_chunk(iterator, PARTITION_SIZE)):
            edges = [PairVec('int32') for _ in range(0, N, PARTITION_SIZE)]
            for src_id, text in subitr:
                link_idx = 0
                for link in wikiplain.get_links(text):
                    dest_ttl = parse_wiki_link(link)
                    if dest_ttl:
                        dest_id = id_map.get(dest_ttl)
                        dest_id = dest_id if (dest_id is not None) else id_map2.get(dest_ttl)
                        if dest_id is not None:
                            partition = dest_id >> LOG_PARTITION_SIZE
                            edges[partition].append(src_id, order_tags[link_idx] | dest_id)
                            in_degree[dest_id] += 1
                            out_degree[src_id] += 1
                            progress.update()
                            link_idx = min(NUM_ORDER_TAGS - 1, link_idx + 1)
            with open(filenames[part_idx], "wb") as fp:
                np.savez(fp, *([vec.array[:vec.length] for vec in edges]))
    return in_degree, out_degree

In [35]:
edge_fnames = glob.glob(files.edge_filename_pattern)
try:
    assert set(edge_fnames) == set(files.edge_filenames(NUM_PARTITIONS))
    with open(files.in_degree_filename, "rb") as fp:
        in_degree = np.load(fp)
    with open(files.out_degree_filename, "rb") as fp:
        out_degree = np.load(fp)
    # for fname in edge_fnames:
    #    with open(fname, "rb") as fp:
    #         assert len(pickle.load(fp)) == NUM_PARTITIONS
except Exception as exc:
    print(exc)
    in_degree, out_degree = get_edges()
    edge_fnames = glob.glob(files.edge_filename_pattern)
    with open(files.in_degree_filename, "wb") as fp:
        np.save(fp, in_degree)
    with open(files.out_degree_filename, "wb") as fp:
        np.save(fp, out_degree)




0it [00:00, ?it/s]

  0%|          | 0/238029 [00:00<?, ?it/s]

#### get_edges time to complete

| date | links/s | time |
| :--- | :------ | :--- |
| 2024-02-06 | 101000 | 46min 48s
| 2024-05-14 | 62500 | 76min 39s
| 2024-07-01 | 77000 | 62min 56s

2024-05-14: `ORDER_BITS` and `link_idx` logic was added

In [22]:
# def get_dab_array():
#     result = np.zeros(N, dtype=np.bool8)
#     dab_proc = subprocess.Popen(
#         ["wikiplain", "--fraction", "1", "-c", "only-dab", "--ns", "0", files.enwiki_database_filename],
#         stdout=subprocess.PIPE,
#         stderr=subprocess.PIPE
#     )
#     iterator = make_links_iter(dab_proc.stdout)
#     iterator = tqdm(iterator, position=0, total=len(id_map))
#     iterator = map(lambda pair: (pair[0].decode("utf-8"), pair[1]), iterator)
#     for n, subitr in enumerate(lazy_chunk(iterator, PARTITION_SIZE)):
#         for ttl, text in subitr:
#             src_id = id_map[ttl]
#             if len(text) > 0:
#                 result[src_id] = True
#     return result

In [23]:
# LDF = pl.scan_parquet(files.enwiki_parquet_filename)

In [24]:
# try:
#     with open(files.dab_array_filename, "rb") as fp:
#         dab_array = pickle.load(fp)
# except Exception as exc:
#     print(exc)
#     dab_array = get_dab_array()
#     with open(files.dab_array_filename, "wb") as fp:
#         pickle.dump(dab_array, fp)

In [36]:
def compute_adjacency_matrix_slice(partition, progress):
    """Computes the slice of the adjacency matrix A starting at row p*S and ending before row (p+1)*S
    
    p=partition, S=PARTITION_SIZE, and A is defined so that
    A @ np.eye(N)[i] = v, a probability vector where
        v[j] = out-degree(i) > 0 | count((i,j) in E) / out-degree(i)
               otherwise         | 0
    """
    origin_row = partition * PARTITION_SIZE
    n_rows = min(PARTITION_SIZE, N - origin_row)
    index_arrs = []
    value_arrs = []
    pkey = f'arr_{partition}'
    for fname in glob.glob(files.edge_filename_pattern):
        with np.load(fname) as npz:
            vec = npz[pkey]
        vec[:, 1] &= DEST_ID_MASK  # remove order_tag
        # vec is
        #  [[src_id_0, dest_id_0],
        #   [src_id_1, dest_id_1],
        #   ...
        #  ]
        # Sort by (src,dest), make unique and get counts
        key_arr = (vec[:, 0].astype('int64') << 32) | vec[:, 1]
        _, order, count = np.unique(key_arr, return_index=True, return_counts=True)
        vec = vec[order]
        # Normalize `count` based on (src,)
        count = count.astype('float64') / out_degree[vec[:, 0]]
        index_arrs.append(vec)
        value_arrs.append(count)
        progress.update()
    index_arr = np.vstack(index_arrs)
    matrix_slice = scipy.sparse.csr_array(
        (np.hstack(value_arrs), (index_arr[:, 1] - origin_row, index_arr[:, 0])),
        shape=(n_rows, N),
        dtype=np.float64
    )
    return matrix_slice

In [37]:
with tqdm(total=NUM_PARTITIONS**2) as progress:
    for partition in range(NUM_PARTITIONS):
        adj_matrix = compute_adjacency_matrix_slice(partition, progress)
        scipy.sparse.save_npz(files.adjacency_filename(partition), adj_matrix, compressed=False)

  0%|          | 0/11025 [00:00<?, ?it/s]

In [38]:
np.quantile(out_degree, [0, 0.1, 0.5, 0.9, 0.99, 0.999, 1]).astype(int)

array([    0,     5,    19,    85,   412,  1198, 15252])

In [39]:
log_out_degree = np.log(out_degree + 2)

In [40]:
log_out_degree /= log_out_degree.sum()

### Global PageRank

The initial rank is a column vector $\mathbf{r}$ = $\frac{1}{N} \left( \mathbf{\vec{1}} \right)$

The transition matrix $\mathbf{M}$ is N x N; each column represents a source, and each row represents a destination.
$\mathbf{M}_{ij} = P(\text{next}=i\,|\,\text{current}=j)$. Each column **must** sum to 1 for the calculation to be stable, so if page $j$ contains no links, it is treated as if it had a link to every page.

The power method iteratively computes better ranks: $\mathbf{r'} = (1 - \alpha) \mathbf{M}\mathbf{r} + \frac{\alpha}{N}$

### Personalized PageRank

Personalized PageRank uses a preference vector $\mathbf{p}$ in place of the uniform $\frac{1}{N}$ for _teleportation_. Pages with no out-links still use a uniform distribution. The initial rank can be any vector, because of the converging property of the power method (explanation at https://mathworld.wolfram.com/Eigenvector.html)

### Ending iteration

At each iteration, we calculate the [perplexity](https://en.wikipedia.org/wiki/Perplexity) of the PageRank distribution, where perplexity is defined as 2 raised to the [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) of the PageRank distribution, i.e., $2^{H(PR)}$. The initial guess is at maximum entropy, so the first iteration causes perplexity to decrease. Later iterations may change perplexity in either direction; we stop when the change is below a certain threshold.

In [41]:
def perplexity(distribution):
    return np.power(2, np.sum(-distribution * np.log2(distribution)))

def personalized_page_rank(preference, threshold=1, random_jump_prob=0.15):
    current_rank = np.ones(N, dtype=np.float64) / N
    next_rank = np.zeros(N, dtype=np.float64)
    # iteratively update current_rank
    edge_follow_prob = 1 - random_jump_prob
    prev_perplexity = float('inf')
    current_perplexity = perplexity(current_rank)
    current_iter = 0
    iter_start = time.time()
    print("Itr# | ΔPerplexity     | Seconds")
    while abs(prev_perplexity - current_perplexity) > threshold:
        current_iter += 1
        next_rank[:] = random_jump_prob * preference
        # update destinations from non-sink nodes (N x N times N x 1 -> N x 1)
        spread_probs = np.vstack([
            adjacency_matrix_slice.dot(current_rank[:, np.newaxis])
            for adjacency_matrix_slice in map(scipy.sparse.load_npz, files.adjacency_filenames(NUM_PARTITIONS))
        ])
        next_rank += edge_follow_prob * spread_probs[:, 0]  # make column vector 1-D
        # update destinations from sink nodes
        next_rank[:] += edge_follow_prob * current_rank[out_degree == 0].sum() / N
        # copy `next_rank` values into `current_rank``
        current_rank[:] = next_rank
        # --
        # compute perplexity and progress
        prev_perplexity = current_perplexity
        current_perplexity = perplexity(current_rank)
        next_iter_start = time.time()
        print("{:<3d}    {:<15.6f}   {:.3f}".format(current_iter,
                                                    current_perplexity - prev_perplexity,
                                                    next_iter_start - iter_start))
        iter_start = next_iter_start

    title_df = pl.DataFrame({
        "title": id_map.keys(),
        "node_id": id_map.values(),
    })
    df = pl.DataFrame({
        "value": next_rank,
        "in_deg": in_degree,
        "out_deg": out_degree,
    })
    df = (
        df.with_row_count(name="node_id")
        .with_columns(pl.col("node_id").cast(pl.Int64))
        .join(title_df, on="node_id", how="left")
    )
    df = df.select("title", "value", "in_deg", "out_deg")
    return df

In [42]:
# Run until perplexity changes by less than 1
PR = personalized_page_rank(log_out_degree)

Itr# | ΔPerplexity     | Seconds
1      -6073391.126198   5.526
2      144200.081121     4.078
3      -38439.087290     3.701
4      -2527.114476      3.601
5      -8461.559135      3.540
6      -2811.155265      3.381
7      -3236.445542      3.533
8      -1700.729249      3.487
9      -1473.381168      3.463
10     -927.252160       3.456
11     -725.435128       3.580
12     -489.155403       3.431
13     -372.249495       3.480
14     -257.465879       3.446
15     -194.636416       3.491
16     -136.994864       3.436
17     -103.115731       3.503
18     -73.490105        3.509
19     -55.399066        3.480
20     -39.718448        3.482
21     -30.075870        3.494
22     -21.660356        3.504
23     -16.469271        3.497
24     -11.905127        3.467
25     -9.095258         3.471
26     -6.589565         3.596
27     -5.060340         3.470
28     -3.672664         3.433
29     -2.834366         3.459
30     -2.059805         3.455
31     -1.597703         3.488
32    

In [43]:
PR.write_parquet(files.pagerank_parquet_filename)

In [44]:
PR_sorted = PR.sort('value', descending=True)

In [45]:
pager(PR_sorted.slice(0, 2000), 20)

interactive(children=(Dropdown(description='page', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …

<function nbhelpers.polars.pager.<locals>.<lambda>(page)>

In [46]:
searcher(
    PR_sorted.slice(0, 200000).with_columns(pl.Series("rank", range(200000))).select(["rank", *PR_sorted.columns]),
    ['title'],
    20
)

interactive(children=(Text(value='', description='q'), Output()), _dom_classes=('widget-interact',))

<function nbhelpers.polars.searcher.<locals>.searcher_run(q)>