## PageRank on Wikipedia

### Prerequisites

1. Download a dump of Wikipedia's articles, named `enwiki-{date_string}-pages-articles-multistream.xml.bz2`
2. Download the `enwiki-{date_string}-pages-articles-multistream-index.txt.bz2` file
3. Move those files into the same folder, removing the `enwiki-{date_string}` prefix
4. Process the `xml.bz2` file into a Parquet file using `wikiplain.load_bz2`

In [1]:
import asyncio
import glob
import gzip
import io
import itertools
import json
import math
import operator
import os
import pickle
import random
import re
import shutil
import socket
import struct
import subprocess
import sys
import tarfile
import time
from collections import ChainMap, defaultdict, deque
from contextlib import asynccontextmanager
from dataclasses import dataclass
from datetime import datetime
from enum import Enum, auto
from functools import lru_cache, partial
from urllib.parse import urlencode, urlsplit, quote as urlquote, unquote as urlunquote
from typing import Any, Awaitable, Callable, Literal, TypeVar

import numpy as np
import pyarrow.parquet as pq
import polars as pl
import sqlalchemy as sa
import scipy.sparse
from dotenv import load_dotenv
from ipywidgets import interact
from sqlalchemy import create_engine
from sqlalchemy.sql import select, text as sqltext
from tqdm.auto import tqdm
from arsenal.datastructures.unionfind import UnionFind

import wikiplain

In [2]:
load_dotenv()

True

In [3]:
pl.Config.set_fmt_str_lengths(160)

polars.config.Config

In [4]:
class PageRankFiles:
    def __init__(self, date_string):
        self.date_string = date_string
        self.enwiki_dir = f"{os.environ['ENWIKI_DIR']}/{date_string}"
        self.parquet_dir = os.environ.get('ENWIKI_PARQUET_DIR', self.enwiki_dir)
        try:
            os.mkdir(f"{self.enwiki_dir}/pagerank")
        except FileExistsError:
            pass
    
    @property
    def enwiki_parquet_filename(self):
        return f"{self.parquet_dir}/enwiki_{self.date_string}.parquet"
    
    @property
    def encategories_database_uri(self):
        return f"sqlite:///{self.enwiki_dir}/encategories.db"

    @property
    def nub_filename(self):
        return f"{self.enwiki_dir}/pagerank/nub.pkl"
    
    @property
    def id_maps_filename(self):
        return f"{self.enwiki_dir}/pagerank/id_maps.pkl"
    
    @property
    def dense_id_arr_filename(self):
        return f"{self.enwiki_dir}/pagerank/dense_id_arr.pkl"
    
    @property
    def edge_filename_pattern(self):
        return f"{self.enwiki_dir}/pagerank/edges_*.pkl"
    
    def edge_filenames(self, num_partitions):
        return [
            f"{self.enwiki_dir}/pagerank/edges_{i}.pkl"
            for i in range(num_partitions)
        ]

    @property
    def in_degree_filename(self):
        return f"{self.enwiki_dir}/pagerank/in_degree.pkl"
    
    @property
    def out_degree_filename(self):
        return f"{self.enwiki_dir}/pagerank/out_degree.pkl"
    
    @property
    def dab_array_filename(self):
        return f"{self.enwiki_dir}/pagerank/dab_array.pkl"
    
    def adjacency_filename(self, partition):
        return f"{self.enwiki_dir}/pagerank/adjacency_{partition}.npz"
    
    def adjacency_filenames(self, num_partitions):
        return [self.adjacency_filename(i) for i in range(num_partitions)]

In [5]:
files = PageRankFiles("20230301")

### Find title collisions

1. There are some pages with the same title - I think this is caused by pages deleted and recreated while the snapshot is in progress

In [6]:
pqf = pq.ParquetFile(files.enwiki_parquet_filename)

In [7]:
def get_overwritten():
    overwritten = set()
    timestamp_map = {}
    article_ids = {}
    pqf_size = 0
    for batch in tqdm(pqf.iter_batches(batch_size=100), total=pqf.num_row_groups):
        for aid, ns, ttl, tm in zip(batch["id"].to_numpy(), batch["ns"].to_numpy(), batch["title"].to_pylist(), batch["timestamp"].to_pylist()):
            pqf_size += 1
            if ns != 0:
                continue
            tm = np.datetime64(tm)
            other_id = article_ids.setdefault(ttl, aid)
            if other_id != aid:
                if (timestamp_map[ttl], other_id) < (tm, aid):
                    print(f"{ttl!r}: {aid} > {other_id}")
                    overwritten.add(other_id)
                    article_ids[ttl] = aid
                    timestamp_map[ttl] = tm
                else:
                    print(f"{ttl!r}: {other_id} > {aid}")
                    overwritten.add(aid)
            else:
                timestamp_map[ttl] = tm
    return overwritten, pqf_size

try:
    with open(files.nub_filename, "rb") as fp:
        overwritten, pqf_size = pickle.load(fp)
except Exception:
    overwritten, pqf_size = get_overwritten()
    with open(files.nub_filename, "wb") as fp:
        pickle.dump((overwritten, pqf_size), fp)

### Build representation of articles/links as a graph

1. Create `id_map` from non-redirecting article titles to node number, and `id_map2` from redirecting article titles to node number
2. Use `wikiplain` to extract link titles, and use above maps to convert to (src_id, dest_id) pairs

In [8]:
class Vec:
    def __init__(self, dtype):
        self.array = np.ndarray((1024,), dtype=dtype)
        self.length = 0
    
    @property
    def capacity(self):
        return self.array.shape[0]

    def append(self, v):
        idx = self.length
        if idx >= self.capacity:
            addsz = max(2, self.capacity)
            self.array = np.hstack((self.array, np.zeros((addsz,), dtype=self.array.dtype)))
        self.array[idx] = v
        self.length += 1

In [9]:
def get_id_maps():
    redirect_group_map = UnionFind()
    id_map = {}
    redirect_lst = []
    dense_ids = Vec(dtype=np.int64)
    for batch in tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size // 100)):
        for aid, ns, ttl, redir in zip(batch["id"].to_numpy(), batch["ns"].to_numpy(), batch["title"].to_pylist(), batch["redirect"].to_pylist()):
            if ns != 0 or aid in overwritten:
                continue
            if redir is not None:
                redirect_group_map.union(ttl, redir)
                redirect_lst.append(ttl)
            else:
                assert ttl not in id_map, f"Expected unique titles, got second instance of {ttl}"
                dense_ids.append(aid)
                id_map[ttl] = len(id_map)
    id_map2 = {}
    for group in redirect_group_map.classes():
        centers = [ttl for ttl in group if ttl in id_map]
        if len(centers) == 0:
            continue
        assert len(centers) == 1, str(centers)
        for ttl in group:
            if ttl != centers[0]:
                id_map2[ttl] = id_map[centers[0]]
    return id_map, id_map2, dense_ids.array[:dense_ids.length]

try:
    with open(files.id_maps_filename, "rb") as fp:
        id_map, id_map2 = pickle.load(fp)
    with open(files.dense_id_arr_filename, "rb") as fp:
        dense_id_arr = pickle.load(fp)
except Exception:
    id_map, id_map2, dense_id_arr = get_id_maps()
    with open(files.id_maps_filename, "wb") as fp:
        pickle.dump((id_map, id_map2), fp)
    with open(files.dense_id_arr_filename, "wb") as fp:
        pickle.dump(dense_id_arr, fp)

In [10]:
len(id_map), len(id_map2)

(6625358, 10408919)

In [11]:
list(itertools.islice(iter(id_map.keys()), 10))

['Anarchism',
 'Albedo',
 'A',
 'Alabama',
 'Achilles',
 'Abraham Lincoln',
 'Aristotle',
 'An American in Paris',
 'Academy Award for Best Production Design',
 'Academy Awards']

In [12]:
list(itertools.islice(iter(id_map2.keys()), 10))

['A11y',
 'Digital accessibility',
 'Accessible computing',
 'Open Accessibility Framework',
 'Accessible Computing',
 'AccessibleComputing',
 'Afghan history',
 'Afghanistan history',
 'Afghanistan/History',
 'Afghan History']

In [13]:
localsizes = (pl.DataFrame([(id(value), name, sys.getsizeof(value)) for name, value in locals().items()],
                          columns=['id', 'name', 'size'])
              .groupby('id')
              .agg(pl.max("size"), pl.col("name").apply(lambda ser: ser.to_list()))
              .sort('size', descending=True)
              .head(50)
             )
localsizes

  localsizes = (pl.DataFrame([(id(value), name, sys.getsizeof(value)) for name, value in locals().items()],


id,size,name
i64,i64,list[str]
140568502228480,335544408,"[""id_map2""]"
140568502348032,335544408,"[""id_map""]"
94030689935712,1859,"[""_i4""]"
94033791202928,1654,"[""_i9""]"
94033791160288,1309,"[""_i7""]"
94030649687744,1072,"[""TypeVar""]"
94030677887280,1072,"[""tqdm""]"
94030673125936,1072,"[""_3""]"
94030689909408,1072,"[""UnionFind""]"
94030649450528,1072,"[""Enum""]"


In [14]:
# %reset -f out

In [15]:
class PairVec:
    def __init__(self, dtype):
        self.array = np.ndarray((1024, 2), dtype=dtype)
        self.length = 0
    
    @property
    def capacity(self):
        return self.array.shape[0]

    def append(self, v1, v2):
        idx = self.length
        if idx >= self.capacity:
            addsz = max(2, self.capacity)
            self.array = np.vstack((self.array, np.zeros((addsz, 2), dtype=self.array.dtype)))
        self.array[idx] = [v1, v2]
        self.length += 1

In [16]:
def parse_wiki_link(line):
    dest_ttl = line.strip()
    if len(dest_ttl) == 0:
        return None
    dest_ttl = dest_ttl[0].upper() + dest_ttl[1:]
    dest_ttl = dest_ttl.split('|', maxsplit=1)[0]
    dest_ttl = dest_ttl.split('#', maxsplit=1)[0]
    return dest_ttl

In [17]:
LOG_PARTITION_SIZE = 16
PARTITION_SIZE = 1 << LOG_PARTITION_SIZE
N = len(id_map)
NUM_PARTITIONS = math.ceil(N / PARTITION_SIZE)

In [18]:
combined_id_map = ChainMap(id_map, id_map2)

In [19]:
N

6625358

### Edge format

- `edges_{n}.pkl` stores the outgoing links from `PARITION_SIZE*n ..< PARTITION_SIZE*(n+1)`
- These are stored in a list where element `i` contains the links out to `PARITION_SIZE*i ..< PARTITION_SIZE*(i+1)`

In [20]:
def chunk(iterable, size):
    """Split an iterable into list chunks of size `n`.
    
    The last chunk can be fewer than `n` elements long, but it won't be empty.
    """
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, size))
        if chunk:
            yield chunk
        else:
            return

def lazy_chunk(iterable, n):
    """Split an iterable into iterable chunks of size `n`.
    
    The last chunk can be fewer than `n` elements long, but it won't be empty.
    """
    iterator = iter(iterable)
    while True:
        try:
            first = next(iterator)
        except StopIteration:
            return
        yield itertools.chain([first], itertools.islice(iterator, n - 1))

In [21]:
def pager(DF, size):
    num_pages = math.ceil(DF.shape[0] / size)
    page_input = list(range(num_pages)) if num_pages < 1000 else (0, num_pages, 1)
    return interact(lambda page: DF.slice(page*size, size), page=page_input)

def searcher(DF, columns, page_size):
    def searcher_run(q):
        mask = (
            DF
            .select([pl.col(c).str.contains(q) for c in columns])
            .max(axis=1)
        )
        return DF.filter(mask).slice(0, page_size)
    return interact(searcher_run, q="")

In [22]:
in_degree = np.zeros(N, dtype=np.int32)
out_degree = np.zeros(N, dtype=np.int32)
# article_len_arr = np.zeros(N, dtype=np.int32)
def get_edges():
    with tqdm(position=1) as progress:
        iterator = tqdm(pqf.iter_batches(batch_size=100), total=math.ceil(pqf_size / 100))
        iterator = map(
            lambda b: zip(
                b["id"].to_numpy(),
                b["ns"].to_numpy(),
                map(operator.attrgetter("is_valid"), b["redirect"]),
                b["text"].to_pylist()
            ),
            iterator
        )
        iterator = itertools.chain.from_iterable(iterator)
        iterator = filter(lambda e: not e[2] and e[1] == 0 and e[0] not in overwritten, iterator)
        iterator = enumerate(map(operator.itemgetter(3), iterator))
        filenames = files.edge_filenames(NUM_PARTITIONS)
        for part_idx, subitr in enumerate(lazy_chunk(iterator, PARTITION_SIZE)):
            edges = [PairVec('int32') for _ in range(0, N, PARTITION_SIZE)]
            for src_id, text in subitr:
                for link in wikiplain.get_links(text):
                    dest_ttl = parse_wiki_link(link)
                    if (dest_id := id_map.get(dest_ttl) or id_map2.get(dest_ttl)) is not None:
                        partition = dest_id >> LOG_PARTITION_SIZE
                        edges[partition].append(src_id, dest_id)
                        in_degree[dest_id] += 1
                        out_degree[src_id] += 1
                        progress.update()
            with open(filenames[part_idx], "wb") as fp:
                pickle.dump([vec.array[:vec.length] for vec in edges], fp)

In [23]:
# def get_dab_array():
#     result = np.zeros(N, dtype=np.bool8)
#     dab_proc = subprocess.Popen(
#         ["wikiplain", "--fraction", "1", "-c", "only-dab", "--ns", "0", files.enwiki_database_filename],
#         stdout=subprocess.PIPE,
#         stderr=subprocess.PIPE
#     )
#     iterator = make_links_iter(dab_proc.stdout)
#     iterator = tqdm(iterator, position=0, total=len(id_map))
#     iterator = map(lambda pair: (pair[0].decode("utf-8"), pair[1]), iterator)
#     for n, subitr in enumerate(lazy_chunk(iterator, PARTITION_SIZE)):
#         for ttl, text in subitr:
#             src_id = id_map[ttl]
#             if len(text) > 0:
#                 result[src_id] = True
#     return result

In [24]:
# LDF = pl.scan_parquet(files.enwiki_parquet_filename)

In [41]:
edge_fnames = glob.glob(files.edge_filename_pattern)
try:
    assert set(edge_fnames) == set(files.edge_filenames(NUM_PARTITIONS))
    for fname in edge_fnames:
        with open(fname, "rb") as fp:
            assert len(pickle.load(fp)) == NUM_PARTITIONS
    with open(files.in_degree_filename, "rb") as fp:
        in_degree = pickle.load(fp)
    with open(files.out_degree_filename, "rb") as fp:
        out_degree = pickle.load(fp)
except Exception as exc:
    print(exc)
    get_edges()
    edge_fnames = glob.glob(files.edge_filename_pattern)
    with open(files.in_degree_filename, "wb") as fp:
        pickle.dump(in_degree, fp)
    with open(files.out_degree_filename, "wb") as fp:
        pickle.dump(out_degree, fp)

In [None]:
# try:
#     with open(files.dab_array_filename, "rb") as fp:
#         dab_array = pickle.load(fp)
# except Exception as exc:
#     print(exc)
#     dab_array = get_dab_array()
#     with open(files.dab_array_filename, "wb") as fp:
#         pickle.dump(dab_array, fp)

In [42]:
def compute_adjacency_matrix_slice(partition, progress):
    """Computes the slice of the adjacency matrix A starting at row p*S and ending before row (p+1)*S
    
    p=partition, S=PARTITION_SIZE, and A is defined so that
    A @ np.eye(N)[i] = v, a probability vector where
        v[j] = out-degree(i) > 0 | count((i,j) in E) / out-degree(i)
               otherwise         | 0
    """
    origin_row = partition * PARTITION_SIZE
    n_rows = min(PARTITION_SIZE, N - origin_row)
    index_arrs = []
    value_arrs = []
    for fname in glob.glob(files.edge_filename_pattern):
        with open(fname, "rb") as fp:
            vec = pickle.load(fp)[partition]
        # vec is
        #  [[src_id_0, dest_id_0],
        #   [src_id_1, dest_id_1],
        #   ...
        #  ]
        # Sort by (src,dest), make unique and get counts
        key_arr = (vec[:, 0].astype('int64') << 32) | vec[:, 1]
        _, order, count = np.unique(key_arr, return_index=True, return_counts=True)
        vec = vec[order]
        # Normalize `count` based on (src,)
        count = count.astype('float64') / out_degree[vec[:, 0]]
        index_arrs.append(vec)
        value_arrs.append(count)
        progress.update()
    index_arr = np.vstack(index_arrs)
    matrix_slice = scipy.sparse.csr_array(
        (np.hstack(value_arrs), (index_arr[:, 1] - origin_row, index_arr[:, 0])),
        shape=(n_rows, N),
        dtype=np.float64
    )
    return matrix_slice

In [43]:
with tqdm(total=NUM_PARTITIONS**2) as progress:
    for partition in range(NUM_PARTITIONS):
        adj_matrix = compute_adjacency_matrix_slice(partition, progress)
        scipy.sparse.save_npz(files.adjacency_filename(partition), adj_matrix, compressed=False)

  0%|          | 0/10404 [00:00<?, ?it/s]

In [44]:
np.quantile(out_degree, [0, 0.1, 0.5, 0.9, 0.99, 0.999, 1]).astype(int)

array([    0,     8,    28,   104,   460,  1466, 25044])

In [45]:
log_out_degree = np.log(out_degree + 2)

In [46]:
log_out_degree /= log_out_degree.sum()

### Global PageRank

The initial rank is a column vector $\mathbf{r}$ where $\mathbf{r}_i = \frac{1}{N}$

The transition matrix $\mathbf{M}$ is N x N; each column represents a source, and each row represents a destination.
$\mathbf{M}_{ij} = P(\text{next}=i\,|\,\text{current}=j)$. Each column **must** sum to 1 for the calculation to be stable, so if page $j$ contains no links, it is treated as if it had a link to every page.

The power method iteratively computes better ranks: $\mathbf{r'} = (1 - \alpha) \mathbf{M}\mathbf{r} + \frac{\alpha}{N}$

### Personalized PageRank

Personalized PageRank uses a preference vector $\mathbf{p}$ in place of the uniform $\frac{1}{N}$ for _teleportation_. Pages with no out-links still use a uniform distribution. The initial rank can be any vector, because of the converging property of the power method (explanation at https://mathworld.wolfram.com/Eigenvector.html)

### Ending iteration

At each iteration, we calculate the [perplexity](https://en.wikipedia.org/wiki/Perplexity) of the PageRank distribution, where perplexity is defined as 2 raised to the [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) of the PageRank distribution, i.e., $2^{H(PR)}$. The initial guess is at maximum entropy, so the first iteration causes perplexity to decrease. Later iterations may change perplexity in either direction; we stop when the change is below a certain threshold.

In [47]:
def perplexity(distribution):
    return np.power(2, np.sum(-distribution * np.log2(distribution)))

def personalized_page_rank(preference, threshold=1, random_jump_prob=0.15):
    current_rank = np.ones(N, dtype=np.float64) / N
    next_rank = np.zeros(N, dtype=np.float64)
    # iteratively update current_rank
    edge_follow_prob = 1 - random_jump_prob
    prev_perplexity = float('inf')
    current_perplexity = perplexity(current_rank)
    current_iter = 0
    iter_start = time.time()
    print("Itr# | ΔPerplexity     | Seconds")
    while abs(prev_perplexity - current_perplexity) > threshold:
        current_iter += 1
        next_rank[:] = random_jump_prob * preference
        # update destinations from non-sink nodes (N x N times N x 1 -> N x 1)
        spread_probs = np.vstack([
            adjacency_matrix_slice.dot(current_rank[:, np.newaxis])
            for adjacency_matrix_slice in map(scipy.sparse.load_npz, files.adjacency_filenames(NUM_PARTITIONS))
        ])
        next_rank += edge_follow_prob * spread_probs[:, 0]  # make column vector 1-D
        # update destinations from sink nodes
        next_rank[:] += edge_follow_prob * current_rank[out_degree == 0].sum() / N
        # copy `next_rank` values into `current_rank``
        current_rank[:] = next_rank
        # --
        # compute perplexity and progress
        prev_perplexity = current_perplexity
        current_perplexity = perplexity(current_rank)
        next_iter_start = time.time()
        print("{:<3d}    {:<15.6f}   {:.3f}".format(current_iter,
                                                    current_perplexity - prev_perplexity,
                                                    next_iter_start - iter_start))
        iter_start = next_iter_start
    df = pl.DataFrame({
        "title": id_map.keys(), "value": next_rank, "in_deg": in_degree, "out_deg": out_degree,
    })
    return df

In [48]:
# Run until perplexity changes by less than 1
PR = personalized_page_rank(log_out_degree)

Itr# | ΔPerplexity     | Seconds
1      -6590087.373375   41.637
2      -18066.651574     32.785
3      -7991.425620      31.674
4      -3135.253593      30.739
5      -1614.272158      13.630
6      -903.522792       6.233
7      -552.961068       5.260
8      -355.355849       3.696
9      -238.212649       4.155
10     -164.180308       4.471
11     -115.689300       4.014
12     -82.804801        4.218
13     -59.988596        4.146
14     -43.842793        3.895
15     -32.258310        3.761
16     -23.852399        4.413
17     -17.703208        3.900
18     -13.175938        4.191
19     -9.827130         3.918
20     -7.340991         3.844
21     -5.490338         3.616
22     -4.109888         4.532
23     -3.078600         3.718
24     -2.307258         3.636
25     -1.729838         3.504
26     -1.297300         3.531
27     -0.973129         3.804


In [49]:
PR_sorted = PR.sort('value', descending=True)

In [50]:
pager(PR_sorted.slice(0, 2000), 20)

interactive(children=(Dropdown(description='page', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …

<function __main__.pager.<locals>.<lambda>(page)>

In [51]:
searcher(
    PR_sorted.slice(0, 200000).with_columns(pl.Series("rank", range(200000))).select(["rank", *PR_sorted.columns]),
    ['title'],
    20
)

interactive(children=(Text(value='', description='q'), Output()), _dom_classes=('widget-interact',))

<function __main__.searcher.<locals>.searcher_run(q)>

In [33]:
real_words = set()
upper_words = set()
with open("debian-american-english-insane.txt", "r") as fp:
    for w in map(str.rstrip, fp.readlines()):
        if len(w) == 0:
            continue
        if w.islower():
            real_words.add(w)
        else:
            upper_words.add(w.lower())
real_words.difference_update(upper_words)

In [34]:
PR_value = PR.loc[:, "value"].values

In [62]:
def merge_iterators(*iterables, key):
    iterators = [iter(x) for x in iterables]
    n_open = len(iterators)
    is_open = [True for _ in iterators]
    heads = [None for _ in iterators]
    pred_is_open = pipe_through(operator.itemgetter(0), is_open.__getitem__)
    for idx, itr in enumerate(iterators):
        try:
            heads[idx] = next(itr)
        except StopIteration:
            n_open -= 1
            is_open[idx] = False
    while n_open > 0:
        idx, min_val = min(filter(pred_is_open, enumerate(heads)), key=lambda p: key(p[1]))
        yield min_val
        try:
            heads[idx] = next(iterators[idx])
        except StopIteration:
            n_open -= 1
            is_open[idx] = False

In [101]:
class BoundedIter:
    def __init__(self, iterator, stop):
        self.iterator = iterator
        self.curr = 0
        self.stop = stop
    
    @classmethod
    def items(cls, mapping, use_tqdm=False):
        total = len(mapping)
        iterator = iter(tqdm(mapping.items(), total=total)) if use_tqdm else iter(mapping.items())
        return cls(iterator, total)
    
    @classmethod
    def items_by_value_asc(cls, *mappings, use_tqdm=False):
        total = sum(len(e) for e in mappings)
        iterator = merge_iterators(*(e.items() for e in mappings), key=operator.itemgetter(1))
        if use_tqdm:
            iterator = iter(tqdm(iterator, total=total))
        return cls(iterator, total)
    
    def get(self, default=None):
        if self.curr >= self.stop:
            return default, False
        self.curr += 1
        return next(self.iterator), True
    
    def __enter__(self):
        return self
    
    def __exit__(self, _1, _2, _3):
        try:
            next(self.iterator)
        except StopIteration:
            pass

In [58]:
list(itertools.islice(id_map.items(), 10))

[('Anarchism', 0),
 ('Albedo', 1),
 ('A', 2),
 ('Alabama', 3),
 ('Achilles', 4),
 ('Abraham Lincoln', 5),
 ('Aristotle', 6),
 ('An American in Paris', 7),
 ('Academy Award for Best Production Design', 8),
 ('Academy Awards', 9)]

In [154]:
dab_array.sum()

440055

In [74]:
@lru_cache(maxsize=1)
def load_adjacency_slice(partition):
    return scipy.sparse.load_npz(files.adjacency_filenames(NUM_PARTITIONS)[partition])

In [194]:
aid_ex = id_map["List of bus routes in London"]
PR_sorted.loc[np.nonzero(load_adjacency_slice(aid_ex >> LOG_PARTITION_SIZE)[aid_ex % PARTITION_SIZE].multiply(dab_array))[1]]

Unnamed: 0_level_0,value,in_deg,out_deg,title
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4715,2.627924e-08,8.0,120.0,E2
23005,2.221949e-08,4.0,54.0,D6
23911,2.721998e-08,6.0,129.0,E1
136013,2.407886e-08,6.0,73.0,E6
162896,1.762343e-08,0.0,14.0,T33
...,...,...,...,...
4785598,1.903712e-08,0.0,16.0,T32
4852693,1.516573e-08,0.0,8.0,Route 67 (disambiguation)
4852698,1.421268e-08,0.0,6.0,Route 82 (disambiguation)
5158757,2.198999e-08,2.0,44.0,222 (disambiguation)


In [181]:
with BoundedIter.items_by_value_asc(id_map, id_map2, use_tqdm=True) as combined_id_map_itr:
    iter_open = True
    flip_flop = (0, 1)
    with open(f"/media/dylan/easystore/caps/data/enwiki/pagerank/keywords_{flip_flop[1]}.tsv", "w"):
        pass
    while iter_open:
        phrase_article_lst = []
        while iter_open and len(phrase_article_lst) < 5_000_000:
            (ttl, aid), iter_open = combined_id_map_itr.get(default=("",0))
            special = 2 if "(disambiguation)" in ttl else 0
            ttld = ttl.replace("(disambiguation)", "").strip()
            words = getWords(ttld.lower().replace("'", "+").replace("&", "+"))
            if len(words) == 0:
                continue
            if len(words) == 1 and len(words[0]) <= 2:
                continue
            if ttl in id_map2:
                if ttld[0] == ttld[-1] and not ttld[0].isalnum():
                    continue
                if len(words) == 1:
                    if ttld.endswith(".") or ttld.startswith("."):
                        continue
                    if ttld.isupper():
                        orig = PR['title'].values[id_map2[ttl]]
                        if len(ttld) <= len(getWords(orig)):
                            continue
            phrase_article_lst.append((json.dumps(words), aid, "{:.6g}".format(PR_value[aid]), special))
            partition = aid >> LOG_PARTITION_SIZE
            adj = load_adjacency_slice(partition)
            inlinks = adj[aid % PARTITION_SIZE]
            # elementwise multiply() call, because sparse arrays make `*` matrix-multiplication
            # nonzero returns (dim_0_indices, dim_1_indices). inlinks is 1 x N (sparse matrices can't be indexed to vectors)
            if ttl in id_map:
                for aid2 in np.nonzero(inlinks.multiply(dab_array))[1]:
                    ttl2 = PR['title'].values[aid2]
                    ttl2 = ttl2.replace("(disambiguation)", "")
                    words = getWords(ttl2.lower().replace("'", "+").replace("&", "+"))
                    if aid == aid_ex:
                        print((aid2, json.dumps(words), aid, "{:.6g}".format(PR_value[aid]), special | 1))
                    phrase_article_lst.append((json.dumps(words), aid, "{:.6g}".format(PR_value[aid]), special | 1))
            # if len(words) > 1:
            #     for w in words:
            #         if len(w) > 2 and w not in real_words:
            #             phrase_article_lst.append((json.dumps([w]), aid, "{:.6g}".format(PR_value[aid]), special | 1))
        if len(phrase_article_lst) == 0:
            break
        phrase_article_lst.sort(key=operator.itemgetter(0))
        with open(f"/media/dylan/easystore/caps/data/enwiki/pagerank/keywords_{flip_flop[0]}.tsv", "w") as wfile:
            with open(f"/media/dylan/easystore/caps/data/enwiki/pagerank/keywords_{flip_flop[1]}.tsv", "r") as rfile:
                liter = map(lambda row: "\t".join(str(v) for v in row) + "\n", phrase_article_lst)
                riter = iter(rfile.readline, "")
                for row_str in merge_iterators(liter, riter, key=lambda x: x):
                    wfile.write(row_str)
        flip_flop = flip_flop[::-1]
shutil.move(f"/media/dylan/easystore/caps/data/enwiki/pagerank/keywords_{flip_flop[1]}.tsv", f"/media/dylan/easystore/caps/data/enwiki/pagerank/keywords_{flip_flop[0]}.tsv")
shutil.move(f"/media/dylan/easystore/caps/data/enwiki/pagerank/keywords_{flip_flop[0]}.tsv", "/media/dylan/easystore/caps/data/enwiki/pagerank/keywords.tsv")

  0%|          | 0/16666960 [00:00<?, ?it/s]

(11591, '["phoenix"]', 25008, '5.62075e-06', 1)
(85417, '["melrose"]', 25008, '5.62075e-06', 1)
(281448, '["kcac"]', 25008, '5.62075e-06', 1)
(449299, '["fengcheng"]', 25008, '5.62075e-06', 1)
(467388, '["paradise", "valley"]', 25008, '5.62075e-06', 1)
(480368, '["phx"]', 25008, '5.62075e-06', 1)
(548989, '["5", "cs"]', 25008, '5.62075e-06', 1)
(828246, '["inner", "loop"]', 25008, '5.62075e-06', 1)
(1044917, '["sunnyslope"]', 25008, '5.62075e-06', 1)
(1123471, '["arizona", "serial", "killer"]', 25008, '5.62075e-06', 1)
(1158714, '["list", "of", "streets", "named", "after", "martin", "luther", "king", "jr"]', 25008, '5.62075e-06', 1)
(1540670, '["maricopa", "freeway"]', 25008, '5.62075e-06', 1)
(1546743, '["good", "samaritan", "hospital"]', 25008, '5.62075e-06', 1)
(1681923, '["dobbins", "creek"]', 25008, '5.62075e-06', 1)
(1991985, '["phoenix", "airport"]', 25008, '5.62075e-06', 1)
(2052582, '["kool", "fm"]', 25008, '5.62075e-06', 1)
(2140249, '["gateway", "airport"]', 25008, '5.62075e

'/media/dylan/easystore/caps/data/enwiki/pagerank/keywords.tsv'

In [135]:
one_word_freq = apply_chain(
    requests.get('https://www.anc.org/SecondRelease/data/ANC-token-count.txt', verify=False).text,
    io.StringIO,
    partial(pd.read_csv, sep='\t', skipfooter=1, header=None, index_col=0, keep_default_na=False),
    lambda df: dict(df.iloc[:,[1]].itertuples())
)

  result = transform(result)


In [185]:
localsizes = (pd.DataFrame([(id(value), name, sys.getsizeof(value)) for name, value in locals().items()],
                          columns=['id', 'name', 'size'])
              .groupby('id')
              .agg({'size': 'max', 'name': lambda ser: ser.to_list()})
              .sort_values('size', ascending=False)
              .head(50)
             )
localsizes

Unnamed: 0_level_0,size,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
140375909186720,838447930,[PR_sorted2]
140375909185712,832810727,[PR_sorted3]
140373426495040,733991546,[PR_sorted]
140375848156560,681763482,[PR]
140372850353216,335544408,[id_map2]
140379328004544,335544408,[id_map]
140373163787136,126856312,[query_weight_offsets]
140374438396736,126856312,[query_answers]
140376339482160,52228304,[article_query_count]
140373887805904,52228304,[article_extscores]


In [182]:
query_weight_offsets = []
query_weights = Vec(dtype="i4, f4")
article_query_count = np.zeros(N, dtype=int)
with open(f"{files.enwiki_dir}/pagerank/keywords.tsv", "r", encoding="utf-8") as rfile:
    with open(f"{files.enwiki_dir}/wikiwords.json", "w", encoding="utf-8") as wfile:
        wfile.write("[")
        itr = map(lambda s: s.split("\t"), tqdm(rfile))
        for el, group in itertools.groupby(itr, key=operator.itemgetter(0)):
            meta = {}
            for query_s, aid_s, pr_s, special_s in group:
                query = json.loads(query_s)
                aid = int(aid_s)
                pr = float(pr_s)
                special = int(special_s)
                pr_norm = (pr * 1e6)**2
                score = [1, 0.5, 0.2, 0.1][special]
                if len(query) <= 2:
                    prob = np.product([one_word_freq.get(w, 0) for w in query])
                    score *= (1e-7 / max(1e-7, prob)) ** 0.5
                meta.setdefault(aid, (pr_norm, score))
                if pr_norm * score > meta[aid][0] * meta[aid][1]:
                    meta[aid] = (pr_norm, score)
            query_weight_offsets.append(query_weights.length)
            total_pr = sum(v1 for v1, _ in meta.values())
            for aid, (v1, v2) in meta.items():
                query_weights.append((aid, v1 * v2 / total_pr))
                article_query_count[aid] += 1
            if len(query_weight_offsets) > 1:
                wfile.write(",")
            wfile.write(el)
        wfile.write("]")
        query_weight_offsets.append(query_weights.length)

0it [00:00, ?it/s]

In [39]:
SEARCHER_CLI_PORT = 11111

In [188]:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn:
    # conn.setsockopt()
    conn.connect(('127.0.0.1', SEARCHER_CLI_PORT))
    wr = conn.makefile(mode='w', encoding="utf-8")
    total = os.stat(f"{files.enwiki_dir}/wikiwords.json").st_size
    conn.send(struct.pack(">I", total))
    with tqdm.wrapattr(open(f"{files.enwiki_dir}/wikiwords.json", "rb"), "read") as fp:
        while True:
            chunk = fp.read(131072)
            if not chunk:
                break
            conn.send(chunk)
    # conn.shutdown(socket.SHUT_WR)
    resp = b''
    piece = conn.recv(4096)
    while piece:
        resp += piece
        if piece.endswith(b'\n'):
            break
        piece = conn.recv(4096)
    query_answers = json.loads(resp)

0it [00:00, ?it/s]

In [189]:
article_extscores = np.zeros(N, dtype=np.float64)
article_extcount = np.zeros(N, dtype=int)
for st, en, v in zip(query_weight_offsets, query_weight_offsets[1:], query_answers):
    for aid, weight in query_weights.array[st:en]:
        article_extscores[aid] += v * weight
        article_extcount[aid] += v

In [190]:
PR_sorted2 = (PR.assign(title=list(id_map.keys()), extscore=article_extscores, extcount=article_extcount)
              .sort_values("extscore", ascending=False))
pager(PR_sorted2.iloc[:2000], 20)

interactive(children=(Dropdown(description='p', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…

<function nbhelpers.pager.<locals>.<lambda>(p)>

In [147]:
query_weights.length

31840734

In [192]:
searcher(PR_sorted2.iloc[:200000].reset_index(drop=True), 'title', 20)

interactive(children=(Text(value='', description='q'), Output()), _dom_classes=('widget-interact',))

<function nbhelpers.searcher.<locals>.searcher_run(q)>

In [195]:
article_extscores_n = article_extscores ** 0.25
PPR = personalized_page_rank(article_extscores_n / article_extscores_n.sum())

Itr# | ΔPerplexity     | Seconds
1      -6525347.388890   39.314
2      -2842.834844      15.478
3      -214.461130       15.371
4      -42.215317        11.194
5      -13.093852        4.275
6      -4.833981         5.181
7      -1.927023         6.612
8      -0.791616         7.226


In [196]:
PPR_sorted = PPR.assign(title=list(id_map.keys())).sort_values('value', ascending=False)

In [197]:
pager(PPR_sorted.iloc[:2000], 20)

interactive(children=(Dropdown(description='p', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…

<function nbhelpers.pager.<locals>.<lambda>(p)>

In [198]:
people_df = pd.read_csv('people__gameppl.tsv', sep='\t', index_col=0)

In [199]:
people_df = people_df.assign(article_id=people_df['original'].apply(combined_id_map.get))

In [200]:
people_df[['original', 'article_id', 'total_views']]

Unnamed: 0_level_0,original,article_id,total_views
escaped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
donald trump,Donald Trump,968234,612086318.0
elizabeth ii,Elizabeth II,1676170,123587458.0
elon musk,Elon Musk,301338,98882645.0
cristiano ronaldo,Cristiano Ronaldo,236516,87726046.0
barack obama,Barack Obama,211964,86718357.0
...,...,...,...
dane dehaan,Dane DeHaan,3290933,6007290.0
atticus shaffer,Atticus Shaffer,2860826,6002764.0
sam harris,Sam Harris,608251,6002712.0
danny mcbride,Danny McBride,1735895,6001988.0


In [201]:
def precision_recall(ids, n):
    df = PPR_sorted.iloc[:n]
    relevant = df.index.isin(ids)
    precision = relevant.mean()
    recall = relevant.sum() / len(ids)
    return precision, recall

In [210]:
precision_recall(people_df['article_id'], 400000)

(0.00826, 0.9889254714157438)

In [211]:
pager(people_df[~people_df['article_id'].isin(PPR_sorted.iloc[:400000].index)], 20)

interactive(children=(Dropdown(description='p', options=(0, 1), value=0), Output()), _dom_classes=('widget-int…

<function nbhelpers.pager.<locals>.<lambda>(p)>

In [219]:
PPR_sorted.to_csv(f'{files.enwiki_dir}/pagerank/ppr-20220720.csv')

In [222]:
def search_ppr(q):
    ql = [s for s in q.split("|") if len(s)]
    result = [unwrap_apply(id_map.get(k), PPR_sorted.index.get_loc) for k in ql]
    return pd.Series(result, index=ql, dtype=np.float64).sort_values()

interact(search_ppr, q="")

interactive(children=(Text(value='', description='q'), Output()), _dom_classes=('widget-interact',))

<function __main__.search_ppr(q)>

In [None]:
engine2 = create_engine(files.encategories_database_uri)


In [102]:
'Eddie Fisher' in set(PR_sorted['title'])

True

In [38]:
id_arr = np.array(list(id_map.values()))
id_arr == np.array(range(N))

array([ True,  True,  True, ...,  True,  True,  True])

In [39]:
id_arr

array([       0,        1,        2, ..., 11593659, 11593660, 11593661])