## Prep notebook

In [50]:
import bz2
import json
import os
import random
import re
import string

import mwparserfromhell
import numpy as np
import pandas as pd
import requests

import findspark
findspark.init('/usr/lib/spark2')

from pyspark.sql import SparkSession

In [2]:
# Sanity check: print the path of the `python` executable that the notebook's shell resolves.
!which python

/usr/lib/anaconda-wmf/bin/python


In [3]:
# Proxy host/port settings, handed to the driver JVM as -D options.
proxy_settings = {
    'http.proxyHost': 'webproxy.eqiad.wmnet',
    'http.proxyPort': '8080',
    'https.proxyHost': 'webproxy.eqiad.wmnet',
    'https.proxyPort': '8080',
}
driver_java_options = ' '.join(
    '-D{}={}'.format(name, value) for name, value in proxy_settings.items()
)

# Assemble the YARN session one setting at a time rather than as a single chained expression.
builder = SparkSession.builder.appName('Pyspark notebook (isaacj -- wikitext)').master('yarn')
builder = builder.config('spark.driver.extraJavaOptions', driver_java_options)
# Disabled optional package, kept for reference:
#    .config('spark.jars.packages', 'graphframes:graphframes:0.6.0-spark2.3-s_2.11')
for option, setting in [
    ("spark.driver.memory", "2g"),
    ('spark.dynamicAllocation.maxExecutors', 64),
    ("spark.executor.memory", "8g"),
    ("spark.executor.cores", 4),
    ("spark.sql.shuffle.partitions", 256),
]:
    builder = builder.config(option, setting)

spark = builder.getOrCreate()
# Bare expression so the cell displays the session summary.
spark

## Parameters / Utilities

In [4]:
# Both values are interpolated into the SQL query in the data-gathering cell below.
snapshot = '2020-12'  # filter on wmf.mediawiki_wikitext_current; data will be current to this date -- e.g., 2020-05 means data is up to 30 April 2020 (at least)
wd_snapshot = '2020-12-07'  # filter on wmf.wikidata_item_page_link; closest Wikidata item-page-link snapshot to the data snapshot

In [32]:
def getCleanedText(wikitext):
    """Turn raw wikitext into a lowercase, space-delimited token string for fastText.

    Intended for space-delimited languages; the punctuation set and the
    category prefix may need adjusting for other wikis.

    Steps:
    * Parse with mwparserfromhell and strip the wiki markup
    * Lowercase the remaining text
    * Delete the characters " ' . , ? ( ) { }
    * Split on whitespace and drop every token that begins with 'category:'
      (only that token itself; this is mainly to prevent WikiProject
      categories from bleeding labels into data)

    Returns:
        The surviving tokens joined by single spaces, or None if any step
        raises an exception.
    """
    try:
        plain_text = mwparserfromhell.parse(wikitext).strip_code().lower()
        without_punctuation = re.sub('["\'.,?(){}]','', plain_text)
        kept_tokens = []
        for token in without_punctuation.split():
            if token.startswith('category:'):
                continue
            kept_tokens.append(token)
        return ' '.join(kept_tokens)
    except Exception:
        # Best-effort: a page that cannot be processed yields None instead of failing the caller.
        return None
    
# Register the cleaner as a Spark SQL function of the same name with a string return type.
spark.udf.register(name='getCleanedText', f=getCleanedText, returnType='String')

<function __main__.getCleanedText(wikitext)>

## Gather wikitext data and write to TSV

In [34]:
# Toggles: print the query collapsed onto one line, and/or actually run it through Spark.
print_for_hive = False
do_execute = True

# Join page -> Wikidata item for enwiki namespace-0 pages and clean each page's current wikitext.
query = f"""
WITH wikidata_ids AS (
    SELECT page_id,
           item_id
      FROM wmf.wikidata_item_page_link wd
     WHERE wd.snapshot = '{wd_snapshot}'
           AND wd.page_namespace = 0
           AND wiki_db = 'enwiki'
)
SELECT item_id,
       getCleanedText(revision_text) as cleaned_wikitext
  FROM wmf.mediawiki_wikitext_current wt
 INNER JOIN wikidata_ids wd
       ON (wt.page_id = wd.page_id)
 WHERE snapshot = '{snapshot}'
       AND wiki_db = 'enwiki'
       AND page_namespace = 0
"""

# Either the single-line form (newlines and runs of spaces collapsed) or the query as written.
print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip() if print_for_hive else query)

if do_execute:
    result = spark.sql(query)
    # Written as bzip2-compressed, tab-separated text with a header row.
    writer = result.write
    writer.csv(
        path="/user/isaacj/enwiki-cleaned-wikitext",
        sep="\t",
        header=True,
        compression="bzip2",
    )


WITH wikidata_ids AS (
    SELECT page_id,
           item_id
      FROM wmf.wikidata_item_page_link wd
     WHERE wd.snapshot = '2020-12-07'
           AND wd.page_namespace = 0
           AND wiki_db = 'enwiki'
)
SELECT item_id,
       getCleanedText(revision_text) as cleaned_wikitext
  FROM wmf.mediawiki_wikitext_current wt
 INNER JOIN wikidata_ids wd
       ON (wt.page_id = wd.page_id)
 WHERE snapshot = '2020-12'
       AND wiki_db = 'enwiki'
       AND page_namespace = 0



## Pull from HDFS to local

In [41]:
file_parts_dir = './text_file_parts/'
!rm -R {file_parts_dir}
!mkdir {file_parts_dir}
!hdfs dfs -copyToLocal enwiki-cleaned-wikitext/part* {file_parts_dir}

rm: cannot remove './text_file_parts/': No such file or directory


## Add labels and train/test split

In [42]:
# Base output path; every split file name is derived from it by adding a suffix before the extension.
base_fasttext_fn = './fasttext/wt_2020_12.txt'
# bz2-compressed groundtruth file, read line by line as JSON in the loading cell.
groundtruth_data = 'labeled_enwiki_with_topics_metadata.json.bz2'

# Fractions of labeled articles sent to each split.
train_prop = 0.9
val_prop = 0.02
test_prop = 0.08
# Compare with a tolerance: exact float equality rejects valid splits
# (e.g. 0.7 + 0.1 + 0.2 evaluates to 0.9999999999999999).
assert abs(train_prop + val_prop + test_prop - 1) < 1e-9, "split proportions must sum to 1"

# Split on the real extension instead of str.replace('.txt', ...), which would rewrite
# every '.txt' in the path and return identical names if the extension were different.
_stem, _ext = os.path.splitext(base_fasttext_fn)
train_fn = f'{_stem}_train{_ext}'
train_metadata_fn = f'{_stem}_train_metadata{_ext}'
val_fn = f'{_stem}_val{_ext}'
val_metadata_fn = f'{_stem}_val_metadata{_ext}'
test_fn = f'{_stem}_test{_ext}'
test_metadata_fn = f'{_stem}_test_metadata{_ext}'
nogroundtruth_fn = f'{_stem}_nogt{_ext}'
nogroundtruth_metadata_fn = f'{_stem}_nogt_metadata{_ext}'

In [43]:
def fasttextify(topic):
    """Convert an articletopic label to a fastText label.

    Every space in `topic` becomes an underscore and the result is prefixed
    with `__label__`.
    """
    underscored = '_'.join(topic.split(' '))
    return '__label__{0}'.format(underscored)

In [46]:
# load in groundtruth
qid_topics = {}
with bz2.open(groundtruth_data, 'rt') as fin:
    for line in fin:
        line = json.loads(line)
        qid = line.get('qid')
        topics = line.get('topics')
        if qid and topics:
            qid_topics[qid] = topics
print("{0} QIDs with topics.".format(len(qid_topics)))

5662388 QIDs with topics.


In [52]:
# Stream every part file once, routing each article to the train / val / test file
# (when its QID has groundtruth topics) or to the no-groundtruth file. Each data file
# gets a parallel metadata file holding the QID of the corresponding line.

# Create the output directory up front so the open() calls below cannot fail on a missing folder.
os.makedirs(os.path.dirname(base_fasttext_fn) or '.', exist_ok=True)

train_written = 0
val_written = 0
test_written = 0
nogt_written = 0
# QID -> random draw; a QID seen more than once reuses its draw and so stays in one split.
qids_to_split = {}
# Dedicated seeded generator: re-running the cell reproduces the same split
# without touching the global `random` state.
split_rng = random.Random(42)
input_header = ['item_id', 'cleaned_wikitext']
# Sorted because os.listdir returns names in arbitrary order and the sequence of
# random draws depends on the order in which files are read.
fns = sorted(fn for fn in os.listdir(file_parts_dir) if fn.endswith('.csv.bz2'))
with open(train_fn, 'w', encoding='utf-8') as train_fout, \
        open(train_metadata_fn, 'w', encoding='utf-8') as train_metadata_fout, \
        open(val_fn, 'w', encoding='utf-8') as val_fout, \
        open(val_metadata_fn, 'w', encoding='utf-8') as val_metadata_fout, \
        open(test_fn, 'w', encoding='utf-8') as test_fout, \
        open(test_metadata_fn, 'w', encoding='utf-8') as test_metadata_fout, \
        open(nogroundtruth_fn, 'w', encoding='utf-8') as nogt_fout, \
        open(nogroundtruth_metadata_fn, 'w', encoding='utf-8') as nogt_metadata_fout:
    for fidx, fn in enumerate(fns, start=1):
        with bz2.open(os.path.join(file_parts_dir, fn), 'rt', encoding='utf-8') as fin:
            header = next(fin).strip().split('\t')
            assert header == input_header
            for line_str in fin:
                # Strip only the line terminator: a plain strip() would also eat the tab
                # next to an empty field, so such rows would trip the length assert
                # instead of reaching the empty-value guard below.
                line = line_str.rstrip('\r\n').split('\t')
                assert len(line) == len(input_header)
                qid, wikitext = line
                if not wikitext or not qid:
                    continue
                topics = qid_topics.get(qid)
                if topics:
                    if qid in qids_to_split:
                        r = qids_to_split[qid]
                    else:
                        r = split_rng.random()
                        qids_to_split[qid] = r
                    if r <= train_prop:
                        data_fout = train_fout
                        metadata_fout = train_metadata_fout
                        train_written += 1
                    elif r <= train_prop + val_prop:
                        data_fout = val_fout
                        metadata_fout = val_metadata_fout
                        val_written += 1
                    else:
                        data_fout = test_fout
                        metadata_fout = test_metadata_fout
                        test_written += 1
                else:
                    # No groundtruth: written with an empty label list.
                    topics = []
                    data_fout = nogt_fout
                    metadata_fout = nogt_metadata_fout
                    nogt_written += 1
                data_fout.write('{0} {1}\n'.format(' '.join([fasttextify(t) for t in topics]), wikitext))
                metadata_fout.write('{0}\n'.format(qid))
        # Running totals after each part file.
        print("{0} of {1} processed: {2} train. {3} val. {4} test. {5} no groundtruth.".format(fidx, len(fns),
                                                                                               train_written,
                                                                                               val_written,
                                                                                               test_written,
                                                                                               nogt_written))

1 of 256 processed: 19895 train. 454 val. 1752 test. 2671 no groundtruth.
2 of 256 processed: 39897 train. 888 val. 3540 test. 5335 no groundtruth.
3 of 256 processed: 59612 train. 1312 val. 5315 test. 7899 no groundtruth.
4 of 256 processed: 79527 train. 1742 val. 7028 test. 10576 no groundtruth.
5 of 256 processed: 99350 train. 2175 val. 8774 test. 13305 no groundtruth.
6 of 256 processed: 119147 train. 2597 val. 10529 test. 15945 no groundtruth.
7 of 256 processed: 139220 train. 3022 val. 12307 test. 18566 no groundtruth.
8 of 256 processed: 159137 train. 3468 val. 14084 test. 21133 no groundtruth.
9 of 256 processed: 179111 train. 3926 val. 15818 test. 23712 no groundtruth.
10 of 256 processed: 198857 train. 4347 val. 17615 test. 26346 no groundtruth.
11 of 256 processed: 218957 train. 4815 val. 19382 test. 28944 no groundtruth.
12 of 256 processed: 238746 train. 5243 val. 21114 test. 31648 no groundtruth.
13 of 256 processed: 258463 train. 5703 val. 22903 test. 34254 no groundtrut

In [53]:
!ls -lht /home/isaacj/fasttext/

total 20G
-rw-r--r-- 1 isaacj wikidev   45M Jan 28 03:08 wt_2020_12_train_metadata.txt
-rw-r--r-- 1 isaacj wikidev   17G Jan 28 03:08 wt_2020_12_train.txt
-rw-r--r-- 1 isaacj wikidev  385M Jan 28 03:08 wt_2020_12_val.txt
-rw-r--r-- 1 isaacj wikidev 1012K Jan 28 03:08 wt_2020_12_val_metadata.txt
-rw-r--r-- 1 isaacj wikidev  6.0M Jan 28 03:08 wt_2020_12_nogt_metadata.txt
-rw-r--r-- 1 isaacj wikidev  978M Jan 28 03:08 wt_2020_12_nogt.txt
-rw-r--r-- 1 isaacj wikidev  4.0M Jan 28 03:08 wt_2020_12_test_metadata.txt
-rw-r--r-- 1 isaacj wikidev  1.5G Jan 28 03:08 wt_2020_12_test.txt
