In [None]:
import codecs
import datetime

import dateutil
import dateutil.parser
import msgpack
from sqlalchemy import select

import database
from database.schema import article_name_id
import logbook

In [None]:
# --- Input / output files ---------------------------------------------------
# msgpack file holding the rename history; read in full by the loading cell.
infile = "output/02_sort_rename_map/2016-10-12 13:21:44 a340754/sorted_rename_map.m"
# Renames whose target cannot be matched to a known page are written to this
# TSV file (resolved through exp.get_filename, encoded as UTF-16-LE).
skipfile = "skipped.utf-16-le.tsv"

# --- Time bounds ------------------------------------------------------------
# dumptime:  used as the end of validity ("to_ts") for names that are current.
# starttime: used as the start of validity ("from_ts") for original names.
dumptime, starttime = map(
    dateutil.parser.parse,
    ("2016-09-20T00:00:00Z",
     "2001-01-15T00:00:00Z"))

# --- Experiment bookkeeping -------------------------------------------------
exp_name = "03_create_name_id_map"
exp = logbook.Experiment(exp_name)
log = exp.get_logger()

In [None]:
# Title prefixes (the text before the first ':') that mark a page as neither
# an article nor an article talk page. Every subject namespace listed below is
# excluded together with its "<namespace> talk" companion.
exclude_namespaces = {
    prefix
    for namespace in (
        'User',
        'Wikipedia',
        'WP',
        'Project',
        'File',
        'Image',
        'MediaWiki',
        'Template',
        'Help',
        'Category',
        'Portal',
        'Book',
        'Draft',
        'Education Program',
        'TimedText',
        'Module',
        'Topic',
        'Gadget',
        'Special',
        'Media',
    )
    for prefix in (namespace, namespace + ' talk')
}

In [None]:
# Load the msgpack-encoded rename history; the file is read into memory in
# one go and unpacked into `rename_map`.
log.info("Reading rename map")
with open(infile, "rb") as packed_file:
    rename_map = msgpack.unpackb(packed_file.read())
log.info("  Done")

In [None]:
# Turn the UTF-8 byte strings in the "old_name" / "new_name" columns of
# `rename_map` into unicode, in place.
#
# Only values that are still byte strings are decoded, so the cell can be
# re-run safely: calling .decode() a second time on an already-decoded title
# would fail for any non-ASCII name.
log.info("Decoding unicode in rename map")
for column in ("old_name", "new_name"):
    names = rename_map[column]
    for idx, name in enumerate(names):
        if isinstance(name, bytes):
            names[idx] = name.decode("utf-8")
# No need to decode timestamp because it's in ascii
log.info("  Done")

In [None]:
# Select id, namespace and title of every page whose namespace is 0 or 1.
# The connection opened here stays open so the next cell can iterate `result`.
log.info("Querying database for page ids")
conn = database.engine.connect()
page_table = database.schema.WP_Page.__table__
page_columns = [
    page_table.c.page_id,
    page_table.c.page_namespace,
    page_table.c.page_title,
]
stmt = select(page_columns).where(page_table.c.page_namespace.in_([0, 1]))
result = conn.execute(stmt)

In [None]:
# Build `pending`: title -> (page_id, end of validity) for every page returned
# by the query. For current names the end of validity is the date of the dump.
#
# The loop runs inside try/finally so the connection is closed even when a
# row cannot be processed (e.g. a title that is not valid UTF-8).
log.info("  Processing query results")
pending = {}
try:
    for row in result:
        page_id = row[0]
        page_namespace = row[1]
        # Titles come back as UTF-8 bytes with '_' in place of spaces.
        page_title = row[2].decode("utf-8").replace(u'_', u' ')
        if page_namespace == 1:
            # Namespace 1 rows get the "Talk:" prefix so the keys line up with
            # the names used in the rename map.
            title = u"Talk:%s" % page_title
        else:
            title = page_title
        pending[title] = (page_id, dumptime)
finally:
    conn.close()
log.info("  Done")

In [None]:
# Walk the rename history backwards, starting from the titles that exist at
# dump time, and record for every (name, page id) pair the time span during
# which the page carried that name. Rows are written to `article_name_id` in
# batches; renames that cannot be attached to a known page are written to the
# skip file instead.

# The row buffer is flushed once it holds more than this many entries.
BATCH_SIZE = 10000


def _insert_batch(connection, rows):
    """Insert all `rows` into `article_name_id` with a single statement."""
    connection.execute(article_name_id.insert().values(rows))


log.info("Tracing current articles back in time")
# Connect before entering the try block: if connecting fails there is nothing
# to close, and the `finally` clause must not mask the original error.
conn = database.engine.connect()
try:
    name_id_from_to = []
    rename_count = len(rename_map["timestamp"])
    with codecs.open(exp.get_filename(skipfile), "w", encoding="utf-16-le") as skipped:
        # Iterate in reverse through rename history
        for processed, j in enumerate(reversed(range(rename_count))):
            old = rename_map["old_name"][j]
            new = rename_map["new_name"][j]
            ts_from = dateutil.parser.parse(rename_map["timestamp"][j])
            if ts_from > dumptime:
                # Rename happened after the dump; not reflected in `pending`.
                continue
            try:
                # Get id of renamed article
                page_id, ts_to = pending[new]
            except KeyError:
                if new.split(u":")[0] in exclude_namespaces:
                    # Not article or talk
                    continue
                # Target name is unknown: record the rename for inspection.
                skipped.write(
                    "%s\t%s\t%s\n" %
                    (ts_from.isoformat(), old, new))
                continue
            # Older name is now pending
            # NOTE(review): if `old` is already a key of `pending`, its
            # previous (page_id, to_ts) value is replaced here without being
            # saved - confirm that this is intended.
            del pending[new]
            pending[old] = (page_id, ts_from)
            # Save entry
            name_id_from_to.append({
                "article_name": new,
                "article_id": page_id,
                "from_ts": ts_from,
                "to_ts": ts_to})
            if len(name_id_from_to) > BATCH_SIZE:
                log.info("  Inserting batch")
                _insert_batch(conn, name_id_from_to)
                name_id_from_to = []
                log.info("  %d/%d processed" % (processed, rename_count))

        # Whatever is still pending was never the target of an earlier
        # rename: these names are valid from `starttime` onwards.
        log.info("  Adding entries for original article names")
        pending_count = len(pending)
        count = 0
        for title, (page_id, ts_to) in pending.iteritems():
            name_id_from_to.append({
                "article_name": title,
                "article_id": page_id,
                "from_ts": starttime,
                "to_ts": ts_to})
            count += 1
            if len(name_id_from_to) > BATCH_SIZE:
                log.info("    Inserting batch")
                _insert_batch(conn, name_id_from_to)
                name_id_from_to = []
                log.info("    %d of %d complete" % (count, pending_count))

        log.info("  Inserting final batch")
        if name_id_from_to:
            _insert_batch(conn, name_id_from_to)
            name_id_from_to = []
            # Report progress of the loop that produced this batch.
            log.info("    %d of %d complete" % (count, pending_count))
finally:
    conn.close()
log.info("  Done")