In [1]:
import calendar
import codecs
import datetime
import dateutil
import json
import sys
import msgpack
from sqlalchemy import and_, select
import database
import logbook

In [2]:
# Name of this experiment; handed to logbook.Experiment below.
exp_name = "07_create_article_project_map"

# Input: one JSON object per line with "project_id" and "project_name" keys.
project_file = "data/projects-2016-10-14.json"

# Outputs (resolved through exp.get_filename when they are opened).
out_file = "articles_projects.m"
skip_assess_file = "skipped_assessments.tsv"
skip_timespan_file = "skipped_timespans.tsv"

# "2016-04-01T00:00:00Z" as seconds since the epoch (== 1459468800).
# Used as the end time for articles that are never renamed or removed.
dump_dt = calendar.timegm((2016, 4, 1, 0, 0, 0))

exp = logbook.Experiment(exp_name)

In [3]:
# Build {project_id: project_name} from the line-delimited JSON project file.
with open(project_file, "rb") as project_fh:
    project_records = (json.loads(line) for line in project_fh)
    project_names = dict(
        (record["project_id"], record["project_name"])
        for record in project_records)

In [4]:
def _write_skipped_row(skipped, fields):
    '''Write ``fields`` to ``skipped`` as one tab-separated unicode line.'''
    skipped.write(u"\t".join([str(x).decode('utf-8') for x in fields]) + u"\n")


def get_article_dates(project_id, conn, skipped):
    '''Return list of (article_name_utf8, start_time, end_time).

    Replays the project's Rating rows in timestamp order and tracks, per
    article name (and its matching "Talk:<name>" page), the time the name
    first appeared in the project and the time it was renamed or removed.

    Arguments:
    project_id -- id of the project whose ratings are scanned
    conn       -- connection used to execute the select statement
    skipped    -- writable unicode stream; rows that rename/remove a name
                  that was never seen starting are logged here, tab-separated

    Names that are still open after the last row get the module-level
    ``dump_dt`` as their end time.
    '''
    tab = database.schema.Rating.__table__
    stmt = select([
            tab.c.action,
            tab.c.timestamp,
            tab.c.article_name,
            tab.c.new_article_name]) \
        .where(tab.c.project_id == project_id) \
        .order_by(tab.c.timestamp)
    result = conn.execute(stmt)
    # name -> timestamp at which that name entered the project
    article_start = {}
    names_dates = []
    for row in result:
        action_utf8, timestamp, article_name_utf8, new_article_name_utf8 = row
        talk_name_utf8 = "Talk:%s" % article_name_utf8
        # The talk page follows the article to its *new* name.
        new_talk_name_utf8 = "Talk:%s" % new_article_name_utf8
        if action_utf8 in ("Renamed", "Removed"):
            renamed = action_utf8 == "Renamed"
            # Close the span of the old article name
            if article_name_utf8 not in article_start:
                # Never saw this name start: nothing to close
                _write_skipped_row(skipped, row)
                continue
            start = article_start.pop(article_name_utf8)
            names_dates.append( (article_name_utf8, start, timestamp) )
            if renamed:
                article_start[new_article_name_utf8] = timestamp
            # Also close the span of the talk page, using its own start time
            if talk_name_utf8 in article_start:
                talk_start = article_start.pop(talk_name_utf8)
                names_dates.append( (talk_name_utf8, talk_start, timestamp) )
            else:
                # Logged as skipped, so no span is emitted for it
                talk_data = [action_utf8, timestamp, talk_name_utf8, new_talk_name_utf8]
                _write_skipped_row(skipped, talk_data)
            if renamed:
                article_start[new_talk_name_utf8] = timestamp
        elif article_name_utf8 not in article_start:
            # First sighting of this name: article and talk page start together
            article_start[article_name_utf8] = timestamp
            article_start[talk_name_utf8] = timestamp
    # If articles are started but not renamed or removed, use the dump date
    for article_name_utf8, start in article_start.iteritems():
        names_dates.append( (article_name_utf8, start, dump_dt) )
    return names_dates

In [5]:
def get_id_dates(names_dates, conn, skipped):
    '''Get article_id for each article_name and timespan.

    Arguments:
    names_dates -- iterable of (article_name_utf8, from_ts, to_ts) tuples,
                   with timestamps in seconds since the epoch
    conn        -- connection used to execute the lookups
    skipped     -- writable unicode stream; entries with no matching id are
                   logged here, tab-separated

    Returns {id => [from_ts1, to_ts1, from_ts2, to_ts2, ...]}.
    '''
    tab = database.schema.article_name_id
    id_dates = {}
    for row in names_dates:
        article_name_utf8, from_ts, to_ts = row
        # Use midpoint to look up id in case ends are mismatched
        mid_ts = (from_ts + to_ts) / 2
        mid_dt = datetime.datetime.utcfromtimestamp(mid_ts)
        stmt = select([tab.c.article_id]) \
            .where(and_(
                tab.c.article_name == article_name_utf8,
                tab.c.from_ts < mid_dt,
                tab.c.to_ts > mid_dt))
        # first() fetches one row and closes the result, so no cursor is
        # left open between iterations; it returns None when nothing matches.
        match = conn.execute(stmt).first()
        if match is None:
            skipped.write(u"\t".join([str(x).decode('utf-8') for x in row]) + u"\n")
            continue
        id_dates.setdefault(match[0], []).extend([from_ts, to_ts])
    return id_dates

In [6]:
log = exp.get_logger()
log.info("Beginning")
# Acquire resources inside the try block so that a failure part-way through
# (e.g. the database connection) still closes whatever was already opened.
skipped_assess = None
skipped_timespan = None
conn = None
try:
    skipped_assess = codecs.open(exp.get_filename(skip_assess_file), "w", encoding="utf-16-le")
    skipped_timespan = codecs.open(exp.get_filename(skip_timespan_file), "w", encoding="utf-16-le")
    conn = database.engine.connect()
    #id_projects = {article_id: { project_id: [start, stop] } }
    id_projects = {}
    # Iterate through projects
    for project_id, project_name in project_names.iteritems():
        log.info(u"Project %d: %s" % (project_id, project_name))
        log.info("  Finding timespans")
        # Get all article names in the project and the timespans they were in the project
        # with that name
        names_dates = get_article_dates(project_id, conn, skipped_assess)
        log.info("  Finding ids")
        # Get article ids and the list of timespans they were in the project
        id_dates = get_id_dates(names_dates, conn, skipped_timespan)
        log.info("  Combining ids")
        # Iterate through id -> timespan entries
        # Combine multiple timespans into single
        for article_id, article_dates in id_dates.iteritems():
            start = min(article_dates)
            end = max(article_dates)
            # Add this project to the article's (possibly new) project map
            id_projects.setdefault(article_id, {})[project_id] = [start, end]
    log.info("Writing output")
    with open(exp.get_filename(out_file), "wb") as f:
        f.write(msgpack.packb(id_projects))
    log.info("Complete, cleaning up...")
except:
    # Deliberately catches everything (including KeyboardInterrupt) so the
    # failure is recorded in the experiment log; it is always re-raised.
    log.error(str(sys.exc_info()))
    raise
finally:
    for resource in (skipped_assess, skipped_timespan, conn):
        if resource is not None:
            resource.close()

No handlers could be found for logger "sqlalchemy.pool.QueuePool"


KeyboardInterrupt: 