In [None]:
import json
import logging
import time
import os.path
import sys
from IPython.display import clear_output
from matplotlib import pyplot as plt
import msgpack
import sqlalchemy
from sqlalchemy import desc, func, and_
from sqlalchemy.orm import sessionmaker

In [None]:
import database
from database.schema import ArticleContributor, contributor_table, revision_table

In [None]:
# Name handed to logbook.Experiment for this run.
exp_name = "17_create_coeditor"

# Input: project list, one JSON object per line, each with a "project_id" key.
project_file = "data/projects-2016-10-14-dedup.json"

# Output file name templates; "%d" is filled in with the project id.
affiliation_file = "%d-article-editor.tsv"
coeditor_file = "%d-coeditor.tsv"
coeditor_mp = "%d-coeditor.mp"

# Number of ArticleContributor rows added per commit in compute_articles_contributors.
batch_size = 50000

In [None]:
import logbook

In [None]:
# Collect the "project_id" of every record in the project file.
# The file is read line by line; each line must be a complete JSON object.
project_ids = []
with open(project_file, "rb") as f:
    project_ids.extend(json.loads(line)["project_id"] for line in f)

In [None]:
# Load project data into articles_contributors
def compute_articles_contributors(project_id, log):
    """Fill ``articles_contributors`` with one row per (article, contributor) pair.

    The project's revision table is grouped by article and contributor, leaving
    out revisions whose contributor id is 0 or NULL. Each group becomes an
    ``ArticleContributor`` carrying the earliest and latest revision timestamp.
    Rows are added and committed in chunks of the module-level ``batch_size``.

    Args:
        project_id: Integer id passed to ``revision_table`` to pick the
            revision table to read.
        log: Logger-like object exposing ``info`` and ``error``.

    Raises:
        Whatever the table lookup, query or commit raises; the exception is
        logged through ``log.error`` and re-raised unchanged.
    """
    start = time.time()
    # Initialised before the ``try`` so that the summary line in ``finally`` can
    # always be formatted. Previously a failure before the first row was read
    # raised UnboundLocalError there and hid the original exception.
    row_count = 0
    Session = sessionmaker()
    Session.configure(bind=database.engine)
    session = Session()
    try:
        revisions = revision_table(project_id)
        to_add = []
        log.info("Computing articles_contributors for project %d" % project_id)
        # ``!= None`` is deliberate: SQLAlchemy columns overload the operator to
        # build the SQL condition, so ``is not None`` would not work here.
        for result in session.query(
                revisions.article_id,
                revisions.contributor_id,
                func.min(revisions.timestamp),
                func.max(revisions.timestamp)) \
                .group_by(revisions.article_id, revisions.contributor_id) \
                .filter(and_(
                    revisions.contributor_id != 0,
                    revisions.contributor_id != None)):  # noqa: E711
            to_add.append(ArticleContributor(
                article_id=result[0],
                contributor_id=result[1],
                first_edit=result[2],
                last_edit=result[3]))
            row_count += 1
            if len(to_add) >= batch_size:
                session.add_all(to_add)
                session.commit()
                to_add = []
                log.info("%d rows in %f seconds" % (row_count, time.time() - start))
                # Short pause between batches -- presumably to ease database
                # load; TODO confirm it is still needed.
                time.sleep(0.1)
        # Flush whatever is left over from the last, partially filled batch.
        session.add_all(to_add)
        session.commit()
        log.info("%d rows in %f seconds" % (row_count, time.time() - start))
    except:
        # Log and re-raise: nothing is swallowed here.
        log.error(sys.exc_info()[1])
        raise
    finally:
        session.close()
        # Emitted on success and on failure alike.
        log.info("Finished computing project %d articles_contributors %d rows in %f seconds" % (
            project_id, row_count, time.time() - start))

In [None]:
# Create indexes on articles_contributors
def index_articles_contributors(project_id, log):
    """Create the ``last`` and ``contributor`` indexes on ``articles_contributors``.

    Args:
        project_id: Only used in the closing log message.
        log: Logger-like object exposing ``info``.
    """
    began = time.time()
    log.info("Indexing articles_contributors")
    statements = (
        "CREATE INDEX last ON articles_contributors (last_edit);",
        "CREATE INDEX contributor ON articles_contributors (contributor_id);",
    )
    for statement in statements:
        database.engine.execute(statement)
    elapsed = time.time() - began
    log.info("Finished indexing project %d articles_contributors. %f seconds" % (
        project_id, elapsed))

In [None]:
# Clear articles_contributors
def clear_articles_contributors(log):
    """Drop both indexes on ``articles_contributors`` and then truncate the table.

    Args:
        log: Logger-like object exposing ``info``.
    """
    run = database.engine.execute

    log.info("Removing indexes from articles_contributors")
    for drop_index in ("DROP INDEX last ON articles_contributors;",
                       "DROP INDEX contributor ON articles_contributors;"):
        run(drop_index)

    log.info("Truncating articles_contributors")
    run("TRUNCATE articles_contributors")

    log.info("Done clearing articles_contributors")

In [None]:
# Clear <project_id>_contributor_contributor
def clear_coeditor(project_id, log):
    """Truncate the ``<project_id>_contributor_contributor`` table.

    Args:
        project_id: Integer id that prefixes the table name.
        log: Logger-like object exposing ``info``.
    """
    log.info("Truncating %d_contributor_contributor" % project_id)
    database.engine.execute("TRUNCATE %d_contributor_contributor" % project_id)
    log.info("Done clearing %d_contributor_contributor" % project_id)

In [None]:
# Write articles_contributors as a graph edge TSV
def write_articles_contributors_tsv(project_id, exp):
    """Dump ``articles_contributors`` as ``contributor<TAB>article`` lines.

    The output path comes from ``exp.get_filename`` applied to the
    ``affiliation_file`` template. Lines are encoded as UTF-16-LE and written
    without a header row.

    Args:
        project_id: Integer id used in the file name and the log messages.
        exp: Object providing ``get_logger()`` and ``get_filename(name)``.
    """
    began = time.time()
    log = exp.get_logger()
    log.info("Writing article-editor TSV for project %d" % project_id)
    make_session = sessionmaker()
    make_session.configure(bind=database.engine)
    session = make_session()
    row_count = 0
    try:
        out_path = exp.get_filename(affiliation_file % project_id)
        with open(out_path, "wb") as out:
            # yield_per lets sqlalchemy fetch the rows in internal batches;
            # it has to be combined with enable_eagerloads(False).
            records = session.query(ArticleContributor) \
                .yield_per(10000) \
                .enable_eagerloads(False)
            for record in records:
                line = u"%d\t%s\n" % (record.contributor_id, record.article_id)
                out.write(line.encode("utf-16-le"))
                row_count += 1
    finally:
        session.close()
        # Logged whether or not the export succeeded.
        log.info("Finished writing article-editor TSV for project %d. %d rows in %f seconds" % (
            project_id, row_count, time.time() - began))

In [None]:
# Write <project_id>_contributor_contributor as a graph edge TSV
def write_coeditor_tsv(project_id, exp):
    """Dump ``<project_id>_contributor_contributor`` as a ``source<TAB>target`` edge list.

    The output path comes from ``exp.get_filename`` applied to the
    ``coeditor_file`` template. A ``source_id``/``target_id`` header row is
    written first; everything is encoded as UTF-16-LE.

    Args:
        project_id: Integer id selecting the table and used in the file name.
        exp: Object providing ``get_logger()`` and ``get_filename(name)``.
    """
    began = time.time()
    log = exp.get_logger()
    log.info("Writing coeditor TSV for project %d" % project_id)
    make_session = sessionmaker()
    make_session.configure(bind=database.engine)
    session = make_session()
    row_count = 0
    try:
        out_path = exp.get_filename(coeditor_file % project_id)
        with open(out_path, "wb") as out:
            edge_table = contributor_table(project_id)
            # yield_per lets sqlalchemy fetch the rows in internal batches;
            # it has to be combined with enable_eagerloads(False).
            edges = session.query(edge_table) \
                .yield_per(10000) \
                .enable_eagerloads(False)
            out.write(u"source_id\ttarget_id\n".encode("utf-16-le"))
            for edge in edges:
                line = u"%d\t%d\n" % (edge.source_id, edge.target_id)
                out.write(line.encode("utf-16-le"))
                row_count += 1
    finally:
        session.close()
        # Logged whether or not the export succeeded.
        log.info("Finished writing coeditor TSV for project %d. %d rows in %f seconds" % (
            project_id, row_count, time.time() - began))

In [None]:
# Create an editor-editor network from articles_contributors
# First, get all contributors to use as source nodes.
# Then keep track of a list of revisions by each contributor
# and a list of all contributions by article, sorted by time.
# For each (contributor, article) go through all contributions for that article after
# the contributor's first edit, and add a directed link.
def compute_coeditor(project_id, log, batch_size=100000):
    """Build the directed coeditor network of a project from ``articles_contributors``.

    For every contributor (the source) and every article that contributor
    appears on, an edge ``source -> target`` is created for each other
    contributor whose last edit on that article is strictly later than the
    source's first edit on it. Duplicate edges of one source are collapsed.

    The result is written to two places:

    * a MessagePack file named from the ``coeditor_mp`` template, resolved via
      the module-level ``exp`` object, holding one ``(source, targets)`` record
      per source;
    * the table behind ``contributor_table(project_id)``, filled with
      ``source_id``/``target_id`` rows in batches.

    Args:
        project_id: Integer id selecting the target table and the file name.
        log: Logger-like object exposing ``info`` and ``error``.
        batch_size: Maximum number of edges per bulk insert. Edges are only
            flushed inside the main loop once more than this many are buffered;
            the rest is flushed at the end.

    Raises:
        Whatever the queries, file operations or inserts raise; the exception
        is logged through ``log.error`` and re-raised unchanged.
    """
    log.info("Computing coeditor network for project %d" % project_id)
    start = time.time()
    Session = sessionmaker()
    Session.configure(bind=database.engine)
    session = Session()
    # Acquired inside the ``try`` so that a failure while connecting or while
    # opening the output file still releases what was already opened.
    conn = None
    mp = None
    edges_complete = 0
    edges_to_insert = []
    try:
        conn = database.engine.connect()
        mp = open(exp.get_filename(coeditor_mp % project_id), "wb")
        # Loop-invariant: the insert target is the same for every source.
        coeditors = contributor_table(project_id).__table__
        # Map contributors to their (contributor, article, first, last) rows
        ac_by_contributor = {}
        # Map articles to their rows, sorted by last edit (newest first)
        ac_sorted_last_by_article = {}
        # Get all contributors in this project
        log.info("  Querying sources from articles_contributors")
        sources = session.query(ArticleContributor.contributor_id).distinct().all()
        # Populate maps
        log.info("  Querying data from articles_contributors")
        ac_all = session.query(
                ArticleContributor.contributor_id,
                ArticleContributor.article_id,
                ArticleContributor.first_edit,
                ArticleContributor.last_edit) \
            .order_by(desc(ArticleContributor.last_edit)) \
            .all()
        log.info("  Creating article/contributor lookup tables")
        for row in ac_all:
            # Appending in query order keeps every per-article list sorted by
            # descending last edit, which the early ``break`` below relies on.
            ac_by_contributor.setdefault(int(row[0]), []).append(row)
            ac_sorted_last_by_article.setdefault(row[1], []).append(row)
        log.info("  Creating edges for each source")
        for i, source in enumerate(sources):
            source_id = int(source[0])
            source_edges = set()
            for source_ac in ac_by_contributor[source_id]:
                article_id = source_ac[1]
                source_first_edit = source_ac[2]
                for target_ac in ac_sorted_last_by_article[article_id]:
                    target_last_edit = target_ac[3]
                    if source_first_edit >= target_last_edit:
                        # The targets are sorted in descending order of last edit.
                        # Any later rows will be earlier so we can go to the next article
                        break
                    target_id = int(target_ac[0])
                    if target_id != source_id:
                        source_edges.add((source_id, target_id))
            # Write to MessagePack file.
            # NOTE(review): the record stores the raw query row ``source``, not
            # ``source_id``. Kept as is because readers of the file may rely on
            # that shape -- confirm whether ``source_id`` was intended.
            target_list = [edge[1] for edge in source_edges]
            mp.write(msgpack.packb((source, target_list)))
            # Buffer in place; rebuilding the list for every source made the
            # buffering cost grow with sources * buffered edges.
            edges_to_insert.extend(source_edges)
            total_edges = len(edges_to_insert)
            while len(edges_to_insert) > batch_size:
                log.info("  Inserting edges (%d), %d sources complete" % (total_edges, i))
                edges_complete += _insert_coeditor_edges(
                    conn, coeditors, edges_to_insert[:batch_size])
                del edges_to_insert[:batch_size]
        # Insert remaining edges
        while len(edges_to_insert) > 0:
            log.info("  Inserting remaining edges")
            edges_complete += _insert_coeditor_edges(
                conn, coeditors, edges_to_insert[:batch_size])
            del edges_to_insert[:batch_size]
        log.info("Finished computing coeditor network for project %d. %d in %f seconds" % (
            project_id, edges_complete, time.time() - start))
    except:
        # Log and re-raise: nothing is swallowed here.
        log.error(sys.exc_info())
        raise
    finally:
        session.close()
        if conn is not None:
            conn.close()
        if mp is not None:
            mp.close()


def _insert_coeditor_edges(conn, table, edges):
    """Bulk-insert ``(source_id, target_id)`` pairs into ``table``; return how many were sent."""
    rows = [{"source_id": edge[0], "target_id": edge[1]} for edge in edges]
    conn.execute(table.insert(), rows)
    return len(rows)

In [None]:
# Create an index on <project_id>_contributor_contributor
def index_coeditor(project_id, log):
    """Create the ``cc<project_id>_edge`` index on ``<project_id>_contributor_contributor``.

    The index covers ``(source_id, target_id)``.

    Args:
        project_id: Integer id used in both the index and the table name.
        log: Logger-like object exposing ``info``.
    """
    began = time.time()
    log.info("Indexing coeditor table")
    statement = "CREATE INDEX cc%d_edge ON %d_contributor_contributor (source_id, target_id);" % (
        project_id, project_id)
    database.engine.execute(statement)
    elapsed = time.time() - began
    log.info("Done indexing coeditor table for project %d. %f seconds" % (
        project_id, elapsed))

In [None]:
# Projects whose id is below this value are skipped by the driver loop;
# raise it to resume a run part-way through the project list.
skip_to = 1
# Project id for the commented-out manual recovery calls in the driver cell.
cleanup = None

# Experiment handle and the logger derived from it.
exp = logbook.Experiment(exp_name)
log = exp.get_logger()

In [None]:
try:
    # Manual recovery steps, left disabled: uncomment the ones needed to
    # finish or clean up a project (id in ``cleanup``) before the loop runs.
    #index_articles_contributors(cleanup, log)
    #write_articles_contributors_tsv(cleanup, log)
    #compute_coeditor(cleanup, log)
    #clear_articles_contributors(log)
    for project_id in project_ids:
        if project_id >= skip_to:
            # Fill, index, consume and finally empty the shared
            # articles_contributors table, one project at a time.
            compute_articles_contributors(project_id, log)
            index_articles_contributors(project_id, log)
            compute_coeditor(project_id, log)
            clear_articles_contributors(log)
except:
    # Record the failure in the experiment log, then let it propagate.
    log.error("error: %s" % str(sys.exc_info()))
    raise