In [None]:
import json
import requests
import time
from sqlalchemy import func, select, sql
import database
from database.schema import revision_table
import logbook

In [None]:
# Notebook configuration: everything a reader might tune lives here.
# Input file of projects; read line by line, one JSON object per line with
# "project_id" and "project_name" keys.
project_file = "data/projects-2016-10-14-dedup.json"
# ORES scoring endpoint; %s is filled with a "|"-joined list of revision ids.
quality_url = "https://ores.wikimedia.org/scores/enwiki/?models=wp10&revids=%s"
# Experiment name handed to logbook.Experiment.
exp_name = "11b_find_bga_perf"
# Per-project output file name; %04d is filled with the project id.
project_out_file = "%04d-ga.csv"
# Output file for the per-project mean GA probability.
stats_out_file = "project_bga.csv"
# Maximum number of revision ids sent in a single ORES request.
revisions_per_request = 100
# Pause between consecutive ORES requests, in seconds (passed to time.sleep).
request_delay = 0.1

In [None]:
# Quality labels accepted as the *previous* rating of an article that moves to
# B-Class; transitions whose old_quality is not in classes_below are ignored.
classes_c = {"C-Class"}
classes_start = {
    "",
    "Stub-Class",
    "Start-Class",
    "Draft-Class",
    "Merge-Class",
    "Needed-Class",
    "Unassessed",
    "Unassessed-Class",
    "Current-Class",
    "Future-Class",
}
# Union of both groups; this is the set the transition filter tests against.
classes_below = classes_start.union(classes_c)

In [None]:
# Create the experiment handle for this run. Later cells use it to resolve
# output file paths (exp.get_filename) and log through the logger it provides.
exp = logbook.Experiment(exp_name)
log = exp.get_logger()

In [None]:
# Build the project_id -> project_name lookup from the JSON-lines project file
# (one JSON object per line).
log.info("Loading project info")
project_names = {}
with open(project_file, "rb") as handle:
    records = (json.loads(line) for line in handle)
    project_names.update(
        (record["project_id"], record["project_name"]) for record in records)
log.info("Project info loaded")

In [None]:
# For every project: find articles promoted to B-Class from a lower class, look
# up the last revision at or before each promotion, ask ORES for that
# revision's "GA" probability, then write one CSV of per-revision values per
# project plus one summary row (project id, mean GA probability) per project.
ratings = database.schema.Rating.__table__
conn = database.engine.connect()
try:
    with open(exp.get_filename(stats_out_file), "wb") as stats_out:
        # NOTE(review): this header names one column but every data row written
        # below has two (project_id, ga_mean). Left unchanged because downstream
        # readers may depend on the current layout -- confirm and fix together.
        stats_out.write("ga_mean\n")
        for project_id, project_name in project_names.iteritems():
            log.info("Starting project %d: %s" % (project_id, project_name))
            revisions = revision_table(project_id).__table__
            project_revisions = []
            project_ga = []
            # Get all transitions to B
            log.info("  Querying for B transitions")
            stmt = select([
                ratings.c.article_id,
                ratings.c.timestamp,
                ratings.c.old_quality]) \
                .where(sql.and_(
                    ratings.c.project_id == project_id,
                    ratings.c.article_id != None,  # rendered as IS NOT NULL
                    ratings.c.new_quality == "B-Class"))
            results = conn.execute(stmt)
            log.info("  Processing %d query results" % results.rowcount)
            for row in results:
                article_id = int(row[0])
                timestamp = int(row[1])
                old_quality = row[2]
                # Only promotions coming from a class below B are of interest.
                if old_quality not in classes_below:
                    continue
                # Get last revision before timestamp. Ordering and taking the
                # first row is well defined on every backend, unlike selecting
                # a bare column next to MAX() without a GROUP BY.
                rev_stmt = select([revisions.c.revision_id]) \
                    .where(sql.and_(
                        revisions.c.article_id == article_id,
                        revisions.c.timestamp <= timestamp)) \
                    .order_by(revisions.c.timestamp.desc()) \
                    .limit(1)
                rev_row = conn.execute(rev_stmt).fetchone()
                # No revision at or before the rating timestamp: nothing to score.
                if rev_row is None or rev_row[0] is None:
                    continue
                project_revisions.append(str(rev_row[0]))
            # Get quality evaluations for each revision
            log.info("  Fetching revision quality estimates")
            with open(exp.get_filename(project_out_file % project_id), "wb") as p_out:
                sofar = 0
                totes = len(project_revisions)
                # Consume the pending revisions from the end of the list, at
                # most revisions_per_request ids per HTTP request.
                while len(project_revisions) > 0:
                    log.info("    Starting %d of %d" % (sofar+1,totes))
                    chunk = project_revisions[-revisions_per_request:]
                    project_revisions = project_revisions[0:-revisions_per_request]
                    sofar += len(chunk)
                    url = quality_url % "|".join(chunk)
                    result = json.loads(requests.get(url).content)
                    for revid, data in result.iteritems():
                        try:
                            ga = data["wp10"]["probability"]["GA"]
                        except KeyError:
                            # No usable score for this revision: really skip it
                            # instead of recording an unbound or stale `ga`.
                            log.warning("    Skipping revision %s" % revid)
                            continue
                        project_ga.append(ga)
                        p_out.write("%s,%s\n" % (revid, repr(ga)))
                    # Throttle between consecutive requests.
                    time.sleep(request_delay)
            # Summarize
            log.info("  Summarizing project results")
            # Projects without any scored revision produce no summary row.
            if len(project_ga) > 0:
                ga_mean = sum(project_ga) / float(len(project_ga))
                stats_out.write("%d,%s\n" % (project_id, repr(ga_mean)))
    log.info("Completed successfully")
finally:
    # Always release the database connection, even if a project fails.
    conn.close()

In [None]:
# Leftover inspection cell: displays the last parsed ORES response left behind
# by the processing loop. `result` is only assigned inside that loop, so this
# raises NameError if no request was ever made -- consider removing the cell.
result