In [None]:
import json
import time
import sys
import sqlalchemy
from sqlalchemy import func, select
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import and_

In [None]:
import database
from database.schema import article_name_id, Rating, revision_table

In [None]:
# --- Experiment configuration -------------------------------------------------
# Name handed to the experiment logger set up in the next cell.
exp_name = "10_find_efficiency"

# Input: one JSON object per line, each with "project_id" and "project_name".
project_file = "data/projects-2016-10-14-dedup.json"

# Article -> project map written by an earlier experiment step.
map_file = (
    "output/07_create_article_project_map/"
    "2017-03-27 14:10:49 77c76e1/articles_projects.m"
)

# Name of the CSV the efficiency figures are written to.
output_file = "efficiency.csv"

# Projects whose id is below this value are skipped (0 processes everything).
skip_to = 0

In [None]:
# Set up experiment-scoped logging. `exp` is reused further down to resolve the
# output file name (exp.get_filename) and `log` is the logger used by every
# later cell.
# NOTE(review): `logbook.Experiment` looks like a project-local helper rather
# than the PyPI "logbook" package -- confirm before changing this import.
import logbook
exp = logbook.Experiment(exp_name)
log = exp.get_logger()
log.info("Beginning...")

In [None]:
# Build the project_id -> project_name lookup from a JSON-lines file
# (one JSON object per line with "project_id" and "project_name" keys).
log.info("Loading project json")
project_names = {}
with open(project_file, "rb") as f:
    for row in f:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON and would abort the whole run; skip it instead.
        if not row.strip():
            continue
        datum = json.loads(row)
        project_names[datum["project_id"]] = datum["project_name"]
log.info("Done loading project json")

In [None]:
# Open the single database connection shared by every query in this cell; it is
# closed in the `finally` clause at the end of the cell.
log.info("Connecting to database")
conn = database.engine.connect()
# Reset again inside the main loop's `try`; nothing visible here appends to it.
data = []
# Quality-class labels grouped into the tiers used by the efficiency
# calculation below. Cumulative tiers ("ab", "abc") mean "rated at least B" and
# "rated at least C" respectively.
classes_a = {"FA-Class", "GA-Class", "A-Class"}
classes_b = {"Bplus-Class", "B-Class"}
classes_c = {"C-Class"}

# Labels treated as sitting below C; the empty string is included here too.
classes_start = {
    "",
    "Stub-Class", "Start-Class",
    "Draft-Class", "Merge-Class", "Needed-Class",
    "Unassessed", "Unassessed-Class",
    "Current-Class", "Future-Class",
}

classes_ab = classes_a.union(classes_b)
classes_abc = classes_ab.union(classes_c)

# Labels that belong to none of the tiers above.
classes_other = {
    "NotA-Class",
    "List-Class", "FL-Class", "AL-Class", "BL-Class", "CL-Class", "SL-Class",
    "FM-Class",
    "Category-Class", "Disambig-Class", "Portal-Class", "Template-Class",
    "Project-Class", "Book-Class", "File-Class", "Image-Class",
    "Redirect-Class", "NA-Class",
}

# Every label known to this notebook.
classes = classes_abc.union(classes_start, classes_other)
def _transition(old_rating, new_rating, group):
    """Return +1 if the rating moved into `group`, -1 if it left it, else 0."""
    was_in = old_rating in group
    now_in = new_rating in group
    if now_in and not was_in:
        return 1
    if was_in and not now_in:
        return -1
    return 0


def _count_revisions(conn, revisions, search_ids, until, previous):
    """Count revisions of the given pages up to and including `until`.

    conn       -- open database connection used to run the query
    revisions  -- the project's revisions table
    search_ids -- non-empty list of page ids (article and/or talk page)
    until      -- inclusive upper bound on the revision timestamp
    previous   -- the article's tracked {"rating", "timestamp"} entry, or None;
                  when given, only revisions strictly after its timestamp count
    """
    condition = and_(
        revisions.c.article_id.in_(search_ids),
        revisions.c.timestamp <= until)
    if previous is not None:
        condition = and_(
            condition, revisions.c.timestamp > previous["timestamp"])
    stmt = select([func.count(revisions.c.internal_id)]).where(condition)
    # scalar() gives the first column of the first row, or None without rows.
    return conn.execute(stmt).scalar() or 0


def _revisions_below(old_rating, count):
    """Return the (N_a, N_b, N_c) increments for `count` revisions.

    N_g counts the revisions that occurred while the article was rated below
    grade `g`; ratings already in the A tier (or in no tier) add nothing.
    """
    if old_rating in classes_b:
        return count, 0, 0
    if old_rating in classes_c:
        return count, count, 0
    if old_rating in classes_start:
        return count, count, count
    return 0, 0, 0


# Per project: replay its rating-log entries in timestamp order, count grade
# transitions (Delta_*) and the revisions made below each grade (N_*), and
# append one CSV row to the output file.
out = None
try:
    # Opened inside the `try` so the connection is still closed by `finally`
    # if the file cannot be created. The "wb" mode with str writes matches the
    # Python 2 idioms used in this cell (e.g. dict.iteritems).
    out = open(exp.get_filename(output_file), "wb")
    out.write(",".join(
        ['project_id', 'Delta_a', 'Delta_b', 'Delta_c', 'N_a', 'N_b', 'N_c']) + "\n")
    log.info("Calculating efficiency for projects starting with: %d" % skip_to)
    data = []
    table = Rating.__table__
    for project_id, project_name in project_names.iteritems():
        if project_id < skip_to:
            continue
        log.info(
            "  Calculating efficiency for project: %d:%s" %
            (project_id, project_name))
        # article name -> {"rating": ..., "timestamp": ...} of its last
        # recorded state within this project.
        ratings = {}
        # Messages about renames/removals of untracked articles; collected but
        # currently not written to the log.
        errors = []
        ignore_article_id = set()
        unknown_name_count = 0
        # The parameters below are defined in the paper
        Delta_a = Delta_b = Delta_c = 0
        N_a = N_b = N_c = 0
        log.info("  Opening table: %d_revisions" % project_id)
        revisions = revision_table(project_id).__table__
        log.info("  Querying for rating changes")
        stmt = select([
                table.c.internal_id,
                table.c.action,
                table.c.article_name,
                table.c.old_quality,
                table.c.new_quality,
                table.c.new_article_name,
                table.c.timestamp,
                table.c.article_id,
                table.c.talk_id]) \
            .where(and_(
                table.c.project_id == project_id,
                table.c.action.in_(
                    ["Assessed", "Reassessed", "Renamed", "Removed"]))) \
            .order_by(table.c.timestamp)
        result = conn.execute(stmt)
        # NOTE(review): rowcount on a SELECT is driver-dependent and may not be
        # the real number of rows -- it is only used for progress messages.
        proj_count = result.rowcount
        log.info("  Processing %d query results" % proj_count)
        for i, row in enumerate(result):
            if i > 0 and i % 10000 == 0:
                log.info("    %d of %d rows complete" % (i, proj_count))
                time.sleep(0.1)
            # Unpack by position in the select() list above, so every column
            # is referred to by name from here on.
            (internal_id, action, article_name, old_quality, new_quality,
             new_article_name, timestamp, article_id, talk_id) = row
            # Check whether we're ignoring this article in this project
            # NOTE(review): article_id may be None when only the talk page is
            # known; once None is added to the set every such row is skipped.
            if article_id in ignore_article_id:
                continue
            # Main page id and (if it exists) talk page id
            search_ids = [page_id for page_id in (article_id, talk_id)
                          if page_id is not None and page_id > 0]
            if not search_ids:
                # Can't identify the article
                unknown_name_count += 1
                continue
            # State recorded for this article so far (None if not yet seen)
            previous = ratings.get(article_name)
            if action in ("Assessed", "Reassessed"):
                # Could be rating or importance
                new_rating = new_quality
                if old_quality == '' and new_quality in ("GA-Class", "FA-Class"):
                    # Added to project after already complete, ignore
                    ignore_article_id.add(article_id)
                    continue
                if new_quality == old_quality:
                    # No grade transition, keep going until we find one
                    continue
                # Compare against the rating we tracked, not the logged one
                old_rating = previous["rating"] if previous is not None else ''
                a_transition = _transition(old_rating, new_rating, classes_a)
                b_transition = _transition(old_rating, new_rating, classes_ab)
                c_transition = _transition(old_rating, new_rating, classes_abc)
                if not (a_transition or b_transition or c_transition):
                    # Same tier: remember the new label but keep the timestamp
                    # of the last tier change.
                    if previous is not None:
                        previous["rating"] = new_rating
                    else:
                        ratings[article_name] = {
                            "rating": new_rating, "timestamp": timestamp}
                    continue
                # There has been a transition, record it and count the revisions
                Delta_a += a_transition
                Delta_b += b_transition
                Delta_c += c_transition
                count = _count_revisions(
                    conn, revisions, search_ids, timestamp, previous)
                inc_a, inc_b, inc_c = _revisions_below(old_rating, count)
                N_a += inc_a
                N_b += inc_b
                N_c += inc_c
                ratings[article_name] = {
                    "rating": new_rating, "timestamp": timestamp}
            elif action == "Renamed":
                # Rating the article carried under its old name ('' if the
                # article was never seen, as in the assessment branch).
                old_rating = previous["rating"] if previous is not None else ''
                # Count the number of revisions under the old name
                count = _count_revisions(
                    conn, revisions, search_ids, timestamp, previous)
                inc_a, inc_b, inc_c = _revisions_below(old_rating, count)
                N_a += inc_a
                N_b += inc_b
                N_c += inc_c
                # Drop the old name first so that a rename to the same name
                # does not delete the entry written just below.
                if previous is None:
                    errors.append("    Renamed unknown article: %d: %s" %
                        (internal_id, article_name))
                else:
                    del ratings[article_name]
                # Update timestamp and store the article under its new name
                ratings[new_article_name] = {
                    "rating": old_rating, "timestamp": timestamp}
            elif action == "Removed":
                if previous is None:
                    errors.append("    Removed unknown article: %d: %s" %
                        (internal_id, article_name))
                else:
                    del ratings[article_name]
        log.info("Processed results for project %d, %d entries unknown"
                 % (project_id, unknown_name_count))
        datum = [project_id, Delta_a, Delta_b, Delta_c, N_a, N_b, N_c]
        out.write(",".join([str(x) for x in datum]) + "\n")
        # Flush per project so finished rows survive a later failure
        out.flush()
except:
    # Log the exception type, then re-raise so the failure stays visible
    log.error("Error: %s" % sys.exc_info()[0])
    raise
finally:
    conn.close()
    log.info("Database session closed")
    if out is not None:
        out.close()
log.info("Done.")