In [None]:
import json
import time
import sys
import sqlalchemy
from sqlalchemy import func, select
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import and_

In [None]:
import database
from database.schema import article_name_id, Rating, revision_table

In [None]:
# --- Experiment configuration -------------------------------------------
# Name passed to logbook.Experiment when the experiment logger is created.
exp_name = "10c_find_imp_stage_eff"

# JSON-lines file read below; each line carries a project_id and a
# project_name.
project_file = "data/projects-2016-10-14-dedup.json"

# Article -> project map produced by an earlier experiment.
# NOTE(review): not referenced by any of the visible cells.
map_file = "output/07_create_article_project_map/2017-03-27 14:10:49 77c76e1/articles_projects.m"

# Name of the CSV written into the experiment's output location.
output_file = "efficiency.csv"

# Upper bound of the study window: rating events with a timestamp at or
# after this value are excluded, and every article's final rating stage is
# closed at this value.
# Presumably Unix epoch seconds (2015-12-02 19:28:37 UTC) -- TODO confirm.
end_timestamp = 1449084517

# Projects whose project_id is below this value are skipped (lets a run be
# resumed part-way through).
skip_to = 0

In [None]:
# Set up the experiment context and its logger.
# NOTE(review): `logbook.Experiment` looks like a project-specific helper
# rather than the PyPI `logbook` package -- confirm which module is on the
# path. `exp` is also used further down to resolve the output file name via
# `exp.get_filename(...)`.
import logbook
exp = logbook.Experiment(exp_name)
# Logger shared by every following cell.
log = exp.get_logger()
log.info("Beginning...")

In [None]:
# Build the project_id -> project_name lookup from a JSON-lines file
# (one JSON object per line, each with "project_id" and "project_name").
log.info("Loading project json")
project_names = {}
with open(project_file, "rb") as f:
    for row in f:
        # Blank or whitespace-only lines are not valid JSON documents;
        # skip them instead of letting json.loads abort the whole load.
        if not row.strip():
            continue
        datum = json.loads(row)
        project_names[datum["project_id"]] = datum["project_name"]
log.info("Done loading project json")

In [None]:
# Raw assessment labels, grouped by the quality tier they belong to.
classes_a = {"FA-Class", "GA-Class", "A-Class"}
classes_b = {"Bplus-Class", "B-Class"}
classes_c = {"C-Class"}
classes_start = {
    "",
    "Stub-Class",
    "Start-Class",
    "Draft-Class",
    "Merge-Class",
    "Needed-Class",
    "Unassessed",
    "Unassessed-Class",
    "Current-Class",
    "Future-Class",
}
# Labels that get no canonical rating below (lists, templates, redirects...).
classes_other = {
    "NotA-Class",
    "List-Class",
    "FL-Class",
    "AL-Class",
    "BL-Class",
    "CL-Class",
    "SL-Class",
    "FM-Class",
    "Category-Class",
    "Disambig-Class",
    "Portal-Class",
    "Template-Class",
    "Project-Class",
    "Book-Class",
    "File-Class",
    "Image-Class",
    "Redirect-Class",
    "NA-Class",
}

# Cumulative unions of the tiers above.
classes_ab = classes_a.union(classes_b)
classes_abc = classes_ab.union(classes_c)
classes = classes_abc.union(classes_start, classes_other)

# Map every raw label of a tier onto that tier's canonical rating.
# Labels in classes_other are deliberately left out of the mapping.
canonical_rating = {}
for canonical_name, raw_labels in (
        ("A-Class", classes_a),
        ("B-Class", classes_b),
        ("C-Class", classes_c),
        ("Start-Class", classes_start)):
    canonical_rating.update(dict.fromkeys(raw_labels, canonical_name))

In [None]:
# Rating stages shorter than this many seconds (two days) are treated as
# noise -- probably edit wars or bugs in the log -- and folded away.
_MIN_STAGE_SECONDS = 172800
# Canonical quality levels, lowest to highest.
_QUALITY_LADDER = ("Start-Class", "C-Class", "B-Class", "A-Class")
# Levels reported in the CSV, in column order (a, b, c).
_REPORTED_LEVELS = ("A-Class", "B-Class", "C-Class")
# Actions that close the current rating stage and open a new one.
_STAGE_ACTIONS = ("Removed", "Assessed", "Reassessed")
# Actions allowed on the first row used to start tracking an article.
_START_ACTIONS = ("Assessed", "Reassessed", "")


def _make_stage(start, end, rating):
    """Return a stage record: `rating` was held from `start` until `end`."""
    return {
        "start": start,
        "end": end,
        "duration": end - start,
        "rating": rating,
    }


def _smooth_stages(stages):
    """Collapse noisy and redundant stages of one article.

    Stages shorter than _MIN_STAGE_SECONDS are dropped and their time span
    is given to the preceding kept stage (a short leading stage is simply
    dropped). Consecutive stages with the same canonical rating are merged.
    The "end" field of the kept stage dicts is updated in place; "duration"
    is left as originally computed.

    Returns the list of kept stages, in order.
    """
    smoothed = []
    for stage in stages:
        if stage["duration"] < _MIN_STAGE_SECONDS:
            if smoothed:
                smoothed[-1]["end"] = stage["end"]
        elif smoothed and smoothed[-1]["rating"] == stage["rating"]:
            smoothed[-1]["end"] = stage["end"]
        else:
            smoothed.append(stage)
    return smoothed


def _promotion_levels(from_rating, to_rating):
    """Return the quality levels reached when going from one rating to another.

    For a promotion this is every ladder level above `from_rating` up to and
    including `to_rating` (so a jump Start -> A yields C, B and A). For a
    demotion, an unchanged rating, or a rating that is not on the ladder
    (e.g. "NA-Class") the result is empty.
    """
    try:
        low = _QUALITY_LADDER.index(from_rating)
        high = _QUALITY_LADDER.index(to_rating)
    except ValueError:
        return ()
    return _QUALITY_LADDER[low + 1:high + 1]


def _count_important_edits(conn, revisions, page_ids, start, end):
    """Count revisions flagged important on `page_ids` within [start, end]."""
    stmt = select([func.count(revisions.c.internal_id)]).where(
        and_(
            revisions.c.article_id.in_(page_ids),
            revisions.c.timestamp <= end,
            revisions.c.timestamp >= start,
            # SQLAlchemy column comparison: must stay `==`, not `is`.
            revisions.c.important == True))  # noqa: E712
    row = conn.execute(stmt).fetchone()
    # fetchone() returns None (it does not raise) when there is no row.
    return row[0] if row is not None else 0


def _finalize_article(conn, revisions, article, stats):
    """Close an article's rating history and credit its promotions to `stats`.

    `article` is the tracking dict built in _project_efficiency. The last
    known rating is extended to the module-level `end_timestamp`, the stages
    are smoothed, and for every promotion between consecutive stages the
    important edits made during the earlier stage (on the article and, when
    known, its talk page) are split evenly between the levels reached.
    """
    article["stages"].append(_make_stage(
        article["last_timestamp"], end_timestamp, article["last_rating"]))
    smoothed = _smooth_stages(article["stages"])

    page_ids = [article["article_id"]]
    talk_id = article["talk_id"]
    if talk_id is not None and talk_id > 0:
        page_ids.append(talk_id)

    for stage, next_stage in zip(smoothed, smoothed[1:]):
        levels = _promotion_levels(stage["rating"], next_stage["rating"])
        if not levels:
            # Demotion or NA-Class on either side: nothing to credit, so the
            # edit-count query is not needed.
            continue
        count = _count_important_edits(
            conn, revisions, page_ids, stage["start"], stage["end"])
        # If grades are jumped, split the edits evenly between them.
        share = float(count) / len(levels)
        for level in levels:
            stats[level]["transitions"] += 1
            stats[level]["edits"] += share
            # Credit the article that was just completed.
            stats[level]["articles"].add(article["article_id"])


def _project_efficiency(conn, project_id):
    """Compute promotion statistics for one project.

    Returns a dict keyed by the levels in _REPORTED_LEVELS; each value holds
    "transitions" (number of promotions into that level, the Delta of the
    paper), "edits" (edits attributed to those promotions, the N of the
    paper) and "articles" (set of article ids that contributed).
    """
    stats = dict(
        (level, {"transitions": 0, "edits": 0.0, "articles": set()})
        for level in _REPORTED_LEVELS)
    ignored_article_ids = set()

    log.info("  Opening table: %d_revisions" % project_id)
    revisions = revision_table(project_id).__table__
    log.info("  Querying for rating changes")
    # Get each rating event in the project
    # Article and talk page ids have to be set first using sql scripts
    table = Rating.__table__
    stmt = select([
            table.c.internal_id,
            table.c.action,
            table.c.article_name,
            table.c.old_quality,
            table.c.new_quality,
            table.c.new_article_name,
            table.c.timestamp,
            table.c.article_id,
            table.c.talk_id]) \
        .where(
            and_(
                table.c.project_id == project_id,
                table.c.timestamp < end_timestamp,
                # SQLAlchemy renders `!= None` as IS NOT NULL.
                table.c.article_id != None,  # noqa: E711
                table.c.important == True)) \
        .order_by(table.c.article_id, table.c.timestamp)  # noqa: E712
    result = conn.execute(stmt)
    # rowcount is not guaranteed to be meaningful for SELECT statements on
    # every DBAPI; it is only used for progress logging.
    proj_count = result.rowcount
    log.info("  Processing %d query results" % proj_count)

    # Article currently being tracked, or None when between articles.
    current = None
    for i, row in enumerate(result):
        if i > 0 and i % 10000 == 0:
            log.info("    %d of %d rows complete" % (i, proj_count))
            time.sleep(0.1)
        # Parse result (positions follow the select list above)
        action = row[1]
        timestamp = row[6]
        article_id = row[7]
        talk_id = row[8]
        # Check for ignored article
        if article_id in ignored_article_ids:
            continue
        # Canonicalize ratings; anything without a canonical form is NA.
        old_quality = canonical_rating.get(row[3], "NA-Class")
        new_quality = canonical_rating.get(row[4], "NA-Class")

        # Check if we've moved on to a new article
        if current is None or article_id != current["article_id"]:
            if current is not None:
                # Article complete, update efficiency stats. Clearing
                # `current` guarantees it is flushed exactly once even if
                # the rows that follow are skipped or ignored.
                _finalize_article(conn, revisions, current, stats)
                current = None
            if new_quality == "A-Class":
                # Added to project after already complete, ignore
                ignored_article_ids.add(article_id)
                continue
            # Skip forward to first assessment
            if action not in _START_ACTIONS:
                continue
            # Start new article
            current = {
                "article_id": article_id,
                "talk_id": talk_id,
                "stages": [],
                "last_rating": old_quality,
                "last_timestamp": 0,
            }

        if action in _STAGE_ACTIONS:
            current["stages"].append(_make_stage(
                current["last_timestamp"], timestamp, current["last_rating"]))
            # A removed article has no rating until it is assessed again.
            if action == "Removed":
                current["last_rating"] = "NA-Class"
            else:
                current["last_rating"] = new_quality
            current["last_timestamp"] = timestamp

    # The last article has no following row to trigger its flush.
    if current is not None:
        _finalize_article(conn, revisions, current, stats)
    return stats


# Write one CSV row of promotion statistics per project.
# Python 2 idioms (iteritems, str written to a binary-mode file) are kept to
# match the rest of the notebook.
log.info("Connecting to database")
conn = database.engine.connect()
# Never filled in this cell; kept in case other cells reference it.
# TODO confirm and remove.
data = []
try:
    # Opened inside the try so the connection is still closed if opening or
    # writing the file fails.
    with open(exp.get_filename(output_file), "wb") as out:
        out.write(",".join(
            ['project_id', 'Delta_a', 'Delta_b', 'Delta_c', 'N_a', 'N_b', 'N_c', 'Articles_a', 'Articles_b', 'Articles_c']) + "\n")
        log.info("Calculating efficiency for projects starting with: %d" % skip_to)
        for project_id, project_name in project_names.iteritems():
            if project_id < skip_to:
                continue
            log.info(
                "Calculating efficiency for project: %d:%s" %
                (project_id, project_name))
            stats = _project_efficiency(conn, project_id)
            log.info("  Processed results for project %d"
                     % (project_id))
            datum = (
                [project_id]
                + [stats[level]["transitions"] for level in _REPORTED_LEVELS]
                + [stats[level]["edits"] for level in _REPORTED_LEVELS]
                + [len(stats[level]["articles"]) for level in _REPORTED_LEVELS])
            out.write(",".join([repr(x) for x in datum]) + "\n")
            # Flush per project so partial results survive a crash.
            out.flush()
except:
    # Deliberately bare: log whatever stopped the run (including
    # KeyboardInterrupt) and re-raise it unchanged.
    log.error("Error: %s" % sys.exc_info()[0])
    raise
finally:
    conn.close()
    log.info("Database session closed")
log.info("Done.")