In [None]:
import json
import time
import sys
import sqlalchemy
from sqlalchemy import func, select
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import and_
import numpy as np
import scipy.stats as spstats

In [None]:
import database
from database.schema import article_name_id, Rating, revision_table

In [None]:
# --- Run configuration -------------------------------------------------------
# Name handed to the experiment logger for this notebook.
exp_name = "24b_find_diversity"

# Input: one JSON object per line with "project_id" and "project_name" keys.
project_file = 'data/projects-2016-10-14-dedup.json'

# Output: CSV file name, resolved through the experiment object when written.
output_file = "diversity.csv"

# NOTE(review): the two settings below are not referenced by the cells visible
# in this notebook; end_timestamp looks like a Unix epoch cutoff -- confirm
# before relying on either.
skip_to = 0
end_timestamp = 1449084517

In [None]:
# Set up experiment logging for this run.
# NOTE(review): `logbook` is imported here rather than in the import cell at
# the top; it appears to be a project-local module exposing `Experiment` --
# confirm, and consider moving the import up with the others.
import logbook
# Experiment handle named after exp_name; later also used to resolve the
# output file path via exp.get_filename().
exp = logbook.Experiment(exp_name)
# Logger used by every subsequent cell.
log = exp.get_logger()
log.info("Beginning...")

In [None]:
# Build the project_id -> project_name lookup from the JSON-lines project file
# (one JSON object per line, each carrying "project_id" and "project_name").
log.info("Loading project json")
project_names = {}
with open(project_file, "rb") as project_fh:
    records = (json.loads(line) for line in project_fh)
    # If a project_id appears more than once, the last occurrence wins.
    project_names.update(
        (record["project_id"], record["project_name"]) for record in records)
log.info("Done loading project json")

In [None]:
# Compute an editor-diversity score for every project and write it as CSV.
#
# Steps:
#   1. For each project, query its revisions table for the number of
#      revisions made by each contributor.
#   2. For each contributor, compute the entropy (base 10) of their revision
#      counts across all the projects they edited.
#   3. For each project, average its editors' entropies, weighted by the
#      number of revisions each editor made in that project, and write one
#      "project_id,editor_entropy_wmean" row.
log.info("Connecting to database")
conn = database.engine.connect()
data = []  # not used by this cell; kept so the name stays defined as before
out = None
# contributor_id -> {project_id: revision count in that project}
editor_project_counts = {}
# every contributor_id seen in any project
contributors = set()
# project_id -> set of contributor_ids returned by that project's query
project_editors = {}
try:
    # Opened inside the try so that a failure here is logged and the database
    # connection is still closed. Binary mode with str writes: this notebook
    # is Python 2 code (see iteritems below).
    out = open(exp.get_filename(output_file), "wb")
    out.write("project_id,editor_entropy_wmean\n")

    log.info("Counting project revisions for each editor")
    for project_id, project_name in project_names.iteritems():
        log.info(
            "Counting editors for project project: %d:%s" %
            (project_id, project_name))
        project_editors[project_id] = set()
        log.info("  Opening table: %d_revisions" % project_id)
        revisions = revision_table(project_id).__table__
        log.info("  Querying")
        # One result row per contributor: (contributor_id, revision count).
        stmt = select([
                revisions.c.contributor_id,
                func.count(revisions.c.revision_id)]) \
                .group_by(revisions.c.contributor_id)
        result = conn.execute(stmt)
        for i, row in enumerate(result):
            if i > 0 and i % 10000 == 0:
                log.info("    %d rows complete" % (i))
                # Short pause after every 10000 rows, kept from the original.
                time.sleep(0.1)
            # Parse result
            contributor_id = row[0]
            count = row[1]
            contributors.add(contributor_id)
            project_editors[project_id].add(contributor_id)
            editor_project_counts.setdefault(
                contributor_id, {})[project_id] = count
        log.info("  Processed results for project %d"
                 % (project_id))

    log.info("Calculating entropy")
    # contributor_id -> entropy of that contributor's per-project counts.
    # Must be a dict: it is keyed by contributor id, not by position.
    contributor_entropy = {}
    for contributor_id, project_counts in editor_project_counts.iteritems():
        counts = np.array(list(project_counts.values()), dtype=float)
        # scipy normalises the counts to a probability distribution itself.
        contributor_entropy[contributor_id] = spstats.entropy(counts, base=10)

    for project_id, project_name in project_names.iteritems():
        log.info(
            "Calculating average entropy for project: %d:%s" %
            (project_id, project_name))
        weighted_entropy_total = 0.0
        weight_total = 0
        for contributor_id in project_editors[project_id]:
            # Weight each editor's entropy by their revision count in this
            # project, so numerator and denominator use the same weights.
            weight = editor_project_counts[contributor_id][project_id]
            weighted_entropy_total += (
                contributor_entropy[contributor_id] * weight)
            weight_total += weight
        if weight_total:
            editor_entropy_wmean = weighted_entropy_total / float(weight_total)
        else:
            # No revisions found for this project: the mean is undefined.
            # Emit NaN instead of aborting the whole run on a division by zero.
            log.info("  No revisions for project %d; writing nan" % project_id)
            editor_entropy_wmean = float("nan")
        out.write("%d,%s\n" % (project_id, repr(editor_entropy_wmean)))
        out.flush()
except:
    # Log and re-raise: nothing is swallowed here.
    log.error("Error: %s" % sys.exc_info()[0])
    raise
finally:
    # Close both resources even if closing the first one fails.
    try:
        if out is not None:
            out.close()
    finally:
        conn.close()
        log.info("Database session closed")
log.info("Done.")