In [None]:
import calendar
import datetime
import json
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import scipy.stats as spstats
from sqlalchemy import distinct, func, select
import database
from database.schema import Rating, revision_table
import logbook

In [None]:
# Input: one JSON object per line, each carrying a "project_id" key.
project_file = "data/projects-2016-10-14-dedup.json"
# Output CSV name; resolved to a full path through the experiment object later.
out_file = "article_talk.csv"

# Experiment bookkeeping: the experiment object supplies the logger and
# the output file locations used by the rest of the notebook.
exp_name = "24d_find_article_talk"
exp = logbook.Experiment(exp_name)
log = exp.get_logger()

In [None]:
# Collect the project ids listed in the JSON-lines project file
# (one JSON object per line, each with a "project_id" key).
project_ids = []
with open(project_file, "rb") as f:
    for row in f:
        # Skip blank / whitespace-only lines (e.g. a trailing newline at the
        # end of the file); json.loads would raise on an empty document.
        if not row.strip():
            continue
        data = json.loads(row)
        project_ids.append(data["project_id"])

In [None]:
def _count_namespace_rows(conn, tab, namespace):
    """Count the rows of ``tab`` whose ``article_namespace`` equals ``namespace``.

    Args:
        conn: open database connection used to execute the query.
        tab: table object exposing an ``article_namespace`` column.
        namespace: namespace value to match.

    Returns:
        The count reported by the database for the matching rows.
    """
    stmt = select([func.count(tab.c.article_namespace)]).where(
        tab.c.article_namespace == namespace)
    return conn.execute(stmt).fetchone()[0]


# project_id -> number of revision rows in namespace 0 (articles) / 1 (talk).
article_count = {}
talk_count = {}

conn = database.engine.connect()
try:
    for project_id in project_ids:
        log.info("Starting project %d" % project_id)
        tab = revision_table(project_id).__table__

        # Article count; a project without article rows is skipped entirely,
        # so its talk rows are never queried.
        count = _count_namespace_rows(conn, tab, 0)
        if count == 0:
            continue
        article_count[project_id] = count

        # Talk count; a zero count is left out of the mapping rather than
        # being stored as 0.
        count = _count_namespace_rows(conn, tab, 1)
        if count == 0:
            continue
        talk_count[project_id] = count

    # Logged only on success: previously this lived in ``finally`` and was
    # emitted even when a query had raised and no output would be written.
    log.info("Finished querying database, writing output")
finally:
    # Always release the connection, whether or not the queries succeeded.
    conn.close()

In [None]:
# Assemble one row per project. Every column is a mapping keyed by project id,
# so projects missing from article_count / talk_count end up with NaN there.
id_column = {pid: pid for pid in project_ids}
columns = {
    'project_id': id_column,
    'article_count': article_count,
    'talk_count': talk_count,
}
df = pd.DataFrame(columns).set_index('project_id')

# Write the table to the experiment's output location.
csv_path = exp.get_filename(out_file)
df.to_csv(csv_path)
log.info("Script complete")