In [None]:
import calendar
import codecs
import csv
import datetime
import dateutil.parser
import json
import logging
import os
import os.path
import sys
import time
import sqlalchemy
import urllib

In [None]:
import database
from database.schema import Rating
import logbook

In [None]:
# Input: UTF-16-LE TSV with one "<title>\t<unique name>" row per project.
canonical_file = 'data/projects-2016-10-12.utf-16-le.tsv'
# Input: one JSON object per line carrying "project_name" and "project_id".
project_file = 'data/projects-2016-10-14.json'
# Input: directory holding one "<url-quoted project name>.utf8.tsv" file per project.
project_dir = 'data/assessments_2017-03-20'
# Output: names (resolved through exp.get_filename) of the reports listing
# projects that were skipped and projects that were loaded completely.
skip_file = 'skipped'
complete_file = 'completed'

In [None]:
# Experiment handle for this notebook: it supplies the logger used by every
# later cell and, through get_filename(), the paths of the output reports.
# NOTE(review): `logbook` looks like a project-local module rather than the
# PyPI package of the same name -- confirm.
exp_name = "06_load_assessment_history"
exp = logbook.Experiment(exp_name)
log = exp.get_logger()

In [None]:
# Build the lookup {project name -> integer project id} from the JSON-lines
# project dump: every line of the file is one standalone JSON object.
project_ids = {}
with open(project_file, "rb") as f:
    for row in f:
        datum = json.loads(row, encoding="utf-8")
        name = datum["project_name"]
        project_ids[name] = int(datum["project_id"])

In [None]:
# Build the lookup {project title -> canonical unique name} from the
# UTF-16-LE TSV file. The raw text is kept in `projects_data` because the
# driver cell walks the same rows again.
project_names = {}
with codecs.open(canonical_file, encoding="utf-16-le") as f:
    projects_data = f.read()
for row in projects_data.split(u"\n"):
    if row.strip() != '':
        # Each non-blank row must hold exactly two tab-separated fields.
        title, unique = row.split(u"\t")
        project_names[title] = unique

In [None]:
def load_project(project_name, conn, skipped, completed):
    """Load one project's assessment history into the ratings table.

    The history is read from ``<project_dir>/<quoted name>.utf8.tsv``, where
    the quoted name is ``project_name`` with spaces replaced by underscores
    and then URL-quoted as UTF-8. The first line of the file is treated as a
    header and skipped. Every following line must hold at least eleven
    tab-separated columns, in this order: project name, timestamp, action,
    article name, old quality, new quality, old importance, new importance,
    new article name, old article URL, new article URL.

    Rows are inserted through ``conn`` in batches of 10000.

    Args:
        project_name: ``unicode`` project title used to locate the file.
        conn: connection object whose ``execute`` performs the bulk insert.
        skipped: writable text stream; receives "<name>\\t<reason>" when the
            project cannot be loaded.
        completed: writable text stream; receives ``project_name`` once the
            whole file has been inserted.

    Raises:
        AssertionError: if ``project_name`` or a canonical name looked up in
            ``project_names`` is not ``unicode``.

    Note:
        When a row cannot be resolved to a canonical name or a project id,
        the project is reported in ``skipped`` and loading stops. Batches
        that were already inserted stay in the database; the pending batch
        is discarded.
    """
    if not isinstance(project_name, unicode):
        log.error("Project_name not unicode: %s" % project_name)
        raise AssertionError
    log.info('Loading grade history for: %s' % project_name)
    quoted_name = urllib.quote(
        project_name.replace(" ", "_").encode('utf-8'), safe="")
    # Deliberately not called `project_file`: that name is the module-level
    # path of the JSON project dump.
    ratings_path = os.path.join(
        project_dir, quoted_name + ".utf8.tsv")
    if not os.path.exists(ratings_path):
        skipped.write(u"%s\t%s\n" % (project_name, u"File not found"))
        return
    to_insert = []
    with open(ratings_path, "rb") as f:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration and aborting the whole run.
        next(f, None)
        for i, row_text in enumerate(f):
            # After this line every field is already `unicode`. Fields must
            # NOT be decoded again: in Python 2, unicode.decode() first
            # encodes implicitly with ASCII and fails on non-ASCII text.
            row = row_text.decode('utf-8').rstrip("\n").split(u"\t")
            if i % 10000 == 0 and i != 0:
                log.info("  Inserting %d records" % len(to_insert))
                conn.execute(Rating.__table__.insert(), to_insert)
                to_insert = []
                # Brief pause between batches (presumably to throttle load on
                # the database -- TODO confirm).
                time.sleep(0.1)
            # Kept separate from the `project_name` argument so the name
            # written to `completed` is the project that was requested.
            row_project_name = row[0]
            try:
                unique_name = project_names[row_project_name]
            except KeyError:
                skipped.write(
                    u"%s\tNo canonical name\n" % row_project_name)
                return
            if not isinstance(unique_name, unicode):
                log.error("Unique name not unicode: %s" % unique_name)
                raise AssertionError
            try:
                project_id = project_ids[unique_name]
            except KeyError:
                skipped.write(u"%s\tNo id\n" % row_project_name)
                return
            if i == 0:
                log.info('  %s: %s' % (project_id, unique_name))
            to_insert.append({
                # The unique name and the two article names are stored as
                # UTF-8 byte strings; all other text fields stay unicode.
                "project_unique": unique_name.encode('utf-8'),
                "project_name": row_project_name.strip(),
                "project_id": project_id,
                "timestamp": int(row[1]),
                "action": row[2].strip(),
                "article_name": row[3].strip().encode('utf-8'),
                "old_quality": row[4].strip(),
                "new_quality": row[5].strip(),
                "old_importance": row[6].strip(),
                "new_importance": row[7].strip(),
                "new_article_name": row[8].strip().encode('utf-8'),
                "old_article_url": row[9].strip(),
                "new_article_url": row[10].strip()
            })
    # A header-only (or empty) file leaves nothing to insert; do not issue an
    # INSERT with an empty parameter list.
    if to_insert:
        log.info("  Inserting final batch of %d" % len(to_insert))
        conn.execute(Rating.__table__.insert(), to_insert)
    completed.write(u"%s\n" % project_name)

In [None]:
# Driver: load the assessment history of every project listed in the
# canonical project file, recording skipped and completed projects in two
# report files owned by the experiment.
conn = database.engine.connect()
try:
    log.info("Loading ratings for all projects")
    # The `with` blocks close each report file on every exit path, including
    # a failure to open the second file or an exception in load_project().
    with codecs.open(exp.get_filename(skip_file), "w",
                     encoding='utf-8') as skipped:
        with codecs.open(exp.get_filename(complete_file), "w",
                         encoding="utf-8") as completed:
            for row in projects_data.split(u"\n"):
                if row.strip() == '':
                    continue
                project_name, project_unique = row.split(u"\t")
                load_project(project_name, conn, skipped, completed)
            log.info("Successful completion. Cleaning up...")
except:
    # Catch-all on purpose: the failure is written to the experiment log and
    # then re-raised unchanged, so nothing is swallowed.
    log.error("Error: %s" % str(sys.exc_info()))
    raise
finally:
    # Closing the connection stays best-effort so that it cannot mask the
    # original error, but a failure is now logged instead of silently dropped.
    try:
        conn.close()
    except Exception:
        log.error("Error closing connection: %s" % str(sys.exc_info()))