Commit
added a tool that can help us track down bad data while we go through this transition period. generates a report by looking at the database and the website, and proposing what needs to be done
Luke Skibinski committed Nov 16, 2016
1 parent 5186884 commit 8ab8d5d
Showing 20 changed files with 7,640 additions and 54 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@ ingestion-lax.log.json
aws-perms.sh
schema/
article-xml
+.scrapy-cache
File renamed without changes.
19 changes: 0 additions & 19 deletions load-elife-json.sh

This file was deleted.

2 changes: 2 additions & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ coverage==4.1
Django==1.9.11
django-annoying==0.10.3
django-autoslug==1.9.3
+django-db-logger==0.1.6
django-filter==0.15.3
django-markdown2==0.3.0
django-rest-swagger==2.1.0
@@ -24,6 +25,7 @@ python-slugify==1.2.1
pytz==2016.7
PyYAML==3.12
requests==2.11.1
+Scrapy==1.2.1
unittest-xml-reporting==2.1.0
# sql-explorer optional deps
tinys3==0.1.12
74 changes: 42 additions & 32 deletions src/core/settings.py
@@ -64,11 +64,12 @@ def cfg(path, default=0xDEADBEEF):
    'django.contrib.messages',
    'django.contrib.staticfiles',

-    'rest_framework',
-    'rest_framework_swagger',
-    'django_markdown2',
+    'django_markdown2', # landing page is rendered markdown
+    'explorer', # sql creation
+    #'django_db_logger', # logs certain entries to the database

-    'explorer',
+    'rest_framework',
+    'rest_framework_swagger', # gui for api

    'publisher',
)
@@ -83,7 +84,7 @@ def cfg(path, default=0xDEADBEEF):
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',

-    'core.middleware.KongAuthentication',
+    'core.middleware.KongAuthentication', # sets a header if it looks like an authenticated request
]

ROOT_URLCONF = 'core.urls'
@@ -269,53 +270,62 @@ def writable(path):
    },

    'handlers': {
-        'file': {
+        # entries go to standard lax.log file
+        'lax.log': {
            'level': 'DEBUG',
            'class': 'logging.FileHandler',
            'filename': LOG_FILE,
            'formatter': 'json',
        },
-        'ingestion': {
-            'level': 'DEBUG',
-            'class': 'logging.FileHandler',
-            'filename': INGESTION_LOG_FILE,
-            'formatter': 'json',
-        },
-        'debug-console': {
-            'level': 'DEBUG',
-            'class': 'logging.StreamHandler',
-            'formatter': 'brief',
-        },
+
+        # entries go to stderr
+        'stderr': {
+            'level': 'DEBUG',
+            'class': 'logging.StreamHandler',
+            'formatter': 'brief',
+        },
+
+        # entries go to the lax-ingestion.log file
+        'ingestion.log': {
+            'level': 'DEBUG',
+            'class': 'logging.FileHandler',
+            'filename': INGESTION_LOG_FILE,
+            'formatter': 'json',
+        },
+
+        # entries go to the database
+        #'database': {
+        #    'level': 'DEBUG',
+        #    'class': 'django_db_logger.db_log_handler.DatabaseLogHandler',
+        #    'formatter': 'json',
+        #},
    },

    'loggers': {
        '': {
-            'handlers': ['debug-console', 'file'],
+            'handlers': ['stderr', 'lax.log'],
            'level': 'INFO',
            'propagate': True,
        },
-        'publisher.eif_ingestor': {
-            'handlers': ['ingestion'],
-        },
-        'publisher.ejp_ingestor': {
-            'handlers': ['ingestion'],
-        },
-        'publisher.ajson_ingestor': {
-            'handlers': ['ingestion'],
-            #'propagate': False, # prevent propagation to root handler and its debug-console handler
-        },
-        'publisher.management.commands.import': {
-            'level': 'INFO',
-            'handlers': ['debug-console'],
-        },
-        'publisher.management.commands.ingest': {
-            'level': 'INFO',
-            'handlers': ['ingestion', 'debug-console'],
-        },
        'django.request': {
-            'handlers': ['file'],
+            'handlers': ['lax.log'],
            'level': 'DEBUG',
            'propagate': True,
        },
    },
}

+x = [
+    'publisher.eif_ingestor',
+    'publisher.ejp_ingestor',
+    'publisher.ajson_ingestor',
+    'publisher.management.commands.import',
+    'publisher.management.commands.ingest',
+]
+logger = {
+    'level': 'INFO',
+    #'handlers': ['database', 'ingestion.log', 'lax.log', 'stderr'],
+    'handlers': ['ingestion.log', 'lax.log', 'stderr'],
+    'propagate': False, # don't propagate up to root logger
+}
+LOGGING['loggers'].update(dict(zip(x, [logger] * len(x))))
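The final `LOGGING['loggers'].update(...)` line points every logger named in `x` at the same configuration. A minimal sketch of the `dict(zip(...))` idiom, using hypothetical logger names (note that every key shares the single `logger` dict object, so mutating it later affects all entries):

names = ['app.foo', 'app.bar'] # hypothetical logger names
shared = {'level': 'INFO', 'handlers': ['stderr']}
assert dict(zip(names, [shared] * len(names))) == {
    'app.foo': {'level': 'INFO', 'handlers': ['stderr']},
    'app.bar': {'level': 'INFO', 'handlers': ['stderr']},
}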
159 changes: 159 additions & 0 deletions src/publisher/management/commands/check_article.py
@@ -0,0 +1,159 @@
from functools import wraps
from django.core.management.base import BaseCommand
from publisher.utils import ensure, json_dumps
from publisher import models
import os
from os.path import join
#from datetime import datetime
import requests
from scrapy.selector import Selector
import re
import logging

logging.getLogger("requests").setLevel(logging.WARNING)
LOG = logging.getLogger(__name__)

OUTPUT_DIR = '.scrapy-cache'

def fname(msid):
    msid = str(int(msid)) # normalize value, strip any leading zeroes
    return join(OUTPUT_DIR, '%s-article-info.html' % msid)
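# e.g. fname('017267') => '.scrapy-cache/17267-article-info.html'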

def slurp(msid):
    "download the article page for a given elife doi, write content to disk"
    if os.path.exists(fname(msid)):
        return

    doi = "10.7554/eLife.%s" % str(msid).zfill(5)
    # ll: https://elifesciences.org/lookup/doi/10.7554/eLife.17267
    url = "https://elifesciences.org/lookup/doi/" + doi
    LOG.debug(url)
    resp = requests.get(url, allow_redirects=False)
    ensure(resp.status_code != 404, "404 fetching article: %s" % resp.status_code)
    art_info_url = resp.headers['Location'] + "/article-info"
    resp2 = requests.get(art_info_url)

    with open(fname(msid), 'wb') as handle:
        handle.write(resp2.content)


def complement(pred):
    @wraps(pred)
    def wrapper(*args, **kwargs):
        return not pred(*args, **kwargs)
    return wrapper

def splitfilter(func, data):
    return filter(func, data), filter(complement(func), data)
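# e.g. splitfilter(lambda x: x % 2, [1, 2, 3]) => ([1, 3], [2])
# (filter returns lists under Python 2, which this codebase targets)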


class Command(BaseCommand):
    help = '''Generates a report for the given article msids by looking at the lax database and at the journal website, and proposes what needs to be done. A tool for tracking down bad data during the transition period.'''

    def add_arguments(self, parser):
        parser.add_argument('--msid', dest='msid-list', type=int, nargs='+', required=True)

    def report(self, msid):
        context = {
            'msid': msid,
            'in-database': False,
            'unpublished-versions': [],
            'published-versions': [],

            'on-website': False,
            'web-published-versions': [],

            'state': 'unknown'
        }

        try:
            art = models.Article.objects.get(manuscript_id=msid)
            context['in-database'] = True

            avlist = art.articleversion_set.all()
            pub, unpub = splitfilter(lambda av: av.datetime_published, avlist)
            context['published-versions'] = map(lambda av: av.version, pub)
            context['unpublished-versions'] = map(lambda av: av.version, unpub)

        except models.Article.DoesNotExist:
            pass

        try:
            if not os.path.exists(fname(msid)):
                slurp(msid)
            context['on-website'] = True
        except AssertionError:
            context['on-website'] = False

        if context['on-website']:
            # scrape the website results, look for version history
            with open(fname(msid), 'r') as handle:
                contents = handle.read()
            obj = Selector(text=contents)
            root = obj.css("#panels-ajax-tab-container-elife-research-article-tabs ul.issue-toc-list li")
            values = root.css("::text").extract()
            cregex = re.compile(r"Version (?P<version>\d?) \((?P<datestr>.*)\)")
            matches = map(lambda v: cregex.search(v).groupdict(), values)

            def fn(match):
                #dateobj = datetime.strptime(match['datestr'], "%B %d, %Y")
                return int(match['version'])
                # return {
                #     'version': match['version'],
                #     'pub-date': dateobj.strftime("%Y-%m-%d"),
                #}
            versions = map(fn, matches)
            if not versions:
                # article exists BUT it has no version history yet, so assume a v1
                versions = [1]
            context['web-published-versions'] = versions

        LOG.debug("finished %s report" % msid, extra=context)

        c = context

        if not c['on-website']:
            # not published, nothing to compare against yet
            return context

        if not c['in-database']:
            # article is on the website but not in the database
            context['state'] = 'article ingest missing in lax'
            return context

        # article is on the website and in the database;
        # compare versions

        lax_pv, web_pv = c['published-versions'], c['web-published-versions']

        if len(lax_pv) > len(web_pv):
            # there are more versions in lax than on the website!
            missing_versions = set(lax_pv) - set(web_pv)
            context['state'] = 'website is missing published versions (%s)' % ', '.join(map(str, missing_versions))
            return context

        if len(lax_pv) < len(web_pv):
            missing_versions = set(web_pv) - set(lax_pv)
            context['state'] = 'lax is missing published versions (%s)' % ', '.join(map(str, missing_versions))
            return context
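        # e.g. lax_pv = [1], web_pv = [1, 2] => 'lax is missing published versions (2)'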

        # both lax and the website report the same number of published versions;
        # ensure the version lists are identical
        if sorted(lax_pv) != sorted(web_pv):
            context['state'] = 'lax has versions (%s) and website has versions (%s)' % (lax_pv, web_pv)
            return context

        context['state'] = 'no problems detected'

        return context

    def handle(self, *args, **options):
        try:
            msid_list = options['msid-list']
            results = dict(zip(msid_list, map(self.report, msid_list)))
            self.stdout.write(json_dumps(results, indent=4))
            self.stdout.flush()
            exit(0)
        except Exception as err:
            LOG.exception("unhandled exception generating article report: %s", err)
            exit(1)
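A sketch of how the new command might be invoked, assuming the project's usual `src/manage.py` entry point (the msids are examples only):

python src/manage.py check_article --msid 17267 24125

The command writes a JSON report to stdout, keyed by msid, with each article's `in-database`, `on-website`, `published-versions`, `web-published-versions` and `state` fields filled in as above, and exits non-zero if report generation fails.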
12 changes: 11 additions & 1 deletion src/publisher/tests/base.py
@@ -1,6 +1,8 @@
+from StringIO import StringIO
import os
from django.test import TestCase
from publisher import models
+from django.core.management import call_command

class BaseCase(TestCase):
    this_dir = os.path.dirname(os.path.realpath(__file__))
@@ -21,4 +23,12 @@ def unpublish(self, msid, version=None):
        av = models.ArticleVersion.objects.get(article__manuscript_id=msid, version=version)
        av.datetime_published = None
        av.save()
-        # return av
+
+    def call_command(self, *args, **kwargs):
+        stdout = StringIO()
+        try:
+            kwargs['stdout'] = stdout
+            call_command(*args, **kwargs)
+        except SystemExit as err:
+            return err.code, stdout
+        self.fail("ingest script should always throw a systemexit()")
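A minimal sketch of how a test might drive the new command through this helper (a hypothetical test method; the msid is an example only):

    def test_check_article_report(self):
        retcode, stdout = self.call_command('check_article', '--msid', '17267')
        self.assertEqual(retcode, 0)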
