added a tool that can help us track down bad data while we go through…

… this transition period. Generates a report by looking at the database and the website, and proposing what needs to be done.
elifesciences · Nov 16, 2016 · 8ab8d5d · 8ab8d5d
1 parent 5186884
commit 8ab8d5d
Show file tree

Hide file tree

Showing 20 changed files with 7,640 additions and 54 deletions.
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,4 @@ ingestion-lax.log.json
 aws-perms.sh
 schema/
 article-xml
+.scrapy-cache
diff --git a/load-json.sh → load-eif-json.sh b/load-json.sh → load-eif-json.sh
diff --git a/load-elife-json.sh b/load-elife-json.sh
diff --git a/requirements.txt b/requirements.txt
@@ -4,6 +4,7 @@ coverage==4.1
 Django==1.9.11
 django-annoying==0.10.3
 django-autoslug==1.9.3
+django-db-logger==0.1.6
 django-filter==0.15.3
 django-markdown2==0.3.0
 django-rest-swagger==2.1.0
@@ -24,6 +25,7 @@ python-slugify==1.2.1
 pytz==2016.7
 PyYAML==3.12
 requests==2.11.1
+Scrapy==1.2.1
 unittest-xml-reporting==2.1.0
 # sql-explorer optional deps
 tinys3==0.1.12
diff --git a/src/core/settings.py b/src/core/settings.py
@@ -64,11 +64,12 @@ def cfg(path, default=0xDEADBEEF):
     'django.contrib.messages',
     'django.contrib.staticfiles',
 
-    'rest_framework',
-    'rest_framework_swagger',
-    'django_markdown2',
+    'django_markdown2', # landing page is rendered markdown
+    'explorer', # sql creation
+    #'django_db_logger', # logs certain entries to the database
 
-    'explorer',
+    'rest_framework',
+    'rest_framework_swagger', # gui for api
 
     'publisher',
 )
@@ -83,7 +84,7 @@ def cfg(path, default=0xDEADBEEF):
     'django.contrib.messages.middleware.MessageMiddleware',
     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 
-    'core.middleware.KongAuthentication',
+    'core.middleware.KongAuthentication', # sets a header if it looks like an authenticated request
 ]
 
 ROOT_URLCONF = 'core.urls'
@@ -269,53 +270,62 @@ def writable(path):
     },
 
     'handlers': {
-        'file': {
+        # entries go to standard lax.log file
+        'lax.log': {
             'level': 'DEBUG',
             'class': 'logging.FileHandler',
             'filename': LOG_FILE,
             'formatter': 'json',
         },
-        'ingestion': {
+
+        # entries go to stderr
+        'stderr': {
+            'level': 'DEBUG',
+            'class': 'logging.StreamHandler',
+            'formatter': 'brief',
+        },
+
+        # entries go to the lax-ingestion.log file
+        'ingestion.log': {
             'level': 'DEBUG',
             'class': 'logging.FileHandler',
             'filename': INGESTION_LOG_FILE,
             'formatter': 'json',
         },
-        'debug-console': {
-            'level': 'DEBUG',
-            'class': 'logging.StreamHandler',
-            'formatter': 'brief',
-        },
+
+        # entries go to the database
+        #'database': {
+        #    'level': 'DEBUG',
+        #    'class': 'django_db_logger.db_log_handler.DatabaseLogHandler',
+        #    'formatter': 'json',
+        #},
     },
 
     'loggers': {
         '': {
-            'handlers': ['debug-console', 'file'],
+            'handlers': ['stderr', 'lax.log'],
             'level': 'INFO',
             'propagate': True,
         },
-        'publisher.eif_ingestor': {
-            'handlers': ['ingestion'],
-        },
-        'publisher.ejp_ingestor': {
-            'handlers': ['ingestion'],
-        },
-        'publisher.ajson_ingestor': {
-            'handlers': ['ingestion'],
-            #'propagate': False, # prevent propagation to root handler and it's debug-console handler
-        },
-        'publisher.management.commands.import': {
-            'level': 'INFO',
-            'handlers': ['debug-console'],
-        },
-        'publisher.management.commands.ingest': {
-            'level': 'INFO',
-            'handlers': ['ingestion', 'debug-console'],
-        },
         'django.request': {
-            'handlers': ['file'],
+            'handlers': ['lax.log'],
             'level': 'DEBUG',
             'propagate': True,
         },
     },
 }
+
+x = [
+    'publisher.eif_ingestor',
+    'publisher.ejp_ingestor',
+    'publisher.ajson_ingestor',
+    'publisher.management.commands.import',
+    'publisher.management.commands.ingest',
+]
+logger = {
+    'level': 'INFO',
+    #'handlers': ['database', 'ingestion.log', 'lax.log', 'stderr'],
+    'handlers': ['ingestion.log', 'lax.log', 'stderr'],
+    'propagate': False, # don't propagate up to root logger
+}
+LOGGING['loggers'].update(dict(zip(x, [logger] * len(x))))
diff --git a/src/publisher/management/commands/check_article.py b/src/publisher/management/commands/check_article.py
@@ -0,0 +1,159 @@
+from functools import wraps
+from django.core.management.base import BaseCommand
+from publisher.utils import ensure, json_dumps
+from publisher import models
+import os
+from os.path import join
+#from datetime import datetime
+import requests
+from scrapy.selector import Selector
+import re
+import logging
+
+logging.getLogger("requests").setLevel(logging.WARNING)
+LOG = logging.getLogger(__name__)
+
+OUTPUT_DIR = '.scrapy-cache'
+
+def fname(msid):
+    msid = str(int(msid)) # normalize value, strip any leading zeroes
+    return join(OUTPUT_DIR, '%s-article-info.html' % msid)
+
+def slurp(msid):
+    "download the article page for a given elife doi, write content to disk"
+    if os.path.exists(fname(msid)):
+        return
+
+    doi = "10.7554/eLife.%s" % str(msid).zfill(5)
+    # ll: https://elifesciences.org/lookup/doi/10.7554/eLife.17267
+    url = "https://elifesciences.org/lookup/doi/" + doi
+    LOG.debug(url)
+    resp = requests.get(url, allow_redirects=False)
+    ensure(resp.status_code != 404, "404 fetching article: %s" % resp.status_code)
+    art_info_url = resp.headers['Location'] + "/article-info"
+    resp2 = requests.get(art_info_url)
+
+    with open(fname(msid), 'wb') as handle:
+        handle.write(resp2.content)
+
+
+def complement(pred):
+    @wraps(pred)
+    def wrapper(*args, **kwargs):
+        return not pred(*args, **kwargs)
+    return wrapper
+
+def splitfilter(func, data):
+    return filter(func, data), filter(complement(func), data)
+
+
+class Command(BaseCommand):
+    help = '''Repopulates the lax database using the contents of the elife-publishing-eif bucket. If article has multiple attempts, use the most recent attempt. Matching PUBLISHED articles will be downloaded and imported using the `import_articles` command. Published status is determined by an entry in the elife-publishing-archive bucket'''
+
+    def add_arguments(self, parser):
+        parser.add_argument('--msid', dest='msid-list', type=int, nargs='+', required=True)
+
+    def report(self, msid):
+        context = {
+            'msid': msid,
+            'in-database': False,
+            'unpublished-versions': [],
+            'published-versions': [],
+
+            'on-website': False,
+            'web-published-versions': [],
+
+            'state': 'unknown'
+        }
+
+        try:
+            art = models.Article.objects.get(manuscript_id=msid)
+            context['in-database'] = True
+
+            avlist = art.articleversion_set.all()
+            pub, unpub = splitfilter(lambda av: av.datetime_published, avlist)
+            context['published-versions'] = map(lambda av: av.version, pub)
+            context['unpublished-versions'] = map(lambda av: av.version, unpub)
+
+        except models.Article.DoesNotExist:
+            pass
+
+        try:
+            if not os.path.exists(fname(msid)):
+                slurp(msid)
+            context['on-website'] = True
+        except AssertionError:
+            context['on-website'] = False
+
+        if context['on-website']:
+            # scrape the website results, look for version history
+            with open(fname(msid), 'r') as handle:
+                contents = handle.read()
+                obj = Selector(text=contents)
+                root = obj.css("#panels-ajax-tab-container-elife-research-article-tabs ul.issue-toc-list li")
+                values = root.css("::text").extract()
+                cregex = re.compile(r"Version (?P<version>\d?) \((?P<datestr>.*)\)")
+                matches = map(lambda v: cregex.search(v).groupdict(), values)
+
+                def fn(match):
+                    #dateobj = datetime.strptime(match['datestr'], "%B %d, %Y")
+                    return int(match['version'])
+                    # return {
+                    #    'version': match['version'],
+                    #    'pub-date': dateobj.strftime("%Y-%m-%d"),
+                    #}
+                versions = map(fn, matches)
+                if not versions:
+                    # article exists BUT it has no version history yet, so assume a v1
+                    versions = [1]
+                context['web-published-versions'] = versions
+
+        LOG.debug("finished %s report" % msid, extra=context)
+
+        c = context
+
+        if not c['on-website']:
+            # not published, nothing to compare against yet
+            return context
+
+        if not c['in-database']:
+            # article is on website but not in database
+            context['state'] = 'article ingest missing in lax'
+            return context
+
+        # article is on website and in database
+        # compare versions
+
+        lax_pv, web_pv = c['published-versions'], c['web-published-versions']
+
+        if len(lax_pv) > len(web_pv):
+            # there are more versions in lax than in website!
+            missing_versions = set(lax_pv) - set(web_pv)
+            context['state'] = 'website is missing published versions (%s)' % ', '.join(map(str, missing_versions))
+            return context
+
+        if len(lax_pv) < len(web_pv):
+            missing_versions = set(web_pv) - set(lax_pv)
+            context['state'] = 'lax is missing published versions (%s)' % ', '.join(map(str, missing_versions))
+            return context
+
+        # both lax and web are reporting the same number of published versions
+        # ensure version lists are identical
+        if sorted(lax_pv) != sorted(web_pv):
+            context['state'] = 'lax has versions (%s) and website has versions (%s)' % (lax_pv, web_pv)
+            return context
+
+        context['state'] = 'no problems detected'
+
+        return context
+
+    def handle(self, *args, **options):
+        try:
+            msid_list = options['msid-list']
+            results = dict(zip(msid_list, map(self.report, options['msid-list'])))
+            self.stdout.write(json_dumps(results, indent=4))
+            self.stdout.flush()
+            exit(0)
+        except Exception as err:
+            LOG.exception("unhandled exception generating article report: %s", err)
+            exit(1)
diff --git a/src/publisher/tests/base.py b/src/publisher/tests/base.py
@@ -1,6 +1,8 @@
+from StringIO import StringIO
 import os
 from django.test import TestCase
 from publisher import models
+from django.core.management import call_command
 
 class BaseCase(TestCase):
     this_dir = os.path.dirname(os.path.realpath(__file__))
@@ -21,4 +23,12 @@ def unpublish(self, msid, version=None):
             av = models.ArticleVersion.objects.get(article__manuscript_id=msid, version=version)
             av.datetime_published = None
             av.save()
-        # return av
+
+    def call_command(self, *args, **kwargs):
+        stdout = StringIO()
+        try:
+            kwargs['stdout'] = stdout
+            call_command(*args, **kwargs)
+        except SystemExit as err:
+            return err.code, stdout
+        self.fail("ingest script should always throw a systemexit()")