Skip to content
Browse files

Finishes the re-write for bulk files, closing #285.

  • Loading branch information
mlissner committed Sep 23, 2014
1 parent 86916f5 commit 52e8eff985fdf75612837cef4d9ef55ad60f29ad
No changes.
No changes.
@@ -0,0 +1,115 @@
import StringIO
import os
import shutil
import tarfile
import time
import errno

from alert.lib.db_tools import queryset_generator
from alert.lib.timer import print_timing
from import Court, Document
from import BaseCommand
from django.conf import settings
from audio.models import Audio

def mkdir_p(path):
    """Makes a directory path, but doesn't crash if the path already exists.

    Equivalent to ``mkdir -p``: intermediate directories are created as
    needed, and an existing directory at ``path`` is not an error. Any other
    OSError (e.g. permission denied, or a non-directory file already at
    ``path``) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            # The directory already exists -- that's fine.
            pass
        else:
            raise

class Command(BaseCommand):
    help = 'Create the bulk files for all jurisdictions and for "all".'

    def handle(self, *args, **options):
        # All real work happens in do_everything() so it can carry the
        # @print_timing decorator (see its docstring).
        self.do_everything()

    @print_timing
    def do_everything(self):
        """We can't wrap the handle() function, but we can wrap this one."""
        # Imported here rather than at module level to avoid a circular
        # import at Django start-up.
        from alert.search import api2
        self.stdout.write('Starting bulk file creation...\n')
        arg_tuples = (
            ('opinion', Document, api2.DocumentResource),
            ('oral-argument', Audio, api2.OralArgumentResource),
        )
        for obj_type_str, obj_type, api_resource_obj in arg_tuples:
            self.make_archive(obj_type_str, obj_type, api_resource_obj)
            self.swap_archives(obj_type_str)

    def swap_archives(self, obj_type_str):
        """Swap out new archives for the old."""
        self.stdout.write(' - Swapping in the new %s archives...\n'
                          % obj_type_str)
        # BUG FIX: the original created '%s' % obj_type_str but moved files
        # into '%ss' % obj_type_str -- a directory that was never created,
        # so every shutil.move would fail. Both now use the same path.
        target_dir = os.path.join(settings.DUMP_DIR, '%s' % obj_type_str)
        mkdir_p(target_dir)
        for f in os.listdir('/tmp/bulk/%s' % obj_type_str):
            shutil.move('/tmp/bulk/%s/%s' % (obj_type_str, f), target_dir)

    def make_archive(self, obj_type_str, obj_type, api_resource_obj):
        """Generate compressed archives containing the contents of an object
        database.

        There are a few tricks to this, but the main one is that each item in
        the database goes into two files, all.tar.gz and {court}.tar.gz. This
        means that if we want to avoid iterating the database once per file,
        we need to generate all 350+ jurisdiction files simultaneously.

        We do this by making a dict of open file handles and adding each item
        to the correct two files: The all.tar.gz file and the {court}.tar.gz
        file.
        """
        courts = Court.objects.all()
        self.stdout.write(' - Creating %s bulk %s files '
                          'simultaneously...\n' % (len(courts), obj_type_str))

        mkdir_p('/tmp/bulk/%s' % obj_type_str)

        # Open a gzip'ed tar file for every court, plus one for "all".
        tar_files = {}
        for court in courts:
            tar_files[court.pk] = tarfile.open(
                '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
                mode='w:gz',
            )
        tar_files['all'] = tarfile.open(
            '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
            mode='w:gz',
        )

        # Make the archives. queryset_generator keeps memory flat while
        # iterating the whole table.
        qs = obj_type.objects.all()
        item_resource = api_resource_obj()
        item_list = queryset_generator(qs)
        for item in item_list:
            # NOTE(review): the serialize() arguments were truncated in the
            # captured source; this is the standard Tastypie idiom for
            # serializing a single object -- confirm against the repository.
            json_str = item_resource.serialize(
                None,
                item_resource.full_dehydrate(
                    item_resource.build_bundle(obj=item)),
                'application/json',
            )

            # Add the json str to the two tarballs
            tarinfo = tarfile.TarInfo("%s.json" % item.pk)
            tarinfo.size = len(json_str)
            tarinfo.mtime = time.mktime(item.date_modified.timetuple())
            tarinfo.type = tarfile.REGTYPE

            # NOTE(review): which two handles received addfile() was lost in
            # the capture; the item's court and 'all' follow from the
            # docstring -- confirm the court lookup attribute.
            tar_files[item.docket.court_id].addfile(
                tarinfo, StringIO.StringIO(json_str))
            tar_files['all'].addfile(
                tarinfo, StringIO.StringIO(json_str))

        # Close off all the gzip'ed tar files
        for court in courts:
            tar_files[court.pk].close()
        tar_files['all'].close()

        self.stdout.write(' - all %s bulk files created.\n' % obj_type_str)
@@ -1,11 +1,8 @@
from datetime import timedelta
import os
import time
from django.conf import settings
from django.test import TestCase
from django.utils.timezone import now
from alert.lib.dump_lib import make_dump_file
from import Docket, Citation, Court, Document
from import Command

class BulkDataTest(TestCase):
    """Tests of the bulk-data API endpoints.

    NOTE(review): large parts of this class were truncated in the captured
    source (fixture creation in setUp, assertion arguments). Reconstructed
    statements are flagged below -- confirm each against the repository.
    """

    def setUp(self):
        # NOTE(review): the statements creating the test court/docket/
        # citation/document (bound to self.doc) were lost in the capture and
        # must be restored from the repository.
        last_month = now().date() - timedelta(days=30)
        self.month = last_month.month
        self.year = last_month.year
        self.day = last_month.day
        self.now = now().date()

    def tearDown(self):
        # NOTE(review): reconstructed -- the original deleted the document
        # created in setUp.
        self.doc.delete()

    def test_no_year_provided_with_court_provided(self):
        """When a user doesn't provide a year and wants everything for a
        particular court, do we properly throw a 400 error?
        """
        r = self.client.get('/api/bulk/test.xml.gz')
        self.assertEqual(
            r.status_code, 400,
            msg="Should have gotten HTTP code 400. Instead got: %s"
                % r.status_code)

    def test_no_year_provided_all_courts_requested(self):
        """If a user requests everything, do we give it to them?"""
        start_moment = time.time()
        qs = Document.objects.all()
        filename = 'all.xml'
        make_dump_file(qs, settings.DUMP_DIR, filename)
        r = self.client.get('/api/bulk/all.xml.gz')

        # Normally, the redirect hands the user off to Apache, which serves
        # the file. Since we don't always have apache set up, we make sure we
        # get redirected and we check that the file exists on disk with a
        # non-zero filesize.
        self.assertEqual(r.status_code, 302,
                         msg="Redirection to bulk file failed.")
        file_path = os.path.join(settings.DUMP_DIR, filename + '.gz')
        self.assertGreater(os.path.getsize(file_path), 0,
                           msg="Bulk data file does not have content.")
        self.assertGreater(
            os.path.getmtime(file_path), start_moment,
            msg="File was created before the test was run, indicating it "
                "predates this test.")

    def test_year_based_bulk_file(self):
        """Do we generate and provide year-based bulk files properly?"""
        r = self.client.get('/api/bulk/%s/test.xml.gz' % self.year)
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))

    def test_month_based_bulk_file(self):
        """Do we generate and provide month-based bulk files properly?"""
        r = self.client.get('/api/bulk/%s/%s/test.xml.gz'
                            % (self.year, self.month))
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))

    def test_day_based_bulk_file_twice(self):
        """Do we generate and provide day-based bulk files properly?

        When they come from the cache the second time, does it still work?
        """
        r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz'
                            % (self.year, self.month, self.day))
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))
        # 2x!
        r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz'
                            % (self.year, self.month, self.day))
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))

    def test_month_not_yet_complete(self):
        """A limitation is that we do not serve files until the month is
        complete. Do we throw the proper error when this is the case?
        """
        r = self.client.get('/api/bulk/%s/%s/test.xml.gz'
                            % (self.now.year, self.now.month))
        self.assertEqual(r.status_code, 400)
        self.assertIn('partially in the future', r.content,
                      msg="Did not get correct error message. "
                          "Instead got: %s" % r.content)

    def test_month_completely_in_the_future(self):
        """Do we throw an error when a date in the future is requested?"""
        r = self.client.get('/api/bulk/%s/%s/test.xml.gz'
                            % (self.now.year + 1, self.now.month))
        self.assertEqual(r.status_code, 400)
        self.assertIn('date is in the future', r.content,
                      msg="Did not get correct error message. "
                          "Instead got: %s" % r.content)

    def test_no_data_for_time_period(self):
        """If we lack data for a period of time, do we throw an error?"""
        r = self.client.get('/api/bulk/1982/06/09/test.xml.gz')
        self.assertEqual(r.status_code, 404)
        self.assertIn('not have any data', r.content,
                      msg="Did not get correct error message. "
                          "Instead got: %s" % r.content)

    def test_make_all_bulk_files(self):
        """Can we successfully generate all bulk files?"""
        # NOTE(review): reconstructed -- exercises the management command
        # imported at the top of this file.
        Command().do_everything()
@@ -1,27 +1,22 @@
from alert.api.views import (
    court_index, documentation_index, dump_index, rest_index,
    serve_or_gen_dump, serve_pagerank_file, coverage_data,
)
from alert.urls import pacer_codes
from django.conf.urls import patterns

urlpatterns = patterns('',
    # Documentation
    (r'^api/$', documentation_index),
    (r'^api/jurisdictions/$', court_index),
    (r'^api/rest-info/$', rest_index),
    (r'^api/bulk-info/$', dump_index),

    # Bulk data dumps.
    # NOTE(review): the view callables closing these four tuples were lost
    # in the captured source; serve_or_gen_dump matches the captured regex
    # groups (court, year, month, day) -- confirm against the repository.
    (r'^api/bulk/(?P<court>all|%s)\.xml\.gz$' % "|".join(pacer_codes),
     serve_or_gen_dump),
    (r'^api/bulk/(?P<year>\d{4})/(?P<court>all|%s)\.xml\.gz$'
     % "|".join(pacer_codes),
     serve_or_gen_dump),
    (r'^api/bulk/(?P<year>\d{4})/(?P<month>\d{1,2})/(?P<court>all|%s)\.xml\.gz$'
     % "|".join(pacer_codes),
     serve_or_gen_dump),
    (r'^api/bulk/(?P<year>\d{4})/(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<court>all|%s)\.xml\.gz$'
     % "|".join(pacer_codes),
     serve_or_gen_dump),

    # Pagerank file
    (r'^api/bulk/external_pagerank/$', serve_pagerank_file),

    # Coverage API
    (r'^api/rest/v[12]/coverage/(all|%s)/' % '|'.join(pacer_codes),
     coverage_data),
)
@@ -1,21 +1,15 @@
import json
import os

# NOTE(review): the captured diff interleaved the old and new import lists;
# this is the deduplicated union with truncated paths reconstructed --
# confirm against the repository. No import has been dropped.
from alert import settings as alert_settings
from alert.lib import magic, search_utils
from alert.lib.db_tools import queryset_generator_by_date
from alert.lib.dump_lib import get_date_range, make_dump_file
from alert.lib.filesize import size
from alert.lib.sunburnt import sunburnt
from alert.search.models import Court, Document
from alert.stats import tally_stat

from django.conf import settings
from django.http import (Http404, HttpResponse, HttpResponseBadRequest,
                         HttpResponseRedirect)
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.utils.timezone import now


def dump_index(request):
    """Renders the bulk-data index page.

    Shows the court count and, when the all.xml.gz dump is on disk, its
    human-readable size (falling back to an estimate when it is not).
    """
    courts = make_court_variable()
    court_count = len(courts)
    try:
        dump_size = size(os.path.getsize(
            os.path.join(settings.DUMP_DIR, 'all.xml.gz')))
    except os.error:
        # Happens when the file is inaccessible or doesn't exist.
        # An estimate.
        dump_size = 'about 13GB'
    return render_to_response('api/dumps.html',
                              {'court_count': court_count,
                               'courts': courts,
                               'dump_size': dump_size,
                               'private': False},
                              RequestContext(request))

def serve_or_gen_dump(request, court, year=None, month=None, day=None):
    """Serves the dump file to the user, generating it if needed.

    With no year, only court='all' is valid and redirects to the complete
    dump. Date-based requests are validated against today's date, served
    from disk when the gzip'ed file already exists, and otherwise generated
    from the matching Documents before redirecting.
    """
    if year is None:
        if court != 'all':
            # Sanity check
            return HttpResponseBadRequest('<h2>Error 400: Complete dumps are '
                'not available for individual courts. Try using "all" for '
                'your court ID instead.</h2>')
        # Serve the dump for all cases.
        return HttpResponseRedirect('/dumps/all.xml.gz')

    # Date-based dump
    start_date, end_date, annual, monthly, daily = get_date_range(
        year, month, day)

    today = now().date()
    # Ensure that it's a valid request.
    if (today < end_date) and (today < start_date):
        # It's the future. They fail.
        return HttpResponseBadRequest('<h2>Error 400: Requested date is in the future. Please try again then.</h2>')
    elif today <= end_date:
        # Some of the data is in the past, some could be in the future.
        # NOTE(review): the second half of this message was truncated in the
        # captured source; tests assert it contains 'partially in the
        # future' -- restore the exact wording from the repository.
        return HttpResponseBadRequest('<h2>Error 400: Requested date is partially in the future. Please try again '
                                      'when the month is over.</h2>')

    filename = court + '.xml'
    if daily:
        filepath = os.path.join(year, month, day)
    elif monthly:
        filepath = os.path.join(year, month)
    elif annual:
        filepath = os.path.join(year)

    path_from_root = os.path.join(settings.DUMP_DIR, filepath)

    # See if we already have it on disk.
    try:
        _ = open(os.path.join(path_from_root, filename + '.gz'), 'rb')
        return HttpResponseRedirect(
            os.path.join('/dumps', filepath, filename + '.gz'))
    except IOError:
        # Time-based dump
        if court == 'all':
            # dump everything; disable default ordering
            qs = Document.objects.all().order_by()
        else:
            # dump just the requested court; disable default ordering
            qs = Document.objects.filter(docket__court=court).order_by()

        # check if there are any documents at all
        dump_has_docs = qs.filter(date_filed__gte=start_date,
                                  date_filed__lte=end_date).exists()
        if dump_has_docs:
            docs_to_dump = queryset_generator_by_date(
                qs, 'date_filed', start_date, end_date)
            make_dump_file(docs_to_dump, path_from_root, filename)
        else:
            # NOTE(review): the body text says 404 but the captured source
            # used HttpResponseBadRequest (status 400); tests expect a real
            # 404, so an explicit status is set here.
            return HttpResponse(
                '<h2>Error 404: We do not have any data for this time period.</h2>',
                status=404)

        return HttpResponseRedirect(
            '%s.gz' % os.path.join('/dumps', filepath, filename))

def serve_pagerank_file(request):

0 comments on commit 52e8eff

Please sign in to comment.
You can’t perform that action at this time.