Skip to content
Browse files

Finishes the re-write for bulk files, closing #285.

  • Loading branch information
mlissner committed Sep 23, 2014
1 parent 86916f5 commit 52e8eff985fdf75612837cef4d9ef55ad60f29ad
No changes.
No changes.
@@ -0,0 +1,115 @@
import StringIO
import os
import shutil
import tarfile
import time
import errno

from alert.lib.db_tools import queryset_generator
from alert.lib.timer import print_timing
from import Court, Document
from import BaseCommand
from django.conf import settings
from audio.models import Audio

def mkdir_p(path):
    """Makes a directory path, but doesn't crash if the path already exists.

    Equivalent to ``mkdir -p``: intermediate directories are created as
    needed, and an existing directory at ``path`` is not an error. Any other
    OSError (e.g. permission denied, or a non-directory file already at
    ``path``) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            # The directory already exists -- that's fine.
            pass
        else:
            raise

class Command(BaseCommand):
    help = 'Create the bulk files for all jurisdictions and for "all".'

    def handle(self, *args, **options):
        # All real work happens in do_everything() so it can carry the
        # @print_timing decorator (see its docstring).
        self.do_everything()

    @print_timing
    def do_everything(self):
        """We can't wrap the handle() function, but we can wrap this one."""
        # Imported here rather than at module level to avoid a circular
        # import at Django start-up.
        from alert.search import api2
        self.stdout.write('Starting bulk file creation...\n')
        arg_tuples = (
            ('opinion', Document, api2.DocumentResource),
            ('oral-argument', Audio, api2.OralArgumentResource),
        )
        for obj_type_str, obj_type, api_resource_obj in arg_tuples:
            self.make_archive(obj_type_str, obj_type, api_resource_obj)
            self.swap_archives(obj_type_str)

    def swap_archives(self, obj_type_str):
        """Swap out new archives for the old."""
        self.stdout.write(' - Swapping in the new %s archives...\n'
                          % obj_type_str)
        # BUG FIX: the original created '%s' % obj_type_str but moved files
        # into '%ss' % obj_type_str -- a directory that was never created,
        # so every shutil.move would fail. Both now use the same path.
        target_dir = os.path.join(settings.DUMP_DIR, '%s' % obj_type_str)
        mkdir_p(target_dir)
        for f in os.listdir('/tmp/bulk/%s' % obj_type_str):
            shutil.move('/tmp/bulk/%s/%s' % (obj_type_str, f), target_dir)

    def make_archive(self, obj_type_str, obj_type, api_resource_obj):
        """Generate compressed archives containing the contents of an object
        database.

        There are a few tricks to this, but the main one is that each item in
        the database goes into two files, all.tar.gz and {court}.tar.gz. This
        means that if we want to avoid iterating the database once per file,
        we need to generate all 350+ jurisdiction files simultaneously.

        We do this by making a dict of open file handles and adding each item
        to the correct two files: The all.tar.gz file and the {court}.tar.gz
        file.
        """
        courts = Court.objects.all()
        self.stdout.write(' - Creating %s bulk %s files '
                          'simultaneously...\n' % (len(courts), obj_type_str))

        mkdir_p('/tmp/bulk/%s' % obj_type_str)

        # Open a gzip'ed tar file for every court, plus one for "all".
        tar_files = {}
        for court in courts:
            tar_files[court.pk] = tarfile.open(
                '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
                mode='w:gz',
            )
        tar_files['all'] = tarfile.open(
            '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
            mode='w:gz',
        )

        # Make the archives. queryset_generator keeps memory flat while
        # iterating the whole table.
        qs = obj_type.objects.all()
        item_resource = api_resource_obj()
        item_list = queryset_generator(qs)
        for item in item_list:
            # NOTE(review): the serialize() arguments were truncated in the
            # captured source; this is the standard Tastypie idiom for
            # serializing a single object -- confirm against the repository.
            json_str = item_resource.serialize(
                None,
                item_resource.full_dehydrate(
                    item_resource.build_bundle(obj=item)),
                'application/json',
            )

            # Add the json str to the two tarballs
            tarinfo = tarfile.TarInfo("%s.json" % item.pk)
            tarinfo.size = len(json_str)
            tarinfo.mtime = time.mktime(item.date_modified.timetuple())
            tarinfo.type = tarfile.REGTYPE

            # NOTE(review): which two handles received addfile() was lost in
            # the capture; the item's court and 'all' follow from the
            # docstring -- confirm the court lookup attribute.
            tar_files[item.docket.court_id].addfile(
                tarinfo, StringIO.StringIO(json_str))
            tar_files['all'].addfile(
                tarinfo, StringIO.StringIO(json_str))

        # Close off all the gzip'ed tar files
        for court in courts:
            tar_files[court.pk].close()
        tar_files['all'].close()

        self.stdout.write(' - all %s bulk files created.\n' % obj_type_str)
@@ -1,11 +1,8 @@
from datetime import timedelta
import os
import time
from django.conf import settings
from django.test import TestCase
from django.utils.timezone import now
from alert.lib.dump_lib import make_dump_file
from import Docket, Citation, Court, Document
from import Command

class BulkDataTest(TestCase):
    """Tests of the bulk-data API endpoints.

    NOTE(review): large parts of this class were truncated in the captured
    source (fixture creation in setUp, assertion arguments). Reconstructed
    statements are flagged below -- confirm each against the repository.
    """

    def setUp(self):
        # NOTE(review): the statements creating the test court/docket/
        # citation/document (bound to self.doc) were lost in the capture and
        # must be restored from the repository.
        last_month = now().date() - timedelta(days=30)
        self.month = last_month.month
        self.year = last_month.year
        self.day = last_month.day
        self.now = now().date()

    def tearDown(self):
        # NOTE(review): reconstructed -- the original deleted the document
        # created in setUp.
        self.doc.delete()

    def test_no_year_provided_with_court_provided(self):
        """When a user doesn't provide a year and wants everything for a
        particular court, do we properly throw a 400 error?
        """
        r = self.client.get('/api/bulk/test.xml.gz')
        self.assertEqual(
            r.status_code, 400,
            msg="Should have gotten HTTP code 400. Instead got: %s"
                % r.status_code)

    def test_no_year_provided_all_courts_requested(self):
        """If a user requests everything, do we give it to them?"""
        start_moment = time.time()
        qs = Document.objects.all()
        filename = 'all.xml'
        make_dump_file(qs, settings.DUMP_DIR, filename)
        r = self.client.get('/api/bulk/all.xml.gz')

        # Normally, the redirect hands the user off to Apache, which serves
        # the file. Since we don't always have apache set up, we make sure we
        # get redirected and we check that the file exists on disk with a
        # non-zero filesize.
        self.assertEqual(r.status_code, 302,
                         msg="Redirection to bulk file failed.")
        file_path = os.path.join(settings.DUMP_DIR, filename + '.gz')
        self.assertGreater(os.path.getsize(file_path), 0,
                           msg="Bulk data file does not have content.")
        self.assertGreater(
            os.path.getmtime(file_path), start_moment,
            msg="File was created before the test was run, indicating it "
                "predates this test.")

    def test_year_based_bulk_file(self):
        """Do we generate and provide year-based bulk files properly?"""
        r = self.client.get('/api/bulk/%s/test.xml.gz' % self.year)
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))

    def test_month_based_bulk_file(self):
        """Do we generate and provide month-based bulk files properly?"""
        r = self.client.get('/api/bulk/%s/%s/test.xml.gz'
                            % (self.year, self.month))
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))

    def test_day_based_bulk_file_twice(self):
        """Do we generate and provide day-based bulk files properly?

        When they come from the cache the second time, does it still work?
        """
        r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz'
                            % (self.year, self.month, self.day))
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))
        # 2x!
        r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz'
                            % (self.year, self.month, self.day))
        self.assertEqual(r.status_code, 302,
                         msg="Got status code of %s with content: %s" %
                             (r.status_code, r.content))

    def test_month_not_yet_complete(self):
        """A limitation is that we do not serve files until the month is
        complete. Do we throw the proper error when this is the case?
        """
        r = self.client.get('/api/bulk/%s/%s/test.xml.gz'
                            % (self.now.year, self.now.month))
        self.assertEqual(r.status_code, 400)
        self.assertIn('partially in the future', r.content,
                      msg="Did not get correct error message. "
                          "Instead got: %s" % r.content)

    def test_month_completely_in_the_future(self):
        """Do we throw an error when a date in the future is requested?"""
        r = self.client.get('/api/bulk/%s/%s/test.xml.gz'
                            % (self.now.year + 1, self.now.month))
        self.assertEqual(r.status_code, 400)
        self.assertIn('date is in the future', r.content,
                      msg="Did not get correct error message. "
                          "Instead got: %s" % r.content)

    def test_no_data_for_time_period(self):
        """If we lack data for a period of time, do we throw an error?"""
        r = self.client.get('/api/bulk/1982/06/09/test.xml.gz')
        self.assertEqual(r.status_code, 404)
        self.assertIn('not have any data', r.content,
                      msg="Did not get correct error message. "
                          "Instead got: %s" % r.content)

    def test_make_all_bulk_files(self):
        """Can we successfully generate all bulk files?"""
        # NOTE(review): reconstructed -- exercises the management command
        # imported at the top of this file.
        Command().do_everything()
@@ -1,27 +1,22 @@
from alert.api.views import (
    court_index, documentation_index, dump_index, rest_index,
    serve_or_gen_dump, serve_pagerank_file, coverage_data,
)
from alert.urls import pacer_codes
from django.conf.urls import patterns

urlpatterns = patterns('',
    # Documentation
    (r'^api/$', documentation_index),
    (r'^api/jurisdictions/$', court_index),
    (r'^api/rest-info/$', rest_index),
    (r'^api/bulk-info/$', dump_index),

    # Bulk data dumps.
    # NOTE(review): the view callables closing these four tuples were lost
    # in the captured source; serve_or_gen_dump matches the captured regex
    # groups (court, year, month, day) -- confirm against the repository.
    (r'^api/bulk/(?P<court>all|%s)\.xml\.gz$' % "|".join(pacer_codes),
     serve_or_gen_dump),
    (r'^api/bulk/(?P<year>\d{4})/(?P<court>all|%s)\.xml\.gz$'
     % "|".join(pacer_codes),
     serve_or_gen_dump),
    (r'^api/bulk/(?P<year>\d{4})/(?P<month>\d{1,2})/(?P<court>all|%s)\.xml\.gz$'
     % "|".join(pacer_codes),
     serve_or_gen_dump),
    (r'^api/bulk/(?P<year>\d{4})/(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<court>all|%s)\.xml\.gz$'
     % "|".join(pacer_codes),
     serve_or_gen_dump),

    # Pagerank file
    (r'^api/bulk/external_pagerank/$', serve_pagerank_file),

    # Coverage API
    (r'^api/rest/v[12]/coverage/(all|%s)/' % '|'.join(pacer_codes),
     coverage_data),
)
@@ -1,21 +1,15 @@
import json
import os

# NOTE(review): the captured diff interleaved the old and new import lists;
# this is the deduplicated union with truncated paths reconstructed --
# confirm against the repository. No import has been dropped.
from alert import settings as alert_settings
from alert.lib import magic, search_utils
from alert.lib.db_tools import queryset_generator_by_date
from alert.lib.dump_lib import get_date_range, make_dump_file
from alert.lib.filesize import size
from alert.lib.sunburnt import sunburnt
from alert.search.models import Court, Document
from alert.stats import tally_stat

from django.conf import settings
from django.http import (Http404, HttpResponse, HttpResponseBadRequest,
                         HttpResponseRedirect)
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.utils.timezone import now


def dump_index(request):
    """Renders the bulk-data index page.

    Shows the court count and, when the all.xml.gz dump is on disk, its
    human-readable size (falling back to an estimate when it is not).
    """
    courts = make_court_variable()
    court_count = len(courts)
    try:
        dump_size = size(os.path.getsize(
            os.path.join(settings.DUMP_DIR, 'all.xml.gz')))
    except os.error:
        # Happens when the file is inaccessible or doesn't exist.
        # An estimate.
        dump_size = 'about 13GB'
    return render_to_response('api/dumps.html',
                              {'court_count': court_count,
                               'courts': courts,
                               'dump_size': dump_size,
                               'private': False},
                              RequestContext(request))

def serve_or_gen_dump(request, court, year=None, month=None, day=None):
    """Serves the dump file to the user, generating it if needed.

    With no year, only court='all' is valid and redirects to the complete
    dump. Date-based requests are validated against today's date, served
    from disk when the gzip'ed file already exists, and otherwise generated
    from the matching Documents before redirecting.
    """
    if year is None:
        if court != 'all':
            # Sanity check
            return HttpResponseBadRequest('<h2>Error 400: Complete dumps are '
                'not available for individual courts. Try using "all" for '
                'your court ID instead.</h2>')
        # Serve the dump for all cases.
        return HttpResponseRedirect('/dumps/all.xml.gz')

    # Date-based dump
    start_date, end_date, annual, monthly, daily = get_date_range(
        year, month, day)

    today = now().date()
    # Ensure that it's a valid request.
    if (today < end_date) and (today < start_date):
        # It's the future. They fail.
        return HttpResponseBadRequest('<h2>Error 400: Requested date is in the future. Please try again then.</h2>')
    elif today <= end_date:
        # Some of the data is in the past, some could be in the future.
        # NOTE(review): the second half of this message was truncated in the
        # captured source; tests assert it contains 'partially in the
        # future' -- restore the exact wording from the repository.
        return HttpResponseBadRequest('<h2>Error 400: Requested date is partially in the future. Please try again '
                                      'when the month is over.</h2>')

    filename = court + '.xml'
    if daily:
        filepath = os.path.join(year, month, day)
    elif monthly:
        filepath = os.path.join(year, month)
    elif annual:
        filepath = os.path.join(year)

    path_from_root = os.path.join(settings.DUMP_DIR, filepath)

    # See if we already have it on disk.
    try:
        _ = open(os.path.join(path_from_root, filename + '.gz'), 'rb')
        return HttpResponseRedirect(
            os.path.join('/dumps', filepath, filename + '.gz'))
    except IOError:
        # Time-based dump
        if court == 'all':
            # dump everything; disable default ordering
            qs = Document.objects.all().order_by()
        else:
            # dump just the requested court; disable default ordering
            qs = Document.objects.filter(docket__court=court).order_by()

        # check if there are any documents at all
        dump_has_docs = qs.filter(date_filed__gte=start_date,
                                  date_filed__lte=end_date).exists()
        if dump_has_docs:
            docs_to_dump = queryset_generator_by_date(
                qs, 'date_filed', start_date, end_date)
            make_dump_file(docs_to_dump, path_from_root, filename)
        else:
            # NOTE(review): the body text says 404 but the captured source
            # used HttpResponseBadRequest (status 400); tests expect a real
            # 404, so an explicit status is set here.
            return HttpResponse(
                '<h2>Error 404: We do not have any data for this time period.</h2>',
                status=404)

        return HttpResponseRedirect(
            '%s.gz' % os.path.join('/dumps', filepath, filename))

def serve_pagerank_file(request):

0 comments on commit 52e8eff

Please sign in to comment.
You can’t perform that action at this time.