Finishes the re-write for bulk files, closing #285.
mlissner committed Sep 23, 2014
Parent: 86916f5 · Commit: 52e8eff
Showing 24 changed files with 610 additions and 796 deletions.
Empty file.
Empty file.
115 changes: 115 additions & 0 deletions alert/api/management/commands/cl_make_bulk_data.py
@@ -0,0 +1,115 @@
import StringIO
import os
import shutil
import tarfile
import time
import errno

from alert.lib.db_tools import queryset_generator
from alert.lib.timer import print_timing
from alert.search.models import Court, Document
from django.core.management import BaseCommand
from django.conf import settings
from audio.models import Audio


def mkdir_p(path):
"""Makes a directory path, but doesn't crash if the path already exists."""
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else:
raise


class Command(BaseCommand):
help = 'Create the bulk files for all jurisdictions and for "all".'

def handle(self, *args, **options):
self.do_everything()

@print_timing
def do_everything(self):
"""We can't wrap the handle() function, but we can wrap this one."""
from alert.search import api2
self.stdout.write('Starting bulk file creation...\n')
arg_tuples = (
('opinion', Document, api2.DocumentResource),
('oral-argument', Audio, api2.OralArgumentResource),
)
for obj_type_str, obj_type, api_resource_obj in arg_tuples:
self.make_archive(obj_type_str, obj_type, api_resource_obj)
self.swap_archives(obj_type_str)
self.stdout.write('Done.\n\n')

def swap_archives(self, obj_type_str):
"""Swap out new archives for the old."""
self.stdout.write(' - Swapping in the new %s archives...\n'
% obj_type_str)
mkdir_p(os.path.join(settings.DUMP_DIR, '%s' % obj_type_str))
for f in os.listdir('/tmp/bulk/%s' % obj_type_str):
shutil.move('/tmp/bulk/%s/%s' % (obj_type_str, f),
os.path.join(settings.DUMP_DIR, '%ss' % obj_type_str))

def make_archive(self, obj_type_str, obj_type, api_resource_obj):
"""Generate compressed archives containing the contents of an object
database.
There are a few tricks to this, but the main one is that each item in
the database goes into two files, all.tar.gz and {court}.tar.gz. This
means that if we want to avoid iterating the database once per file,
we need to generate all 350+ jurisdiction files simultaneously.
We do this by making a dict of open file handles and adding each item
to the correct two files: The all.tar.gz file and the {court}.tar.gz
file.
"""
courts = Court.objects.all()
self.stdout.write(' - Creating %s bulk %s files '
'simultaneously...\n' % (len(courts), obj_type_str))

mkdir_p('/tmp/bulk/%s' % obj_type_str)

# Open a gzip'ed tar file for every court
tar_files = {}
for court in courts:
tar_files[court.pk] = tarfile.open(
'/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
mode='w:gz'
)
tar_files['all'] = tarfile.open(
'/tmp/bulk/%s/all.tar.gz' % obj_type_str,
mode='w:gz'
)

# Make the archives
qs = obj_type.objects.all()
item_resource = api_resource_obj()
item_list = queryset_generator(qs)
for item in item_list:
json_str = item_resource.serialize(
None,
item_resource.full_dehydrate(
item_resource.build_bundle(obj=item)),
'application/json',
).encode('utf-8')

# Add the json str to the two tarballs
tarinfo = tarfile.TarInfo("%s.json" % item.pk)
tarinfo.size = len(json_str)
tarinfo.mtime = time.mktime(item.date_modified.timetuple())
tarinfo.type = tarfile.REGTYPE

tar_files[item.docket.court_id].addfile(
tarinfo, StringIO.StringIO(json_str))
tar_files['all'].addfile(
tarinfo, StringIO.StringIO(json_str))

# Close off all the gzip'ed tar files
for court in courts:
tar_files[court.pk].close()
tar_files['all'].close()

self.stdout.write(' - all %s bulk files created.\n' % obj_type_str)
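
The make_archive docstring above describes the core trick: a single pass over the database, with each serialized item fanned out into both its court's tarball and the combined all.tar.gz. Stripped of the Django and Tastypie plumbing, that pattern reduces to roughly the following sketch (the court codes and records are made up for illustration; this is not code from the commit):

import StringIO
import tarfile
import time

# Hypothetical stand-ins for the real Court and Document querysets.
courts = ['scotus', 'ca9']
records = [
    {'pk': 1, 'court': 'scotus', 'json': '{"id": 1}'},
    {'pk': 2, 'court': 'ca9', 'json': '{"id": 2}'},
]

# One open gzip'ed tar handle per court, plus one for everything.
tars = dict((c, tarfile.open('/tmp/%s.tar.gz' % c, 'w:gz')) for c in courts)
tars['all'] = tarfile.open('/tmp/all.tar.gz', 'w:gz')

for rec in records:
    info = tarfile.TarInfo('%s.json' % rec['pk'])
    info.size = len(rec['json'])
    info.mtime = time.time()
    # Each record is written to exactly two archives: its court's and 'all'.
    tars[rec['court']].addfile(info, StringIO.StringIO(rec['json']))
    tars['all'].addfile(info, StringIO.StringIO(rec['json']))

for t in tars.values():
    t.close()

Keeping every tarfile handle open for the whole pass is what lets the 350+ per-court archives be built in a single iteration of the queryset rather than one pass per file.
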
100 changes: 4 additions & 96 deletions alert/api/tests.py
@@ -1,11 +1,8 @@
from datetime import timedelta
import os
import time
from django.conf import settings
from django.test import TestCase
from django.utils.timezone import now
from alert.lib.dump_lib import make_dump_file
from alert.search.models import Docket, Citation, Court, Document
from api.management.commands.cl_make_bulk_data import Command


class BulkDataTest(TestCase):
@@ -28,98 +25,9 @@ def setUp(self):
)
self.doc.save(index=False)

self.day = last_month.day
self.month = last_month.month
self.year = last_month.year
self.now = now().date()

def tearDown(self):
self.doc.delete()

def test_no_year_provided_with_court_provided(self):
"""When a user doesn't provide a year and wants everything for a
particular court, do we properly throw a 400 error?
"""
r = self.client.get('/api/bulk/test.xml.gz')
self.assertEqual(
r.status_code,
400,
msg="Should have gotten HTTP code 400. Instead got: %s" % r.status_code
)

def test_no_year_provided_all_courts_requested(self):
"""If a user requests everything, do we give it to them?"""
start_moment = time.time()
qs = Document.objects.all()
filename = 'all.xml'
make_dump_file(qs, settings.DUMP_DIR, filename)
r = self.client.get('/api/bulk/all.xml.gz')

# Normally, the redirect hands the user off to Apache, which serves the file.
# Since we don't always have apache set up, we make sure we get redirected and
# we check that the file exists on disk with a non-zero filesize.
self.assertEqual(
r.status_code,
302,
msg="Redirection to bulk file failed."
)
file_path = os.path.join(settings.DUMP_DIR, filename + '.gz')
self.assertGreater(
os.path.getsize(file_path),
0,
msg="Bulk data file does not have content."
)
self.assertGreater(
os.stat(file_path).st_mtime,
start_moment,
msg="File was created before the test was run, indicating it predates this test."
)

def test_year_based_bulk_file(self):
"""Do we generate and provide year-based bulk files properly?"""
r = self.client.get('/api/bulk/%s/test.xml.gz' % self.year)
self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" %
(r.status_code, r.content))

def test_month_based_bulk_file(self):
"""Do we generate and provide month-based bulk files properly?"""
r = self.client.get('/api/bulk/%s/%s/test.xml.gz' % (self.year, self.month))
self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" %
(r.status_code, r.content))

def test_day_based_bulk_file_twice(self):
"""Do we generate and provide day-based bulk files properly?
When they come from the cache the second time, does it still work?
"""
r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz' % (self.year, self.month, self.day))
self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" %
(r.status_code, r.content))
# 2x!
r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz' % (self.year, self.month, self.day))
self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" %
(r.status_code, r.content))

def test_month_not_yet_complete(self):
"""A limitation is that we do not serve files until the month is complete.
Do we throw the proper error when this is the case?
"""
r = self.client.get('/api/bulk/%s/%s/test.xml.gz' % (self.now.year, self.now.month))
self.assertEqual(r.status_code, 400)
self.assertIn('partially in the future', r.content, msg="Did not get correct error message. "
"Instead got: %s" % r.content)

def test_month_completely_in_the_future(self):
"""Do we throw an error when a date in the future is requested?"""
r = self.client.get('/api/bulk/%s/%s/test.xml.gz' % (self.now.year + 1, self.now.month))
self.assertEqual(r.status_code, 400)
self.assertIn('date is in the future', r.content, msg="Did not get correct error message. "
"Instead got: %s" % r.content)

def test_no_data_for_time_period(self):
"""If we lack data for a period of time, do we throw an error?"""
r = self.client.get('/api/bulk/1982/06/09/test.xml.gz')
self.assertEqual(r.status_code, 404)
self.assertIn('not have any data', r.content, msg="Did not get correct error message. "
"Instead got: %s" % r.content)
    def test_make_all_bulk_files(self):
        """Can we successfully generate all bulk files?"""
        Command().do_everything()
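
The replacement test drives the command class directly. In normal operation the same code path is reached through Django's management-command machinery; a sketch of the equivalent invocation (assuming standard command discovery from the module name) would be:

from django.core.management import call_command

# Equivalent to running `python manage.py cl_make_bulk_data` from the shell;
# Django derives the command name from the module under management/commands/.
call_command('cl_make_bulk_data')
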
19 changes: 7 additions & 12 deletions alert/api/urls.py
@@ -1,27 +1,22 @@
from alert.api.views import (
court_index, documentation_index, dump_index, rest_index,
serve_or_gen_dump, serve_pagerank_file, coverage_data
serve_pagerank_file, coverage_data
)

from alert.urls import pacer_codes
from django.conf.urls import patterns

urlpatterns = patterns('',
# Documentation
(r'^api/$', documentation_index),
(r'^api/jurisdictions/$', court_index),
(r'^api/rest-info/$', rest_index),
(r'^api/bulk-info/$', dump_index),
(r'^api/bulk/(?P<court>all|%s)\.xml\.gz$' % "|".join(pacer_codes),
serve_or_gen_dump),
(r'^api/bulk/(?P<year>\d{4})/(?P<court>all|%s)\.xml\.gz$' % "|".join(
pacer_codes),
serve_or_gen_dump),
(r'^api/bulk/(?P<year>\d{4})/(?P<month>\d{1,2})/(?P<court>all|%s)\.xml\.gz$' % "|".join(
pacer_codes),
serve_or_gen_dump),
(r'^api/bulk/(?P<year>\d{4})/(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<court>all|%s)\.xml\.gz$' % "|".join(
pacer_codes),
serve_or_gen_dump),

# Pagerank file
(r'^api/bulk/external_pagerank/$', serve_pagerank_file),

# Coverage API
(r'^api/rest/v[12]/coverage/(all|%s)/' % '|'.join(pacer_codes),
coverage_data),
)
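
With serve_or_gen_dump and the date-based bulk routes removed, the pagerank and coverage patterns are the only bulk-related entries left in this URL conf. A quick resolver check for one of the retained routes might look like this (a sketch; the path is taken from the pagerank pattern above and assumes the project URL conf is loaded):

from django.core.urlresolvers import resolve

# Should resolve to serve_pagerank_file per the retained pattern above.
match = resolve('/api/bulk/external_pagerank/')
print match.func.__name__
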
100 changes: 14 additions & 86 deletions alert/api/views.py
@@ -1,21 +1,15 @@
import json
import os
from django.conf import settings

from alert import settings
from alert.lib import search_utils, magic
from alert.lib.db_tools import queryset_generator_by_date
from alert.lib.dump_lib import make_dump_file
from alert.lib.dump_lib import get_date_range
from alert.lib import magic
from alert.lib.filesize import size
from alert.lib.sunburnt import sunburnt
from alert.search.models import Court, Document
from alert.search.models import Court
from alert.stats import tally_stat

from django.http import HttpResponseBadRequest, Http404, HttpResponse, HttpResponseRedirect
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.utils.timezone import now
from lib import search_utils
from lib.sunburnt import sunburnt

@@ -94,85 +88,19 @@ def dump_index(request):
courts = make_court_variable()
court_count = len(courts)
try:
dump_size = size(os.path.getsize(os.path.join(settings.DUMP_DIR, 'all.xml.gz')))
dump_size = size(os.path.getsize(
os.path.join(settings.DUMP_DIR, 'all.xml.gz')))
except os.error:
# Happens when the file is inaccessible or doesn't exist. An estimate.
dump_size = '13GB'
return render_to_response('api/dumps.html',
{'court_count': court_count,
'courts': courts,
'dump_size': dump_size,
'private': False},
RequestContext(request))


def serve_or_gen_dump(request, court, year=None, month=None, day=None):
"""Serves the dump file to the user, generating it if needed."""
if year is None:
if court != 'all':
# Sanity check
return HttpResponseBadRequest('<h2>Error 400: Complete dumps are '
'not available for individual courts. Try using "all" for '
'your court ID instead.</h2>')
else:
# Serve the dump for all cases.
tally_stat('bulk_data.served.all')
return HttpResponseRedirect('/dumps/all.xml.gz')

else:
# Date-based dump
start_date, end_date, annual, monthly, daily = get_date_range(year, month, day)

today = now().date()
# Ensure that it's a valid request.
if (today < end_date) and (today < start_date):
# It's the future. They fail.
return HttpResponseBadRequest('<h2>Error 400: Requested date is in the future. Please try again then.</h2>')
elif today <= end_date:
# Some of the data is in the past, some could be in the future.
return HttpResponseBadRequest('<h2>Error 400: Requested date is partially in the future. Please try again '
'then.</h2>')

filename = court + '.xml'
if daily:
filepath = os.path.join(year, month, day)
elif monthly:
filepath = os.path.join(year, month)
elif annual:
filepath = os.path.join(year)

path_from_root = os.path.join(settings.DUMP_DIR, filepath)

# See if we already have it on disk.
try:
_ = open(os.path.join(path_from_root, filename + '.gz'), 'rb')
tally_stat('bulk_data.served.by_date')
return HttpResponseRedirect(os.path.join('/dumps', filepath, filename + '.gz'))
except IOError:
# Time-based dump
if court == 'all':
# dump everything; disable default ordering
qs = Document.objects.all().order_by()
else:
# dump just the requested court; disable default ordering
qs = Document.objects.filter(docket__court=court).order_by()

# check if there are any documents at all
dump_has_docs = qs.filter(date_filed__gte=start_date,
date_filed__lte=end_date).exists()
if dump_has_docs:
docs_to_dump = queryset_generator_by_date(qs,
'date_filed',
start_date,
end_date)

make_dump_file(docs_to_dump, path_from_root, filename)
else:
return HttpResponseBadRequest('<h2>Error 404: We do not have any data for this time period.</h2>',
status=404)

tally_stat('bulk_data.served.by_date')
return HttpResponseRedirect('%s.gz' % os.path.join('/dumps', filepath, filename))
dump_size = 'about 13GB'
return render_to_response(
'api/bulk-data.html',
{'court_count': court_count,
'courts': courts,
'dump_size': dump_size,
'private': False},
RequestContext(request)
)


def serve_pagerank_file(request):