From 52e8eff985fdf75612837cef4d9ef55ad60f29ad Mon Sep 17 00:00:00 2001 From: Michael Lissner Date: Mon, 22 Sep 2014 18:10:30 -0700 Subject: [PATCH] Finishes the re-write for bulk files, closing #285. --- alert/api/management/__init__.py | 0 alert/api/management/commands/__init__.py | 0 .../management/commands/cl_make_bulk_data.py | 115 ++++++++ alert/api/tests.py | 100 +------ alert/api/urls.py | 19 +- alert/api/views.py | 100 +------ alert/assets/templates/api/bulk-data.html | 199 +++++++++++++ alert/assets/templates/api/dumps.html | 151 ---------- alert/audio/urls.py | 8 +- .../commands/cl_send_donation_reminders.py | 2 - alert/dump_all_cases.py | 20 -- alert/lib/dump_lib.py | 264 ------------------ alert/lib/search_utils.py | 10 +- alert/lib/timer.py | 18 +- alert/scrapers/tests.py | 21 +- alert/search/api.py | 128 +++++---- alert/search/api2.py | 179 ++++++++---- alert/search/forms.py | 13 +- alert/search/urls.py | 11 +- alert/search/views.py | 10 +- alert/settings/10-public.py | 2 +- alert/urls.py | 1 - apache/courtlistener.com.conf | 2 +- upgrade.txt | 33 ++- 24 files changed, 610 insertions(+), 796 deletions(-) create mode 100644 alert/api/management/__init__.py create mode 100644 alert/api/management/commands/__init__.py create mode 100644 alert/api/management/commands/cl_make_bulk_data.py create mode 100644 alert/assets/templates/api/bulk-data.html delete mode 100644 alert/assets/templates/api/dumps.html delete mode 100644 alert/dump_all_cases.py delete mode 100644 alert/lib/dump_lib.py diff --git a/alert/api/management/__init__.py b/alert/api/management/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/alert/api/management/commands/__init__.py b/alert/api/management/commands/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/alert/api/management/commands/cl_make_bulk_data.py b/alert/api/management/commands/cl_make_bulk_data.py new file mode 100644 index 0000000000..aac5810a69 --- /dev/null +++ b/alert/api/management/commands/cl_make_bulk_data.py @@ -0,0 +1,115 @@ +import StringIO +import os +import shutil +import tarfile +import time +import errno + +from alert.lib.db_tools import queryset_generator +from alert.lib.timer import print_timing +from alert.search.models import Court, Document +from django.core.management import BaseCommand +from django.conf import settings +from audio.models import Audio + + +def mkdir_p(path): + """Makes a directory path, but doesn't crash if the path already exists.""" + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + + +class Command(BaseCommand): + help = 'Create the bulk files for all jurisdictions and for "all".' 
+ + def handle(self, *args, **options): + self.do_everything() + + @print_timing + def do_everything(self): + """We can't wrap the handle() function, but we can wrap this one.""" + from alert.search import api2 + self.stdout.write('Starting bulk file creation...\n') + arg_tuples = ( + ('opinion', Document, api2.DocumentResource), + ('oral-argument', Audio, api2.OralArgumentResource), + ) + for obj_type_str, obj_type, api_resource_obj in arg_tuples: + self.make_archive(obj_type_str, obj_type, api_resource_obj) + self.swap_archives(obj_type_str) + self.stdout.write('Done.\n\n') + + def swap_archives(self, obj_type_str): + """Swap out new archives for the old.""" + self.stdout.write(' - Swapping in the new %s archives...\n' + % obj_type_str) + mkdir_p(os.path.join(settings.DUMP_DIR, '%s' % obj_type_str)) + for f in os.listdir('/tmp/bulk/%s' % obj_type_str): + shutil.move('/tmp/bulk/%s/%s' % (obj_type_str, f), + os.path.join(settings.DUMP_DIR, '%ss' % obj_type_str)) + + def make_archive(self, obj_type_str, obj_type, api_resource_obj): + """Generate compressed archives containing the contents of an object + database. + + There are a few tricks to this, but the main one is that each item in + the database goes into two files, all.tar.gz and {court}.tar.gz. This + means that if we want to avoid iterating the database once per file, + we need to generate all 350+ jurisdiction files simultaneously. + + We do this by making a dict of open file handles and adding each item + to the correct two files: The all.tar.gz file and the {court}.tar.gz + file. + """ + courts = Court.objects.all() + self.stdout.write(' - Creating %s bulk %s files ' + 'simultaneously...\n' % (len(courts), obj_type_str)) + + mkdir_p('/tmp/bulk/%s' % obj_type_str) + + # Open a gzip'ed tar file for every court + tar_files = {} + for court in courts: + tar_files[court.pk] = tarfile.open( + '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk), + mode='w:gz' + ) + tar_files['all'] = tarfile.open( + '/tmp/bulk/%s/all.tar.gz' % obj_type_str, + mode='w:gz' + ) + + # Make the archives + qs = obj_type.objects.all() + item_resource = api_resource_obj() + item_list = queryset_generator(qs) + for item in item_list: + json_str = item_resource.serialize( + None, + item_resource.full_dehydrate( + item_resource.build_bundle(obj=item)), + 'application/json', + ).encode('utf-8') + + # Add the json str to the two tarballs + tarinfo = tarfile.TarInfo("%s.json" % item.pk) + tarinfo.size = len(json_str) + tarinfo.mtime = time.mktime(item.date_modified.timetuple()) + tarinfo.type = tarfile.REGTYPE + + tar_files[item.docket.court_id].addfile( + tarinfo, StringIO.StringIO(json_str)) + tar_files['all'].addfile( + tarinfo, StringIO.StringIO(json_str)) + + # Close off all the gzip'ed tar files + for court in courts: + tar_files[court.pk].close() + tar_files['all'].close() + + self.stdout.write(' - all %s bulk files created.\n' % obj_type_str) diff --git a/alert/api/tests.py b/alert/api/tests.py index bb8bafc978..0facc0fb07 100644 --- a/alert/api/tests.py +++ b/alert/api/tests.py @@ -1,11 +1,8 @@ from datetime import timedelta -import os -import time -from django.conf import settings from django.test import TestCase from django.utils.timezone import now -from alert.lib.dump_lib import make_dump_file from alert.search.models import Docket, Citation, Court, Document +from api.management.commands.cl_make_bulk_data import Command class BulkDataTest(TestCase): @@ -28,98 +25,9 @@ def setUp(self): ) self.doc.save(index=False) - self.day = last_month.day - self.month 
= last_month.month - self.year = last_month.year - self.now = now().date() - def tearDown(self): self.doc.delete() - def test_no_year_provided_with_court_provided(self): - """When a user doesn't provide a year and wants everything for a - particular court, do we properly throw a 400 error? - """ - r = self.client.get('/api/bulk/test.xml.gz') - self.assertEqual( - r.status_code, - 400, - msg="Should have gotten HTTP code 400. Instead got: %s" % r.status_code - ) - - def test_no_year_provided_all_courts_requested(self): - """If a user requests everything, do we give it to them?""" - start_moment = time.time() - qs = Document.objects.all() - filename = 'all.xml' - make_dump_file(qs, settings.DUMP_DIR, filename) - r = self.client.get('/api/bulk/all.xml.gz') - - # Normally, the redirect hands the user off to Apache, which serves the file. - # Since we don't always have apache set up, we make sure we get redirected and - # we check that the file exists on disk with a non-zero filesize. - self.assertEqual( - r.status_code, - 302, - msg="Redirection to bulk file failed." - ) - file_path = os.path.join(settings.DUMP_DIR, filename + '.gz') - self.assertGreater( - os.path.getsize(file_path), - 0, - msg="Bulk data file does not have content." - ) - self.assertGreater( - os.stat(file_path).st_mtime, - start_moment, - msg="File was created before the test was run, indicating it predates this test." - ) - - def test_year_based_bulk_file(self): - """Do we generate and provide year-based bulk files properly?""" - r = self.client.get('/api/bulk/%s/test.xml.gz' % self.year) - self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" % - (r.status_code, r.content)) - - def test_month_based_bulk_file(self): - """Do we generate and provide month-based bulk files properly?""" - r = self.client.get('/api/bulk/%s/%s/test.xml.gz' % (self.year, self.month)) - self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" % - (r.status_code, r.content)) - - def test_day_based_bulk_file_twice(self): - """Do we generate and provide day-based bulk files properly? - - When they come from the cache the second time, does it still work? - """ - r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz' % (self.year, self.month, self.day)) - self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" % - (r.status_code, r.content)) - # 2x! - r = self.client.get('/api/bulk/%s/%s/%s/test.xml.gz' % (self.year, self.month, self.day)) - self.assertEqual(r.status_code, 302, msg="Got status code of %s with content: %s" % - (r.status_code, r.content)) - - def test_month_not_yet_complete(self): - """A limitation is that we do not serve files until the month is complete. - - Do we throw the proper error when this is the case? - """ - r = self.client.get('/api/bulk/%s/%s/test.xml.gz' % (self.now.year, self.now.month)) - self.assertEqual(r.status_code, 400) - self.assertIn('partially in the future', r.content, msg="Did not get correct error message. " - "Instead got: %s" % r.content) - - def test_month_completely_in_the_future(self): - """Do we throw an error when a date in the future is requested?""" - r = self.client.get('/api/bulk/%s/%s/test.xml.gz' % (self.now.year + 1, self.now.month)) - self.assertEqual(r.status_code, 400) - self.assertIn('date is in the future', r.content, msg="Did not get correct error message. 
" - "Instead got: %s" % r.content) - - def test_no_data_for_time_period(self): - """If we lack data for a period of time, do we throw an error?""" - r = self.client.get('/api/bulk/1982/06/09/test.xml.gz') - self.assertEqual(r.status_code, 404) - self.assertIn('not have any data', r.content, msg="Did not get correct error message. " - "Instead got: %s" % r.content) + def test_make_all_bulk_files(self): + """Can we successfully generate all bulk files?""" + Command.do_everything() diff --git a/alert/api/urls.py b/alert/api/urls.py index e48b748917..66467b8303 100644 --- a/alert/api/urls.py +++ b/alert/api/urls.py @@ -1,27 +1,22 @@ from alert.api.views import ( court_index, documentation_index, dump_index, rest_index, - serve_or_gen_dump, serve_pagerank_file, coverage_data + serve_pagerank_file, coverage_data ) + from alert.urls import pacer_codes from django.conf.urls import patterns urlpatterns = patterns('', + # Documentation (r'^api/$', documentation_index), (r'^api/jurisdictions/$', court_index), (r'^api/rest-info/$', rest_index), (r'^api/bulk-info/$', dump_index), - (r'^api/bulk/(?Pall|%s)\.xml\.gz$' % "|".join(pacer_codes), - serve_or_gen_dump), - (r'^api/bulk/(?P\d{4})/(?Pall|%s)\.xml\.gz$' % "|".join( - pacer_codes), - serve_or_gen_dump), - (r'^api/bulk/(?P\d{4})/(?P\d{1,2})/(?Pall|%s)\.xml\.gz$' % "|".join( - pacer_codes), - serve_or_gen_dump), - (r'^api/bulk/(?P\d{4})/(?P\d{1,2})/(?P\d{1,2})/(?Pall|%s)\.xml\.gz$' % "|".join( - pacer_codes), - serve_or_gen_dump), + + # Pagerank file (r'^api/bulk/external_pagerank/$', serve_pagerank_file), + + # Coverage API (r'^api/rest/v[12]/coverage/(all|%s)/' % '|'.join(pacer_codes), coverage_data), ) diff --git a/alert/api/views.py b/alert/api/views.py index ce726ff9fc..beecfb1f2c 100644 --- a/alert/api/views.py +++ b/alert/api/views.py @@ -1,21 +1,15 @@ import json import os -from django.conf import settings from alert import settings -from alert.lib import search_utils, magic -from alert.lib.db_tools import queryset_generator_by_date -from alert.lib.dump_lib import make_dump_file -from alert.lib.dump_lib import get_date_range +from alert.lib import magic from alert.lib.filesize import size -from alert.lib.sunburnt import sunburnt -from alert.search.models import Court, Document +from alert.search.models import Court from alert.stats import tally_stat -from django.http import HttpResponseBadRequest, Http404, HttpResponse, HttpResponseRedirect +from django.http import Http404, HttpResponse, HttpResponseRedirect from django.shortcuts import render_to_response from django.template import RequestContext -from django.utils.timezone import now from lib import search_utils from lib.sunburnt import sunburnt @@ -94,85 +88,19 @@ def dump_index(request): courts = make_court_variable() court_count = len(courts) try: - dump_size = size(os.path.getsize(os.path.join(settings.DUMP_DIR, 'all.xml.gz'))) + dump_size = size(os.path.getsize( + os.path.join(settings.DUMP_DIR, 'all.xml.gz'))) except os.error: # Happens when the file is inaccessible or doesn't exist. An estimate. - dump_size = '13GB' - return render_to_response('api/dumps.html', - {'court_count': court_count, - 'courts': courts, - 'dump_size': dump_size, - 'private': False}, - RequestContext(request)) - - -def serve_or_gen_dump(request, court, year=None, month=None, day=None): - """Serves the dump file to the user, generating it if needed.""" - if year is None: - if court != 'all': - # Sanity check - return HttpResponseBadRequest('
Error 400: Complete dumps are not available for individual courts. Try using "all" for your court ID instead.
') - else: - # Serve the dump for all cases. - tally_stat('bulk_data.served.all') - return HttpResponseRedirect('/dumps/all.xml.gz') - - else: - # Date-based dump - start_date, end_date, annual, monthly, daily = get_date_range(year, month, day) - - today = now().date() - # Ensure that it's a valid request. - if (today < end_date) and (today < start_date): - # It's the future. They fail. - return HttpResponseBadRequest('
Error 400: Requested date is in the future. Please try again then.
') - elif today <= end_date: - # Some of the data is in the past, some could be in the future. - return HttpResponseBadRequest('
Error 400: Requested date is partially in the future. Please try again then.
') - - filename = court + '.xml' - if daily: - filepath = os.path.join(year, month, day) - elif monthly: - filepath = os.path.join(year, month) - elif annual: - filepath = os.path.join(year) - - path_from_root = os.path.join(settings.DUMP_DIR, filepath) - - # See if we already have it on disk. - try: - _ = open(os.path.join(path_from_root, filename + '.gz'), 'rb') - tally_stat('bulk_data.served.by_date') - return HttpResponseRedirect(os.path.join('/dumps', filepath, filename + '.gz')) - except IOError: - # Time-based dump - if court == 'all': - # dump everything; disable default ordering - qs = Document.objects.all().order_by() - else: - # dump just the requested court; disable default ordering - qs = Document.objects.filter(docket__court=court).order_by() - - # check if there are any documents at all - dump_has_docs = qs.filter(date_filed__gte=start_date, - date_filed__lte=end_date).exists() - if dump_has_docs: - docs_to_dump = queryset_generator_by_date(qs, - 'date_filed', - start_date, - end_date) - - make_dump_file(docs_to_dump, path_from_root, filename) - else: - return HttpResponseBadRequest('
Error 404: We do not have any data for this time period.
', - status=404) - - tally_stat('bulk_data.served.by_date') - return HttpResponseRedirect('%s.gz' % os.path.join('/dumps', filepath, filename)) + dump_size = 'about 13GB' + return render_to_response( + 'api/bulk-data.html', + {'court_count': court_count, + 'courts': courts, + 'dump_size': dump_size, + 'private': False}, + RequestContext(request) + ) def serve_pagerank_file(request): diff --git a/alert/assets/templates/api/bulk-data.html b/alert/assets/templates/api/bulk-data.html new file mode 100644 index 0000000000..2d653bd58d --- /dev/null +++ b/alert/assets/templates/api/bulk-data.html @@ -0,0 +1,199 @@ +{% extends "base.html" %} + +{% block title %}Bulk Data - CourtListener.com{% endblock %} +{% block search-form %}{% endblock %} + +{% block sidebar %}{% endblock %} + +{% block content %} +
+ Bulk Data
For hackers, legal analysts and anybody else that might want + them, we provide bulk files containing all of our data. Several + types of files are available as listed below, but in general the + files that are available correspond to the major types of data we + have in our database (presently, Opinions and Oral Arguments, but + we expect this to slowly expand). +
+ The CiteGeist Bulk Data File
+ On the 15th of each month, we re-generate the + + CiteGeist scores + + for the entire collection. Since a single new citation can have a + ripple effect across the entire citation network, we store these + values in a flat file rather than in our database. This saves us + from having to update millions of records every month. +
+ This file can be obtained with:
+
+     curl -O https://www.courtlistener.com/api/bulk/external_pagerank/
+
When inspecting this file, you will find two columns of data. The + first column corresponds to the ID numbers of the items in our + opinion database, and the second value corresponds to the CiteGeist + score for that item. +
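A minimal Python sketch for loading those two columns into a dict, assuming the values are separated by whitespace or a comma and that the file has already been downloaded locally (verify both against an actual copy of the file):

    import re

    scores = {}
    # 'external_pagerank' is a placeholder; use whatever name your download was saved under.
    with open('external_pagerank') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            item_id, score = re.split(r'[,\s]+', line)[:2]
            scores[int(item_id)] = float(score)

    print 'Loaded %d CiteGeist scores.' % len(scores)

Each item ID can then be matched against the opinion endpoint of the REST API or against the IDs used for the bulk file members described below.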
+ Bulk Data Files for Opinions and Oral Arguments
+ Two types of bulk file are available for each type of content in + CourtListener. The first is a bulk file containing everything in + the system for that type of data. The second is a + jurisdiction-based file that only has the content for a certain + jurisdiction. In general, the scheme for the bulk files is as + follows: +
+
+     https://www.courtlistener.com/api/bulk-data/$data-type/$jurisdiction.tar.gz
+
+ Some examples:
+
+  - All opinions from the First Circuit of Appeals (ca1):
+    https://www.courtlistener.com/api/bulk-data/opinion/ca1.tar.gz
+  - All oral arguments from the Second Circuit of Appeals (ca2):
+    https://www.courtlistener.com/api/bulk-data/oral-argument/ca2.tar.gz
+  - All opinions from all jurisdictions:
+    https://www.courtlistener.com/api/bulk-data/opinion/all.tar.gz
+
A list of all current jurisdictions is on the right and we regularly + add new jurisdictions. To monitor for new jurisdictions, you may + want to look at the Jurisdiction + endpoint of the REST API. +
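A rough sketch of that kind of monitoring follows; the v2 endpoint path, the use of HTTP Basic credentials, and the tastypie-style "objects" envelope are assumptions that should be checked against the REST API documentation:

    import base64
    import json
    import urllib2

    # Assumed path; see the REST API documentation for the authoritative URL.
    url = 'https://www.courtlistener.com/api/rest/v2/jurisdiction/'
    request = urllib2.Request(url)
    # The API may require HTTP Basic authentication; substitute real credentials.
    request.add_header('Authorization',
                       'Basic ' + base64.b64encode('username:password'))
    courts = json.loads(urllib2.urlopen(request).read())

    # Tastypie list endpoints normally wrap their results in an "objects" list.
    for court in courts.get('objects', []):
        print court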
+ What To Expect in the Bulk Files
+ These files are generated using the REST + API and follow the schemas described there. The files inside + the tar archives have names corresponding to the ID of each item, + and are formatted as JSON. If you wish to see a sample file, we + advise selecting a small or secretive jurisdiction (such as the + FISA court) and using that to get an idea of what the bulk files + contain. +
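For instance, a downloaded jurisdiction archive can be walked with Python's tarfile module. This sketch assumes an archive saved locally and the one-JSON-member-per-item layout used by the generation command elsewhere in this change:

    import json
    import tarfile

    # e.g. previously fetched with:
    #   curl -O https://www.courtlistener.com/api/bulk-data/opinion/ca1.tar.gz
    archive = tarfile.open('ca1.tar.gz', mode='r:gz')
    try:
        for member in archive.getmembers():
            # Each member is named "<id>.json" and holds one item as serialized
            # by the REST API resources.
            item = json.load(archive.extractfile(member))
            print member.name, item.get('absolute_url')
    finally:
        archive.close()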
+ Generation Times
+ As can be seen on the public CourtListener + maintenance calendar, + bulk data files are regenerated on the last day of every month + beginning at 3AM PST. Generation can take many hours, but in + general is expected to conclude before the 1st of each month. On + the last day of the month, we do not guarantee that you will get + either the new or old archives, as archives are updated in place as + their generation completes. In other words, on the last day of the + month, do not count on getting up-to-date information until the + next day. +
+ Donations
Free Law Project has been + providing bulk data for many years but is opposed to charging for + public domain data. However, if you find these files valuable to + your work and are able, we ask that you seriously consider how + much they might cost otherwise and consider making a + donation in a similar amount. Free Law + Project is a California non-profit and we rely on your support to + survive. +
+ Adding Features and Fixing Bugs
Like all Free Law Project initiatives, CourtListener is an open + source project. If you are a developer and you notice bugs or + missing features, we enthusiastically welcome your contributions + on + Github. +
+ Unfortunately, there are always more bugs than time.
+
+ Obsoleted Bulk Data APIs
In the past, bulk data files were available by day, month, or year + for every jurisdiction, and a single file was available containing + all data. Without community objection, these APIs were + sunsetted + in the fall of 2014. +
+{% endblock %} diff --git a/alert/assets/templates/api/dumps.html b/alert/assets/templates/api/dumps.html deleted file mode 100644 index 7331c25a89..0000000000 --- a/alert/assets/templates/api/dumps.html +++ /dev/null @@ -1,151 +0,0 @@ -{% extends "base.html" %} - -{% block title %}Bulk Data - CourtListener.com{% endblock %} -{% block search-form %}{% endblock %} - -{% block sidebar %}{% endblock %} - -{% block content %} -
- Bulk Data
For hackers and legal analysts, we provide bulk files containing our data. Two types of files are available. - The first is a single file containing the CiteGeist scores for all items in our collection. The second are - XML files containing large sets of data. -
- The CiteGeist Bulk Data File
- On the 15th of each month, we re-generate the - - CiteGeist scores - - for the entire collection. Since a single new citation can have a ripple effect across the entire citation - network, we store these values in a flat file rather than in our database. This saves us from having to - update millions of records every month. -
- This file can be obtained with:
-
-     curl -O https://www.courtlistener.com/api/bulk/external_pagerank/
-
- Bulk Data Files
- XML Bulk data files are available by the year, month and day. For each time period, individual bulk - data files are available for each court, as well as a single file containing data from - all courts. We generate these files the first time they are requested, so some files may be very fast, - while others may need to be created for you, which can take a moment. -
The XML information in the files should be self-explanatory, but we welcome discussion - in - our developer forum (preferred) or via our contact page. -
- All time stamps are Pacific Standard Time.
-
- Requesting the Bulk Data Files
Annual, monthly or daily bulk data files can be accessed at - https://www.courtlistener.com/api/bulk/year/month/day/court.xml.gz.
- For example, let's look at ways to access the First Circuit of Appeals (ca1):
If you would like all cases for a given time period, - you can use all for the court name. For example, - - https://www.courtlistener.com/api/bulk/2009/06/09/all.xml.gz - returns all of the cases from June 9, 2009 (across all courts).
We also provide a bulk data file of all cases up through the last day - of the previous month. To obtain this file, simply omit the date - in your query, and use all for the court name: - https://www.courtlistener.com/api/bulk/all.xml.gz. - This file is very large ({{ dump_size }}). It's currently not possible to obtain complete - data files for individual courts, due the processing required to generate such files.
On the backend, bulk data files are generated when a GET request is placed on an /api/bulk/ endpoint - and once the file is generated, you are redirected to its location on our server's disk. - If the file was previously generated, you will be redirected immediately to a cached copy. - This architecture is necessary on our backend and means that consumers of this API will need - to automatically follow 302 redirects. If you are using cURL, this can be accomplished with - the -L flag, and saving binaries can be done with the -O flag. - Thus a complete GET request might look like:
curl -L -O https://www.courtlistener.com/api/bulk/2009/06/09/ca9.xml.gz
- If you are interested in maintaining your system in sync with our data, you should look at - our REST API, which provides resources ordered by modification date. -
- Note that prior to November, 2013 these endpoints were previously located at /dump-api/. The - old location will redirect you as necessary, but note that it will eventually go away. -
-{% endblock %} diff --git a/alert/audio/urls.py b/alert/audio/urls.py index 22427d96af..c2a267df4a 100644 --- a/alert/audio/urls.py +++ b/alert/audio/urls.py @@ -1,5 +1,3 @@ -from alert.audio.feeds import AllJurisdictionsPodcast, JurisdictionPodcast, \ - SearchPodcast from alert.audio.views import view_audio_file from alert.audio.sitemap import oral_argument_sitemap_maker from alert.urls import pacer_codes @@ -11,9 +9,9 @@ # Podcasts (r'^podcast/court/(?P' + '|'.join(pacer_codes) + ')/$', - JurisdictionPodcast()), - (r'^podcast/court/all/$', AllJurisdictionsPodcast()), - (r'^podcast/(search)/', SearchPodcast()), + 'JurisdictionPodcast()'), + (r'^podcast/court/all/$', 'AllJurisdictionsPodcast()'), + (r'^podcast/(search)/', 'SearchPodcast()'), # Sitemap (r'^sitemap-oral-arguments\.xml', oral_argument_sitemap_maker), diff --git a/alert/donate/management/commands/cl_send_donation_reminders.py b/alert/donate/management/commands/cl_send_donation_reminders.py index cc248271a0..53797c3d53 100644 --- a/alert/donate/management/commands/cl_send_donation_reminders.py +++ b/alert/donate/management/commands/cl_send_donation_reminders.py @@ -2,12 +2,10 @@ from django.core.management.base import BaseCommand from django.db.models import Sum from django.template import loader, Context -from optparse import make_option from django.utils.timezone import now from alert.search.models import Document, Court from alert.stats import Stat from alert.userHandling.models import UserProfile -from datetime import date from datetime import timedelta diff --git a/alert/dump_all_cases.py b/alert/dump_all_cases.py deleted file mode 100644 index 03fa3a649d..0000000000 --- a/alert/dump_all_cases.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import sys - -execfile('/etc/courtlistener') -sys.path.append(INSTALL_ROOT) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings") - -from alert.lib.dump_lib import dump_it_all - - -def main(): - """ - A simple function that dumps all cases to a single dump file. - """ - dump_it_all() - - exit(0) - -if __name__ == '__main__': - main() diff --git a/alert/lib/dump_lib.py b/alert/lib/dump_lib.py deleted file mode 100644 index c60ed2c672..0000000000 --- a/alert/lib/dump_lib.py +++ /dev/null @@ -1,264 +0,0 @@ -import calendar -import gzip -import time -import os - -from datetime import datetime, date -from django.utils.timezone import utc, now -from django.conf import settings -from lxml import etree -from alert.lib.db_tools import queryset_generator_by_date -from alert.search.models import Document - - -class myGzipFile(gzip.GzipFile): - """Backports Python 2.7 functionality into 2.6. - - In order to use the 'with syntax' below, I need to subclass the gzip - library here. Once all of the machines are running Python 2.7, this class - can be removed, and the 'with' code below can simply reference the gzip - class rather than this one. - - This line of code worked in 2.7: - with gzip.open(filename, mode='wb') as z_file: - """ - def __enter__(self): - if self.fileobj is None: - raise ValueError("I/O operation on closed GzipFile object") - return self - - def __exit__(self, *args): - self.close() - - -def make_dump_file(docs_to_dump, path_from_root, filename): - # This var is needed to clear out null characters and control characters - # (skipping newlines) - null_map = dict.fromkeys(range(0, 10) + range(11, 13) + range(14, 32)) - - temp_dir = str(time.time()) - - try: - os.makedirs(os.path.join(path_from_root, temp_dir)) - except OSError: - # Path exists. 
- pass - - with myGzipFile(os.path.join(path_from_root, temp_dir, filename), - mode='wb') as z_file: - - z_file.write('\n' + - '\n') - - for doc in docs_to_dump: - row = etree.Element("opinion") - try: - # These are required by the DB, and thus are safe - # without the try/except blocks - row.set('id', str(doc.pk)) - row.set('path', doc.get_absolute_url()) - row.set('sha1', doc.sha1) - row.set('court', doc.docket.court.full_name) - try: - row.set('download_url', doc.download_url) - except: - pass - row.set('time_retrieved', str(doc.time_retrieved)) - # All are wrapped in try/except b/c the value might not be found. - try: - row.set('date_filed', str(doc.date_filed)) - except: - pass - try: - row.set('precedential_status', doc.precedential_status) - except: - pass - try: - row.set('local_path', str(doc.local_path)) - except: - pass - try: - row.set('docket_number', doc.citation.docket_number) - except: - pass - try: - row.set('federal_cite_one', doc.citation.federal_cite_one) - except: - pass - try: - row.set('federal_cite_two', doc.citation.federal_cite_two) - except: - pass - try: - row.set('federal_cite_three', doc.citation.federal_cite_three) - except: - pass - try: - row.set('state_cite_one', doc.citation.state_cite_one) - except: - pass - try: - row.set('state_cite_two', doc.citation.state_cite_two) - except: - pass - try: - row.set('state_cite_three', doc.citation.state_cite_three) - except: - pass - try: - row.set('state_cite_regional', doc.citation.state_cite_regional) - except: - pass - try: - row.set('specialty_cite_one', doc.citation.specialty_cite_one) - except: - pass - try: - row.set('scotus_early_cite', doc.citation.scotus_early_cite) - except: - pass - try: - row.set('lexis_cite', doc.citation.lexis_cite) - except: - pass - try: - row.set('westlaw_cite', doc.citation.westlaw_cite) - except: - pass - try: - row.set('neutral_cite', doc.citation.neutral_cite) - except: - pass - try: - row.set('case_name', doc.citation.case_name) - except: - pass - try: - row.set('judges', doc.judges) - except: - pass - try: - row.set('nature_of_suit', doc.nature_of_suit) - except: - pass - try: - row.set('source', doc.get_source_display()) - except: - pass - try: - row.set('blocked', str(doc.blocked)) - except: - pass - try: - row.set('date_blocked', str(doc.date_blocked)) - except: - pass - try: - row.set('extracted_by_ocr', str(doc.extracted_by_ocr)) - except: - pass - - ids = ','.join([str(pk) for pk in doc.citation.citing_opinions.all().values_list('pk', flat=True)]) - if len(ids) > 0: - row.set('cited_by', ids) - - # Gather the doc text - if doc.html_with_citations: - row.text = doc.html_with_citations.translate(null_map) - elif doc.html_lawbox: - row.text = doc.html_lawbox - elif doc.html: - row.text = doc.html - else: - row.text = doc.plain_text.translate(null_map) - except ValueError: - # Null byte found. Punt. - continue - - z_file.write(' %s\n' % etree.tostring(row).encode('utf-8')) - - # Close things off - z_file.write('') - - # Delete the old archive, then replace it with the new one. Deleting - # shouldn't necessary according to the Python documentation, but in testing - # I'm not seeing file clobbering happen. - try: - os.remove(os.path.join(path_from_root, filename)) - except OSError: - # The file doesn't exist yet. This should only really be triggered by - # the all_cases dumper. The others shouldn't get this far. 
- pass - - # Move the new file to the correct location - os.rename(os.path.join(path_from_root, temp_dir, filename), - os.path.join(path_from_root, filename) + '.gz') - - # Remove the directory, but only if it's empty. - os.rmdir(os.path.join(path_from_root, temp_dir)) - - return os.path.join(path_from_root, filename) - - -def dump_it_all(): - start_date = datetime(1754, 9, 1, tzinfo=utc) # First American case - end_date = now() - # Get the documents from the database. - qs = Document.objects.all() - docs_to_dump = queryset_generator_by_date( - qs, - 'date_filed', - start_date, - end_date - ) - - path_from_root = settings.DUMP_DIR - filename = 'all.xml' - make_dump_file(docs_to_dump, path_from_root, filename) - - -def get_date_range(year, month, day): - """ Create a date range to be queried. - - Given a year and optionally a month or day, return a date range. If only a - year is given, return start date of January 1, and end date of December - 31st. Do similarly if a year and month are supplied or if all three values - are provided. - """ - # Sort out the start dates - if month is None: - start_month = 1 - else: - start_month = int(month) - if day is None: - start_day = 1 - else: - start_day = int(day) - - start_year = int(year) - start_date = date(start_year, start_month, start_day) - - annual = False - monthly = False - daily = False - # Sort out the end dates - if day is None and month is None: - # it's an annual query - annual = True - end_month = 12 - end_day = 31 - elif day is None: - # it's a month query - monthly = True - end_month = int(month) - end_day = calendar.monthrange(int(year), end_month)[1] - else: - # all three values provided! - daily = True - end_month = int(month) - end_day = int(day) - - end_year = int(year) - end_date = date(end_year, end_month, end_day) - - return start_date, end_date, annual, monthly, daily diff --git a/alert/lib/search_utils.py b/alert/lib/search_utils.py index 5f295d1039..c7b90531a7 100644 --- a/alert/lib/search_utils.py +++ b/alert/lib/search_utils.py @@ -77,8 +77,8 @@ def make_stats_variable(solr_facet_values, search_form): return facets -def merge_form_with_courts(COURTS, search_form): - """Merges the COURTS dict with the values from the search form. +def merge_form_with_courts(courts, search_form): + """Merges the courts dict with the values from the search form. Final value is like (note that order is significant): courts = { @@ -122,10 +122,10 @@ def merge_form_with_courts(COURTS, search_form): for field in search_form: if no_facets_selected: - for court in COURTS: + for court in courts: court['checked'] = True else: - for court in COURTS: + for court in courts: # We're merging two lists, so we have to do a nested loop # to find the right value. if 'court_%s' % court['pk'] == field.html_name: @@ -143,7 +143,7 @@ def merge_form_with_courts(COURTS, search_form): b_bundle = [] state_bundle = [] state_bundles = [] - for court in COURTS: + for court in courts: if court['jurisdiction'] == 'F': court['tab'] = 'federal' elif court['jurisdiction'] == 'FD': diff --git a/alert/lib/timer.py b/alert/lib/timer.py index 09e34cbc1d..c87e8edd57 100644 --- a/alert/lib/timer.py +++ b/alert/lib/timer.py @@ -1,19 +1,3 @@ -# This software and any associated files are copyright 2010 Brian Carver and -# Michael Lissner. 
-# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - import time def print_timing(func): @@ -24,6 +8,6 @@ def wrapper(*arg): t1 = time.time() res = func(*arg) t2 = time.time() - print 'Completed in %0.1f seconds' % ((t2 - t1)) + print 'Completed in %0.1f seconds.' % ((t2 - t1)) return res return wrapper diff --git a/alert/scrapers/tests.py b/alert/scrapers/tests.py index 19a5a409e8..0af4d724ad 100644 --- a/alert/scrapers/tests.py +++ b/alert/scrapers/tests.py @@ -4,17 +4,22 @@ import time from django.utils.timezone import now from alert.audio.models import Audio -from alert.lib.solr_core_admin import create_solr_core, delete_solr_core, swap_solr_core +from alert.lib.solr_core_admin import create_solr_core, delete_solr_core, \ + swap_solr_core from alert.lib.string_utils import trunc from alert.lib import sunburnt from alert.scrapers.DupChecker import DupChecker from alert.scrapers.models import urlToHash, ErrorLog from alert.scrapers.management.commands.cl_scrape_opinions import get_extension -from alert.scrapers.management.commands.cl_scrape_opinions import Command as OpinionCommand -from alert.scrapers.management.commands.cl_scrape_oral_arguments import Command as OralArgCommand -from alert.scrapers.management.commands.cl_report_scrape_status import calculate_counts, tally_errors +from alert.scrapers.management.commands.cl_scrape_opinions import \ + Command as OpinionCommand +from alert.scrapers.management.commands.cl_scrape_oral_arguments import \ + Command as OralArgCommand +from alert.scrapers.management.commands.cl_report_scrape_status import \ + calculate_counts, tally_errors from alert.scrapers.tasks import extract_from_txt -from alert.scrapers.test_assets import test_opinion_scraper, test_oral_arg_scraper +from alert.scrapers.test_assets import test_opinion_scraper, \ + test_oral_arg_scraper from alert.search.models import Citation, Court, Document, Docket from alert import settings from celery.task.sets import subtask @@ -71,7 +76,8 @@ def test_parsing_xml_oral_arg_site_to_site_object(self): self.assertEqual(len(site.case_names), 2) def test_content_extraction(self): - """Do all of the supported mimetypes get extracted to text successfully, including OCR?""" + """Do all of the supported mimetypes get extracted to text + successfully, including OCR?""" site = test_opinion_scraper.Site().parse() test_strings = ['supreme', @@ -81,7 +87,8 @@ def test_content_extraction(self): 'indiana', 'fidelity'] for i in range(0, len(site.case_names)): - path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[i]) + path = os.path.join(settings.INSTALL_ROOT, 'alert', + site.download_urls[i]) with open(path) as f: content = f.read() cf = ContentFile(content) diff --git a/alert/search/api.py b/alert/search/api.py index 4a6d1633ec..8dc77ea578 100644 --- a/alert/search/api.py +++ b/alert/search/api.py @@ -6,14 +6,15 @@ from alert.lib.search_utils import build_main_query from alert.lib.string_utils import 
filter_invalid_XML_chars from alert.lib.sunburnt import sunburnt, SolrError -from alert.search.forms import SearchForm -from alert.search.models import Citation, Court, Document, SOURCES, DOCUMENT_STATUSES +from alert.search import forms +from alert.search.models import Citation, Court, Document, SOURCES, \ + DOCUMENT_STATUSES from alert.stats import tally_stat from django.core.cache import cache from lxml import etree from tastypie import fields, http -from tastypie.authentication import BasicAuthentication, SessionAuthentication, MultiAuthentication +from tastypie import authentication from tastypie.constants import ALL from tastypie.exceptions import BadRequest from tastypie.resources import ModelResource @@ -27,14 +28,16 @@ numerical_filters = ('exact', 'gte', 'gt', 'lte', 'lt', 'range',) -class BasicAuthenticationWithUser(BasicAuthentication): - """Wraps the BasicAuthentication class, changing the get_identifier method to provide the username instead of - essentially nothing. +class BasicAuthenticationWithUser(authentication.BasicAuthentication): + """Wraps the BasicAuthentication class, changing the get_identifier method + to provide the username instead of essentially nothing. Proposed this change in: https://github.com/toastdriven/django-tastypie/pull/1085/commits """ + def __init__(self, backend=None, realm='django-tastypie', **kwargs): - super(BasicAuthenticationWithUser, self).__init__(backend, realm, **kwargs) + super(BasicAuthenticationWithUser, self).__init__(backend, realm, + **kwargs) def get_identifier(self, request): return request.META.get('REMOTE_USER', request.user.username) @@ -50,7 +53,8 @@ def _handle_500(self, request, exception): if isinstance(exception, SolrError): solr_status_code = exception[0]['status'] error_xml = etree.fromstring(exception[1]) - solr_msg = error_xml.xpath('//lst[@name = "error"]/str[@name = "msg"]/text()')[0] + solr_msg = error_xml.xpath( + '//lst[@name = "error"]/str[@name = "msg"]/text()')[0] data = { 'error_message': "SolrError raised while interpreting your query.", 'solr_status_code': solr_status_code, @@ -62,7 +66,8 @@ def _handle_500(self, request, exception): response_class=http.HttpApplicationError ) else: - return super(ModelResourceWithFieldsFilter, self)._handle_500(request, exception) + return super(ModelResourceWithFieldsFilter, self)._handle_500( + request, exception) def alter_list_data_to_serialize(self, request, data): # Add a request_uri field @@ -71,7 +76,8 @@ def alter_list_data_to_serialize(self, request, data): return data def full_dehydrate(self, bundle, *args, **kwargs): - bundle = super(ModelResourceWithFieldsFilter, self).full_dehydrate(bundle, *args, **kwargs) + bundle = super(ModelResourceWithFieldsFilter, self).full_dehydrate( + bundle, *args, **kwargs) # bundle.obj[0]._data['citeCount'] = 0 fields = bundle.request.GET.get("fields", "") if fields: @@ -92,7 +98,8 @@ def dehydrate(self, bundle): def dispatch(self, request_type, request, **kwargs): """Simple override here to tally stats before sending off the results.""" tally_stat(self.tally_name) - return super(ModelResourceWithFieldsFilter, self).dispatch(request_type, request, **kwargs) + return super(ModelResourceWithFieldsFilter, self).dispatch( + request_type, request, **kwargs) class PerUserCacheThrottle(CacheThrottle): @@ -123,10 +130,12 @@ def should_be_throttled(self, identifier, **kwargs): # Weed out anything older than the timeframe. 
minimum_time = int(time.time()) - int(self.timeframe) - times_accessed = [access for access in cache.get(key) if access >= minimum_time] + times_accessed = [access for access in cache.get(key) if + access >= minimum_time] cache.set(key, times_accessed, self.expiration) - throttle_at = self.custom_throttles.get(identifier, int(self.throttle_at)) + throttle_at = self.custom_throttles.get(identifier, + int(self.throttle_at)) if len(times_accessed) >= throttle_at: # Throttle them. return True @@ -142,8 +151,9 @@ class CourtResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'jurisdiction' queryset = Court.objects.exclude(jurisdiction='T') @@ -162,16 +172,19 @@ class Meta: 'end_date': good_date_filters, 'jurisdictions': ALL, } - ordering = ['date_modified', 'start_date', 'end_date', 'position', 'jurisdiction'] + ordering = ['date_modified', 'start_date', 'end_date', 'position', + 'jurisdiction'] excludes = ['has_opinion_scraper', 'has_oral_argument_scraper'] class CitationResource(ModelResourceWithFieldsFilter): - opinion_uris = fields.ToManyField('search.api.DocumentResource', 'parent_documents') + opinion_uris = fields.ToManyField('search.api.DocumentResource', + 'parent_documents') class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) queryset = Citation.objects.all() max_limit = 20 @@ -220,15 +233,17 @@ class DocumentResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'opinion' - queryset = Document.objects.all().select_related('docket__court__pk', 'citation') + queryset = Document.objects.all().select_related('docket__court__pk', + 'citation') max_limit = 20 allowed_methods = ['get'] include_absolute_url = True - excludes = ['is_stub_document', 'cases_cited',] + excludes = ['is_stub_document', 'cases_cited', ] filtering = { 'id': ('exact',), 'time_retrieved': good_time_filters, @@ -243,7 +258,8 @@ class Meta: 'blocked': ALL, 'extracted_by_ocr': ALL, } - ordering = ['time_retrieved', 'date_modified', 'date_filed', 'date_blocked'] + ordering = ['time_retrieved', 'date_modified', 'date_filed', + 'date_blocked'] class CitedByResource(ModelResourceWithFieldsFilter): @@ -264,12 +280,15 @@ class CitedByResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'cited-by' queryset = Document.objects.all() - 
excludes = ('is_stub_document', 'html', 'html_lawbox', 'html_with_citations', 'plain_text',) + excludes = ( + 'is_stub_document', 'html', 'html_lawbox', 'html_with_citations', + 'plain_text',) include_absolute_url = True max_limit = 20 list_allowed_methods = ['get'] @@ -281,7 +300,8 @@ class Meta: def get_object_list(self, request): id = request.GET.get('id') if id: - return super(CitedByResource, self).get_object_list(request).filter( + return \ + super(CitedByResource, self).get_object_list(request).filter( pk=id)[0].citation.citing_opinions.all() else: # No ID field --> no results. @@ -296,7 +316,8 @@ def apply_filters(self, request, applicable_filters): """ return self.get_object_list(request) - def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): + def get_resource_uri(self, bundle_or_obj=None, + url_name='api_dispatch_list'): """Creates a URI like /api/v1/search/$id/ """ url_str = '/api/rest/%s/%s/%s/' @@ -328,12 +349,15 @@ class CitesResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'cites' queryset = Document.objects.all() - excludes = ('is_stub_document', 'html', 'html_lawbox', 'html_with_citations', 'plain_text',) + excludes = ( + 'is_stub_document', 'html', 'html_lawbox', 'html_with_citations', + 'plain_text',) include_absolute_url = True max_limit = 20 list_allowed_methods = ['get'] @@ -346,7 +370,8 @@ def get_object_list(self, request): """Get the citation associated with the document ID, then get all the items that it is cited by.""" id = request.GET.get('id') if id: - cases_cited = super(CitesResource, self).get_object_list(request).filter( + cases_cited = \ + super(CitesResource, self).get_object_list(request).filter( pk=id)[0].cases_cited.all() docs = Document.objects.filter(citation__in=cases_cited) return docs @@ -363,7 +388,8 @@ def apply_filters(self, request, applicable_filters): """ return self.get_object_list(request) - def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): + def get_resource_uri(self, bundle_or_obj=None, + url_name='api_dispatch_list'): """Creates a URI like /api/v1/search/$id/ """ url_str = '/api/rest/%s/%s/%s/' @@ -379,6 +405,7 @@ def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): class SolrList(object): """This implements a yielding list object that fetches items as they are queried.""" + def __init__(self, main_query, offset, limit, length=None): super(SolrList, self).__init__() self.main_query = main_query @@ -415,7 +442,8 @@ def __getitem__(self, item): # Pull the text snippet up a level, where tastypie can find it for result in results_si.result.docs: - result['snippet'] = '…'.join(result['solr_highlights']['text']) + result['snippet'] = '…'.join( + result['solr_highlights']['text']) # Return the results as objects, not dicts. 
for result in results_si.result.docs: @@ -522,8 +550,9 @@ class SearchResource(ModelResourceWithFieldsFilter): ) source = fields.CharField( attribute='source', - help_text='the source of the document, one of: %s' % ', '.join(['%s (%s)' % (t[0], t[1]) for t in - SOURCES]), + help_text='the source of the document, one of: %s' % ', '.join( + ['%s (%s)' % (t[0], t[1]) for t in + SOURCES]), null=True, ) snippet = fields.CharField( @@ -533,8 +562,9 @@ class SearchResource(ModelResourceWithFieldsFilter): ) status = fields.CharField( attribute='status', - help_text='The precedential status of document, one of: %s' % ', '.join([('stat_%s' % t[1]).replace(' ', '+') - for t in DOCUMENT_STATUSES]), + help_text='The precedential status of document, one of: %s' % ', '.join( + [('stat_%s' % t[1]).replace(' ', '+') + for t in DOCUMENT_STATUSES]), null=True, ) suit_nature = fields.CharField( @@ -553,8 +583,9 @@ class SearchResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'search' max_limit = 20 @@ -581,7 +612,8 @@ class Meta: 'score+desc', ] - def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): + def get_resource_uri(self, bundle_or_obj=None, + url_name='api_dispatch_list'): """Creates a URI like /api/v1/search/$id/ """ url_str = '/api/rest/%s/%s/%s/' @@ -603,7 +635,7 @@ def get_object_list(self, request=None, **kwargs): highlight='text' ) except KeyError: - sf = SearchForm({'q': "*:*"}) + sf = forms.SearchForm({'q': "*:*"}) if sf.is_valid(): main_query = build_main_query( sf.cleaned_data, @@ -621,23 +653,25 @@ def get_object_list(self, request=None, **kwargs): return sl def obj_get_list(self, bundle, **kwargs): - search_form = SearchForm(bundle.request.GET) + search_form = forms.SearchForm(bundle.request.GET) if search_form.is_valid(): cd = search_form.cleaned_data if cd['q'] == '': cd['q'] = '*:*' # Get everything. return self.get_object_list(bundle.request, cd=cd) else: - BadRequest("Invalid resource lookup data provided. Unable to complete your query.") + BadRequest( + "Invalid resource lookup data provided. Unable to complete your query.") def obj_get(self, bundle, **kwargs): - search_form = SearchForm(bundle.request.GET) + search_form = forms.SearchForm(bundle.request.GET) if search_form.is_valid(): cd = search_form.cleaned_data cd['q'] = 'id:%s' % kwargs['pk'] return self.get_object_list(bundle.request, cd=cd)[0] else: - BadRequest("Invalid resource lookup data provided. Unable to complete your request.") + BadRequest( + "Invalid resource lookup data provided. 
Unable to complete your request.") def apply_sorting(self, obj_list, options=None): """Since we're not using Django Model sorting, we just want to use our own, which is already diff --git a/alert/search/api2.py b/alert/search/api2.py index dad9b3f568..b17e11133c 100644 --- a/alert/search/api2.py +++ b/alert/search/api2.py @@ -6,16 +6,18 @@ from alert.lib.search_utils import build_main_query from alert.lib.string_utils import filter_invalid_XML_chars from alert.lib.sunburnt import sunburnt, SolrError -from alert.search.forms import SearchForm -from alert.search.models import Citation, Court, Document, SOURCES, DOCUMENT_STATUSES +from alert.search import forms +from alert.search.models import Citation, Court, Docket, Document, \ + SOURCES, DOCUMENT_STATUSES + from alert.stats import tally_stat from django.core.cache import cache from lxml import etree from tastypie import fields, http -from tastypie.authentication import BasicAuthentication, SessionAuthentication, MultiAuthentication +from tastypie import authentication from tastypie.constants import ALL -from tastypie.exceptions import BadRequest, TastypieError +from tastypie.exceptions import BadRequest from tastypie.resources import ModelResource from tastypie.throttle import CacheThrottle @@ -27,14 +29,16 @@ numerical_filters = ('exact', 'gte', 'gt', 'lte', 'lt', 'range',) -class BasicAuthenticationWithUser(BasicAuthentication): - """Wraps the BasicAuthentication class, changing the get_identifier method to provide the username instead of - essentially nothing. +class BasicAuthenticationWithUser(authentication.BasicAuthentication): + """Wraps the BasicAuthentication class, changing the get_identifier method + to provide the username instead of essentially nothing. Proposed this change in: https://github.com/toastdriven/django-tastypie/pull/1085/commits """ + def __init__(self, backend=None, realm='django-tastypie', **kwargs): - super(BasicAuthenticationWithUser, self).__init__(backend, realm, **kwargs) + super(BasicAuthenticationWithUser, self).__init__(backend, realm, + **kwargs) def get_identifier(self, request): return request.META.get('REMOTE_USER', request.user.username) @@ -50,9 +54,11 @@ def _handle_500(self, request, exception): if isinstance(exception, SolrError): solr_status_code = exception[0]['status'] error_xml = etree.fromstring(exception[1]) - solr_msg = error_xml.xpath('//lst[@name = "error"]/str[@name = "msg"]/text()')[0] + solr_msg = error_xml.xpath( + '//lst[@name = "error"]/str[@name = "msg"]/text()')[0] data = { - 'error_message': "SolrError raised while interpreting your query.", + 'error_message': "SolrError raised while interpreting your " + "query.", 'solr_status_code': solr_status_code, 'solr_msg': solr_msg, } @@ -62,14 +68,16 @@ def _handle_500(self, request, exception): response_class=http.HttpApplicationError ) else: - return super(ModelResourceWithFieldsFilter, self)._handle_500(request, exception) + return super(ModelResourceWithFieldsFilter, self)._handle_500( + request, exception) def alter_list_data_to_serialize(self, request, data): data['meta']['request_uri'] = request.get_full_path() return data def full_dehydrate(self, bundle, *args, **kwargs): - bundle = super(ModelResourceWithFieldsFilter, self).full_dehydrate(bundle, *args, **kwargs) + bundle = super(ModelResourceWithFieldsFilter, self).full_dehydrate( + bundle, *args, **kwargs) # bundle.obj[0]._data['citeCount'] = 0 fields = bundle.request.GET.get("fields", "") if fields: @@ -90,7 +98,8 @@ def dehydrate(self, bundle): def dispatch(self, 
request_type, request, **kwargs): """Simple override here to tally stats before sending off the results.""" tally_stat(self.tally_name) - return super(ModelResourceWithFieldsFilter, self).dispatch(request_type, request, **kwargs) + return super(ModelResourceWithFieldsFilter, self).dispatch( + request_type, request, **kwargs) class PerUserCacheThrottle(CacheThrottle): @@ -121,10 +130,12 @@ def should_be_throttled(self, identifier, **kwargs): # Weed out anything older than the timeframe. minimum_time = int(time.time()) - int(self.timeframe) - times_accessed = [access for access in cache.get(key) if access >= minimum_time] + times_accessed = [access for access in cache.get(key) if + access >= minimum_time] cache.set(key, times_accessed, self.expiration) - throttle_at = self.custom_throttles.get(identifier, int(self.throttle_at)) + throttle_at = self.custom_throttles.get(identifier, + int(self.throttle_at)) if len(times_accessed) >= throttle_at: # Throttle them. return True @@ -135,8 +146,9 @@ def should_be_throttled(self, identifier, **kwargs): class CourtResource(ModelResourceWithFieldsFilter): class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'jurisdiction' queryset = Court.objects.exclude(jurisdiction='T') @@ -156,19 +168,53 @@ class Meta: 'end_date': good_date_filters, 'jurisdictions': ALL, } - ordering = ['date_modified', 'start_date', 'end_date', 'position', 'jurisdiction'] + ordering = ['date_modified', 'start_date', 'end_date', 'position', + 'jurisdiction'] + + +class DocketResource(ModelResourceWithFieldsFilter): + court = fields.ForeignKey( + CourtResource, + 'court' + ) + + class Meta: + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) + + throttle = PerUserCacheThrottle(throttle_at=1000) + resource_name = 'docket' + queryset = Docket.objects.all() + max_limit = 20 + allowed_methods = ['get'] + include_absolute_url = True + filtering = { + 'id': ('exact',), + 'date_modified': good_time_filters, + 'court': ('exact',), + 'date_blocked': good_date_filters, + 'blocked': ALL, + } + ordering = ['date_modified', 'date_blocked'] class CitationResource(ModelResourceWithFieldsFilter): - opinion_uris = fields.ToManyField('search.api.DocumentResource', 'parent_documents') + opinion_uris = fields.ToManyField('search.api.DocumentResource', + 'parent_documents') class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) queryset = Citation.objects.all() max_limit = 20 - excludes = ['slug', ] + excludes = ['slug', ] # Why? 
+ + +class OralArgumentResource(ModelResourceWithFieldsFilter): + pass class DocumentResource(ModelResourceWithFieldsFilter): @@ -177,9 +223,9 @@ class DocumentResource(ModelResourceWithFieldsFilter): 'citation', full=True ) - court = fields.ForeignKey( - CourtResource, - 'court' + docket = fields.ForeignKey( + DocketResource, + 'docket' ) html = fields.CharField( attribute='html', @@ -213,15 +259,16 @@ class DocumentResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'opinion' - queryset = Document.objects.all().select_related('docket__court__pk', 'citation') + queryset = Document.objects.all().select_related('docket', 'citation') max_limit = 20 allowed_methods = ['get'] include_absolute_url = True - excludes = ['is_stub_document', 'cases_cited',] + excludes = ['is_stub_document', 'cases_cited'] filtering = { 'id': ('exact',), 'time_retrieved': good_time_filters, @@ -236,7 +283,8 @@ class Meta: 'blocked': ALL, 'extracted_by_ocr': ALL, } - ordering = ['time_retrieved', 'date_modified', 'date_filed', 'date_blocked'] + ordering = ['time_retrieved', 'date_modified', 'date_filed', + 'date_blocked'] class CitedByResource(ModelResourceWithFieldsFilter): @@ -257,12 +305,15 @@ class CitedByResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'cited-by' queryset = Document.objects.all() - excludes = ('is_stub_document', 'html', 'html_lawbox', 'html_with_citations', 'plain_text',) + excludes = ( + 'is_stub_document', 'html', 'html_lawbox', 'html_with_citations', + 'plain_text',) include_absolute_url = True max_limit = 20 list_allowed_methods = ['get'] @@ -274,8 +325,9 @@ class Meta: def get_object_list(self, request): id = request.GET.get('id') if id: - return super(CitedByResource, self).get_object_list(request).filter( - pk=id)[0].citation.citing_opinions.all() + return \ + super(CitedByResource, self).get_object_list(request).filter( + pk=id)[0].citation.citing_opinions.all() else: # No ID field --> no results. 
return super(CitedByResource, self).get_object_list(request).none() @@ -289,7 +341,8 @@ def apply_filters(self, request, applicable_filters): """ return self.get_object_list(request) - def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): + def get_resource_uri(self, bundle_or_obj=None, + url_name='api_dispatch_list'): """Creates a URI like /api/v1/search/$id/ """ url_str = '/api/rest/%s/%s/%s/' @@ -321,12 +374,15 @@ class CitesResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'cites' queryset = Document.objects.all() - excludes = ('is_stub_document', 'html', 'html_lawbox', 'html_with_citations', 'plain_text',) + excludes = ( + 'is_stub_document', 'html', 'html_lawbox', 'html_with_citations', + 'plain_text',) include_absolute_url = True max_limit = 20 list_allowed_methods = ['get'] @@ -339,8 +395,9 @@ def get_object_list(self, request): """Get the citation associated with the document ID, then get all the items that it is cited by.""" id = request.GET.get('id') if id: - cases_cited = super(CitesResource, self).get_object_list(request).filter( - pk=id)[0].cases_cited.all() + cases_cited = \ + super(CitesResource, self).get_object_list(request).filter( + pk=id)[0].cases_cited.all() docs = Document.objects.filter(citation__in=cases_cited) return docs else: @@ -356,7 +413,8 @@ def apply_filters(self, request, applicable_filters): """ return self.get_object_list(request) - def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): + def get_resource_uri(self, bundle_or_obj=None, + url_name='api_dispatch_list'): """Creates a URI like /api/v1/search/$id/ """ url_str = '/api/rest/%s/%s/%s/' @@ -372,6 +430,7 @@ def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): class SolrList(object): """This implements a yielding list object that fetches items as they are queried.""" + def __init__(self, main_query, offset, limit, length=None): super(SolrList, self).__init__() self.main_query = main_query @@ -408,7 +467,8 @@ def __getitem__(self, item): # Pull the text snippet up a level, where tastypie can find it for result in results_si.result.docs: - result['snippet'] = '…'.join(result['solr_highlights']['text']) + result['snippet'] = '…'.join( + result['solr_highlights']['text']) # Return the results as objects, not dicts. 
for result in results_si.result.docs: @@ -515,8 +575,9 @@ class SearchResource(ModelResourceWithFieldsFilter): ) source = fields.CharField( attribute='source', - help_text='the source of the document, one of: %s' % ', '.join(['%s (%s)' % (t[0], t[1]) for t in - SOURCES]), + help_text='the source of the document, one of: %s' % ', '.join( + ['%s (%s)' % (t[0], t[1]) for t in + SOURCES]), null=True, ) snippet = fields.CharField( @@ -526,8 +587,9 @@ class SearchResource(ModelResourceWithFieldsFilter): ) status = fields.CharField( attribute='status', - help_text='The precedential status of document, one of: %s' % ', '.join([('stat_%s' % t[1]).replace(' ', '+') - for t in DOCUMENT_STATUSES]), + help_text='The precedential status of document, one of: %s' % ', '.join( + [('stat_%s' % t[1]).replace(' ', '+') + for t in DOCUMENT_STATUSES]), null=True, ) suit_nature = fields.CharField( @@ -546,8 +608,9 @@ class SearchResource(ModelResourceWithFieldsFilter): ) class Meta: - authentication = MultiAuthentication(BasicAuthenticationWithUser(realm="courtlistener.com"), - SessionAuthentication()) + authentication = authentication.MultiAuthentication( + BasicAuthenticationWithUser(realm="courtlistener.com"), + authentication.SessionAuthentication()) throttle = PerUserCacheThrottle(throttle_at=1000) resource_name = 'search' max_limit = 20 @@ -574,7 +637,8 @@ class Meta: 'score+desc', ] - def get_resource_uri(self, bundle_or_obj=None, url_name='api_dispatch_list'): + def get_resource_uri(self, bundle_or_obj=None, + url_name='api_dispatch_list'): """Creates a URI like /api/v1/search/$id/ """ url_str = '/api/rest/%s/%s/%s/' @@ -592,9 +656,10 @@ def get_object_list(self, request=None, **kwargs): try: main_query = build_main_query(kwargs['cd'], highlight='text') except KeyError: - sf = SearchForm({'q': "*:*"}) + sf = forms.SearchForm({'q': "*:*"}) if sf.is_valid(): - main_query = build_main_query(sf.cleaned_data, highlight='text') + main_query = build_main_query(sf.cleaned_data, + highlight='text') main_query['caller'] = 'api_search' # Use a SolrList that has a couple of the normal functions built in. @@ -606,23 +671,25 @@ def get_object_list(self, request=None, **kwargs): return sl def obj_get_list(self, bundle, **kwargs): - search_form = SearchForm(bundle.request.GET) + search_form = forms.SearchForm(bundle.request.GET) if search_form.is_valid(): cd = search_form.cleaned_data if cd['q'] == '': cd['q'] = '*:*' # Get everything. return self.get_object_list(bundle.request, cd=cd) else: - BadRequest("Invalid resource lookup data provided. Unable to complete your query.") + raise BadRequest( + "Invalid resource lookup data provided. Unable to complete your query.") def obj_get(self, bundle, **kwargs): - search_form = SearchForm(bundle.request.GET) + search_form = forms.SearchForm(bundle.request.GET) if search_form.is_valid(): cd = search_form.cleaned_data cd['q'] = 'id:%s' % kwargs['pk'] return self.get_object_list(bundle.request, cd=cd)[0] else: - BadRequest("Invalid resource lookup data provided. Unable to complete your request.") + raise BadRequest( + "Invalid resource lookup data provided. Unable to complete your request.") def apply_sorting(self, obj_list, options=None): """Since we're not using Django Model sorting, we just want to use our own, which is already diff --git a/alert/search/forms.py b/alert/search/forms.py index c3c4806e9a..3dde9a80ea 100644 --- a/alert/search/forms.py +++ b/alert/search/forms.py @@ -37,10 +37,6 @@ '%Y/%m', # '2006/10' ] -# Query the DB so we can build up check boxes for each court in use.
-COURTS = Court.objects.filter(in_use=True).values( - 'pk', 'short_name', 'jurisdiction', 'has_oral_argument_scraper') - def _clean_form(request, cd): """Returns cleaned up values as a Form object. @@ -62,7 +58,9 @@ def _clean_form(request, cd): mutable_get['order_by'] = cd['order_by'] mutable_get['source'] = cd['source'] - for court in COURTS: + courts = Court.objects.filter(in_use=True).values( + 'pk', 'short_name', 'jurisdiction', 'has_oral_argument_scraper') + for court in courts: mutable_get['court_%s' % court['pk']] = cd['court_%s' % court['pk']] return SearchForm(mutable_get) @@ -202,8 +200,9 @@ def __init__(self, *args, **kwargs): names coming from the database, we need to interact directly with the fields dict. """ - - for court in COURTS: + courts = Court.objects.filter(in_use=True).values( + 'pk', 'short_name', 'jurisdiction', 'has_oral_argument_scraper') + for court in courts: self.fields['court_' + court['pk']] = forms.BooleanField( label=court['short_name'], required=False, diff --git a/alert/search/urls.py b/alert/search/urls.py index bd43b42549..c810504de9 100644 --- a/alert/search/urls.py +++ b/alert/search/urls.py @@ -1,8 +1,5 @@ from alert.search import api from alert.search import api2 -from alert.search.feeds import SearchFeed, JurisdictionFeed, \ - AllJurisdictionsFeed -from alert.search.views import show_results from alert.urls import pacer_codes from django.conf.urls import patterns, include @@ -26,16 +23,16 @@ urlpatterns = patterns('', # Search pages - (r'^$', show_results), # the home page! + (r'^$', 'alert.search.views.show_results'), # the home page! # The API (r'^api/rest/', include(v1_api.urls)), (r'^api/rest/', include(v2_api.urls)), # Feeds & Podcasts - (r'^feed/(search)/$', SearchFeed()), + (r'^feed/(search)/$', 'SearchFeed()'), # lacks URL capturing b/c it will use GET queries. - (r'^feed/court/all/$', AllJurisdictionsFeed()), + (r'^feed/court/all/$', 'AllJurisdictionsFeed()'), (r'^feed/court/(?P<court>' + '|'.join(pacer_codes) + ')/$', - JurisdictionFeed()), + 'JurisdictionFeed()'), ) diff --git a/alert/search/views.py b/alert/search/views.py index 615431edcb..689a24f4d8 100644 --- a/alert/search/views.py +++ b/alert/search/views.py @@ -16,9 +16,9 @@ from alert.lib import search_utils from alert.lib import sunburnt from alert.lib.bot_detector import is_bot -from alert.search.forms import SearchForm, COURTS, _clean_form +from alert.search.forms import SearchForm, _clean_form from alert import settings -from alert.search.models import Document +from alert.search.models import Document, Court from alert.stats import tally_stat, Stat from audio.models import Audio @@ -49,8 +49,12 @@ def do_search(request, rows=20, order_by=None): settings.SOLR_AUDIO_URL, mode='r') status_facets = None results_si = conn.raw_query(**search_utils.build_main_query(cd)) + + courts = Court.objects.filter(in_use=True).values( + 'pk', 'short_name', 'jurisdiction', + 'has_oral_argument_scraper') courts, court_count_human, court_count = search_utils\ - .merge_form_with_courts(COURTS, search_form) + .merge_form_with_courts(courts, search_form) except Exception, e: logger.warning("Error loading search page with request: %s" % request.GET) diff --git a/alert/settings/10-public.py b/alert/settings/10-public.py index 1d7480f9cb..e110324fb9 100644 --- a/alert/settings/10-public.py +++ b/alert/settings/10-public.py @@ -158,7 +158,7 @@ STATIC_ROOT = os.path.join(INSTALL_ROOT, 'alert/assets/static/') # This is where things get collected to # Where should the data dumps be stored?
-DUMP_DIR = os.path.join(INSTALL_ROOT, 'alert/assets/media/dumps/') +DUMP_DIR = os.path.join(INSTALL_ROOT, 'alert/assets/media/bulk-data/') TEMPLATE_DIRS = ( # Don't forget to use absolute paths, not relative paths. diff --git a/alert/urls.py b/alert/urls.py index 66c9674131..140936535a 100644 --- a/alert/urls.py +++ b/alert/urls.py @@ -5,7 +5,6 @@ from django.contrib import admin from django.views.generic import RedirectView - pacer_codes = Court.objects.filter(in_use=True).values_list('pk', flat=True) admin.autodiscover() diff --git a/apache/courtlistener.com.conf b/apache/courtlistener.com.conf index 9e9663b0c7..6f78b93679 100644 --- a/apache/courtlistener.com.conf +++ b/apache/courtlistener.com.conf @@ -25,7 +25,7 @@ Alias /media/ /var/www/court-listener/alert/assets/media/ Alias /static/ /var/www/court-listener/alert/assets/static/ - Alias /dumps/ /var/www/court-listener/alert/assets/media/dumps/ + Alias /api/bulk-data/ /var/www/court-listener/alert/assets/media/bulk-data/ Alias /humans.txt /var/www/court-listener/alert/humans.txt Alias /tools/free-law-machine/ /sata/vm/ Alias /tools/sample-data/ /sata/sample-data/ diff --git a/upgrade.txt b/upgrade.txt index a427067dbb..9a85981364 100644 --- a/upgrade.txt +++ b/upgrade.txt @@ -54,9 +54,6 @@ We welcome a conversion of these notes to a better process using Fabric. slug are updated elsewhere? + Case_name and slugs need to be pulled from the correct places (opinions should pull from Citation and Dockets from Docket, for example). - - favorites cannot be created correctly for some reason - - Why doesn't the button in the admin site work? It should use - get_absolute_url of Document, but it fails weirdly. - Write some kind of script to handle matching up oral args with opinions + Make audio searchable? + Make sure the processing_complete flag is triggered properly. @@ -64,19 +61,17 @@ We welcome a conversion of these notes to a better process using Fabric. - Make the audio page + Podcasts/Audio feeds - Audio alerts? - - Bulk files - - Sitemaps + + Sitemaps - selecting order drop down has weirdness on OA page. - Do citation feeds redirect properly now that we're using the PK rather than an ascii conversion? - can't click tabs in the jurisdiction picker in opinions! - add recent oral arguments to the homepage? - - Scraper: + The scrapers for audio and opinions need to be finalized and tested. - - Atom feeds for audio - Rewrite bulk files and ensure that the save and delete routines of Document and Audio properly invalidates the bulk files if necessary. - - Alerts functionality? + - verify that serve_static_files works for audio stuff (analyze its code + and such) API Changes: - /api/rest-info/ makes reference to a number of API calls that go to v1. @@ -114,7 +109,29 @@ We welcome a conversion of these notes to a better process using Fabric. instance_dir='/usr/local/solr/example/solr/audio', ) - Install the seal-rookery. + - Bulk files have been rewritten and require some new tweaks: + - Update cron to generate bulk files on the last day of each month, + using something like the following: + + min hour 30 4,6,9,11 * manage.py cl_make_bulk_data + min hour 31 1,3,5,7,8,10,12 * manage.py cl_make_bulk_data + min hour 28 2 * manage.py cl_make_bulk_data + + - Remove any old cron entries referencing dump_all_files.py + - Add a new directory at $INSTALL_ROOT/alert/assets/media/bulk-data to + contain the new bulk files when they are created, and symlink it from + /sata/. + - Generate the new bulk files by calling manage.py cl_make_bulk_data.
+ - Be sure that Apache is restarted so the new location is updated. + - Delete any old bulk files located at + $INSTALL_ROOT/alert/assets/media/dumps + + + Later: + - favorites cannot be created correctly for some reason + - Why doesn't the button in the admin site work? It should use + get_absolute_url of Document, but it fails weirdly. 2014-06-18: - This update makes alert editing a lot more intuitive.
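After running manage.py cl_make_bulk_data per the upgrade notes above, a quick sanity check of one generated archive can catch obvious problems. The sketch below is hypothetical and not part of the patch; the bulk_root path is an assumption based on the new DUMP_DIR setting, and the per-type "opinion" directory and "{pk}.json" member names are taken from what cl_make_bulk_data writes.

    # Hypothetical sketch: spot-check a generated bulk archive (paths are assumptions).
    import json
    import tarfile

    bulk_root = '/var/www/court-listener/alert/assets/media/bulk-data'  # assumed DUMP_DIR
    archive = tarfile.open('%s/opinion/all.tar.gz' % bulk_root, mode='r:gz')
    members = archive.getmembers()
    print '%s opinion records found in all.tar.gz' % len(members)
    # Each member should be a single JSON-serialized record named {pk}.json.
    record = json.load(archive.extractfile(members[0]))
    print 'first record id: %s' % record.get('id')
    archive.close()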
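Because the Apache configuration now aliases /api/bulk-data/ to the bulk-data directory, the per-jurisdiction archives should also be reachable over HTTP once Apache is restarted. A rough client-side sketch follows; the exact URL layout (e.g. /api/bulk-data/opinion/scotus.tar.gz) is an assumption inferred from that alias and the file names cl_make_bulk_data produces, not something stated in this patch.

    # Hypothetical client sketch: fetch one court's bulk file and count its records.
    import tarfile
    import urllib2  # Python 2, matching the rest of the codebase

    url = 'https://www.courtlistener.com/api/bulk-data/opinion/scotus.tar.gz'  # assumed URL
    local_path = '/tmp/scotus.tar.gz'
    with open(local_path, 'wb') as f:
        f.write(urllib2.urlopen(url).read())

    archive = tarfile.open(local_path, mode='r:gz')
    print '%s opinions in the scotus archive' % len(archive.getnames())
    archive.close()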