From 92e1e42eab6be3b3c9a76b630ab4d1884d967be8 Mon Sep 17 00:00:00 2001
From: Joshua Tauberer
Date: Wed, 1 Apr 2020 15:11:40 +0000
Subject: [PATCH] new script to get total number of pages/words enacted by Congress, plus adding a note to the statistics page about how counting bills isn't interesting

---
 analysis/bill_status_totals.py      | 37 +++++++++++++++
 analysis/count_pages_of_bills.py    | 74 ++++++++++++++++++++---------
 analysis/find_recursive_acronyms.py | 38 +++++++++++++++
 analysis/sponsor_counts.py          | 18 +++++++
 templates/bill/bill_statistics.html |  1 +
 us.py                               |  6 +++
 6 files changed, 152 insertions(+), 22 deletions(-)
 create mode 100755 analysis/bill_status_totals.py
 create mode 100644 analysis/find_recursive_acronyms.py
 create mode 100755 analysis/sponsor_counts.py

diff --git a/analysis/bill_status_totals.py b/analysis/bill_status_totals.py
new file mode 100755
index 00000000..1fdc9eb5
--- /dev/null
+++ b/analysis/bill_status_totals.py
@@ -0,0 +1,37 @@
+#!script
+
+from collections import defaultdict
+import csv
+import sys
+
+from django.db.models import Count
+
+from bill.models import *
+
+# Collect congress/type/status groups (congress 93 and later), each annotated with its number of bills as "count".
+data = Bill.objects.filter(congress__gte=93).values("congress", "bill_type", "current_status").annotate(count=Count('id'))
+data = list(data) # fetch all
+
+# Replace numeric bill type and status with enum value and get the domain of statuses and bill types.
+all_statuses = set()
+all_bill_types = set()
+for rec in data:
+    rec["bill_type"] = BillType.by_value(rec["bill_type"])
+    rec["current_status"] = BillStatus.by_value(rec["current_status"])
+    all_statuses.add(rec["current_status"])
+    all_bill_types.add(rec["bill_type"])
+
+# Sort statuses in our canonical order.
+all_statuses = sorted(all_statuses, key = lambda status : status.sort_order)
+
+# Form a matrix keyed by (congress, bill type, status); combinations with no bills read as 0.
+matrix = defaultdict(lambda : 0)
+for rec in data:
+    matrix[(rec["congress"], rec["bill_type"], rec["current_status"])] += rec["count"]  # add the group's bill count, not 1 per grouped row
+
+# Output.
+W = csv.writer(sys.stdout) +W.writerow(["congress", "bill type"] + [status.key for status in all_statuses]) +for congress in range(min(rec["congress"] for rec in data), max(rec["congress"] for rec in data)+1): + for bill_type in all_bill_types: + W.writerow([congress, bill_type.label] + [matrix[(congress, bill_type, status)] for status in all_statuses]) diff --git a/analysis/count_pages_of_bills.py b/analysis/count_pages_of_bills.py index 2b3bd2ab..e9027a7a 100755 --- a/analysis/count_pages_of_bills.py +++ b/analysis/count_pages_of_bills.py @@ -1,6 +1,6 @@ #!script -import sys, tqdm +import sys, csv from collections import defaultdict from django.db.models import Count @@ -10,30 +10,60 @@ from bill.models import Bill, BillStatus from bill.billtext import load_bill_text -def doit(congress): - all = defaultdict(lambda : 0) - enacted = defaultdict(lambda : 0) +from us import get_congress_years + +from numpy import median + +W = csv.writer(sys.stdout) +W.writerow([ + "congress", + "years", + "bills", + "pages", + "words", + "median pages per bill", + "median words per bill", + "bills_with_missing_text", +]) + +def count_pages_of_bills(congress): + counters = defaultdict(lambda : []) missing_text = 0 - qs = Bill.objects.filter(congress=congress) - for b in tqdm.tqdm(qs, total=qs.count()): - try: - pp = load_bill_text(b, None, mods_only=True).get("numpages") - except IOError: - missing_text += 1 - continue - wds = len(load_bill_text(b, None, plain_text=True).split(" ")) + qs = Bill.objects.filter(congress=congress)\ + .filter(current_status__in=BillStatus.final_status_enacted_bill) + for b in qs: + plain_text = load_bill_text(b, None, plain_text=True) + + if congress >= 103: + # Bills since 1993 have GPO MODS XML metadata with page counts. 
+ try: + pp = load_bill_text(b, None, mods_only=True).get("numpages") + except IOError: + missing_text += 1 + continue + if pp is None: + missing_text += 1 + continue + else: + # For historical statutes we only have plain text from the + # Statutes at Large, extracted from PDFs. We can get page + # counts by looking for our replacement of the form feed + # character put in by pdftotext. We only have that when + # we extracted text from PDFs, which we only did for + # the Statutes at Large. We can't do this on modern bills + # where the text came from GPO plain text format. + pp = len([pgtext for pgtext in plain_text.split("\n=============================================\n") if pgtext.strip() != ""]) - all["count"] += 1 - all["pages"] += pp - all["words"] += wds - if b.current_status in BillStatus.final_status_enacted_bill: - enacted["count"] += 1 - enacted["pages"] += pp - enacted["words"] += wds + wds = len(plain_text.split(" ")) + counters["pages"].append(pp) + counters["words"].append(wds) - print(congress, all["count"], all["pages"], all["words"], enacted["count"], enacted["pages"], enacted["words"]) - print("\t", missing_text, "missing text") + W.writerow([congress, "{}-{}".format(*get_congress_years(congress)), + len(counters["pages"]), sum(counters["pages"]), sum(counters["words"]), + int(round(median(counters["pages"]))), int(round(median(counters["words"]))), + missing_text]) -doit(114) +for c in range(82, CURRENT_CONGRESS+1): + count_pages_of_bills(c) diff --git a/analysis/find_recursive_acronyms.py b/analysis/find_recursive_acronyms.py new file mode 100644 index 00000000..009ac2a1 --- /dev/null +++ b/analysis/find_recursive_acronyms.py @@ -0,0 +1,38 @@ +import glob +import json +import re + +# Iterate over all bills... +for fn in glob.glob("data/congress/11*/bills/*/*/data.json"): + with open(fn) as f: + bill = json.load(f) + for title in bill.get("titles", []): + title = title["title"] + + # Okay, now the fun part... 
+ + # Does it start with a two-or-more capital letter sequence + space? + m = re.match(r"^([A-Z]{2,})(.*?)( Act(?: of \d\d\d\d)?)?$", title) + if not m: continue + acronym, remainder, act_of_year = m.groups() + remainder = remainder.strip() + + # The remainder must be at least as long as the acronym (after the first letter). + if len(remainder) <= len(acronym)-1: continue + + # Does the potential acronym match the remainder of the title? + # Each letter in the acronym, after the first (which matches the + # acronym itself, if it's recursive) must match another letter + # in the title. Normally it must match on capital letters, but + # that's too strict. Every capital letter in the title must match, + # and other lowercase letters and the "A" in "Act (of YYYY)" may + # also be used to match. + remainder_re = re.split("([A-Z])", remainder) + remainder_re = [r for r in remainder_re if len(r.strip()) > 0] + remainder_re = "".join( + r if re.match("[A-Z]$", r) + else "[" + "".join(re.escape(c) for c in r if c != " ") + "]*" + for r in remainder_re + ) + if re.match("^" + remainder_re + "A?$", acronym[1:], re.I): + print(title) diff --git a/analysis/sponsor_counts.py b/analysis/sponsor_counts.py new file mode 100755 index 00000000..cf1faef0 --- /dev/null +++ b/analysis/sponsor_counts.py @@ -0,0 +1,18 @@ +#!script + +from django.db.models import Count +from bill.models import Bill +import csv, sys + +counts = { } +for b in Bill.objects.filter( + #introduced_date__gte="2019-01-01" + introduced_date__gte="2017-01-01", introduced_date__lte="2017-08-06" + ).only("sponsor"): + counts[b.sponsor] = counts.get(b.sponsor, 0) + 1 + +counts = sorted([[v, k] for (k, v) in counts.items()], key = lambda kv : -kv[0]) + +W = csv.writer(sys.stdout) +for count, person in counts: + W.writerow([person, count]) diff --git a/templates/bill/bill_statistics.html b/templates/bill/bill_statistics.html index 7b8cf9ca..eceab534 100644 --- a/templates/bill/bill_statistics.html +++ 
b/templates/bill/bill_statistics.html @@ -86,6 +86,7 @@

Statistics and Historical Comparison

Bills by Final Status

This table breaks down the bills and resolutions introduced in each two-year Congress by their final status. Note that the current Congress is not yet finished.

+

Since World War II (the earliest period for which we have data), Congress has typically enacted 4-6 million words of new law in each two-year Congress. However, those words have been enacted in fewer but larger bills. Therefore, the generally decreasing number of bills enacted into law does not mean that less legislative work is occurring.

Counts & Percents
diff --git a/us.py b/us.py
index 10d56782..5872b5f1 100644
--- a/us.py
+++ b/us.py
@@ -54,6 +54,12 @@ def get_congress_dates(congressnumber):
         CONGRESS_DATES.update(cd)
     return CONGRESS_DATES[congressnumber]
 
+def get_congress_years(congressnumber):  # returns a 2-tuple of years taken from the two dates get_congress_dates() gives for this Congress
+    dates = get_congress_dates(congressnumber)
+    years = dates[0].year, dates[1].year
+    if dates[1].month == 1: years = (years[0], years[1]-1)  # second date in January -> report the preceding year (presumably so the label shows the last full calendar year; confirm)
+    return years
+
 def get_session_from_date(when, allow_start_date=True, allow_end_date=True, congress=None):
     global SESSION_DATES
     if SESSION_DATES == [ ]: