new script to get total number of pages/words enacted by Congress, pl…

…us adding a note to the statistics page about how counting bills isn't interesting
govtrack · Apr 1, 2020 · 92e1e42 · 92e1e42
1 parent 0bdb9ad
commit 92e1e42
Show file tree

Hide file tree

Showing 6 changed files with 152 additions and 22 deletions.
diff --git a/analysis/bill_status_totals.py b/analysis/bill_status_totals.py
@@ -0,0 +1,37 @@
+#!script
+
+from collections import defaultdict
+import csv
+import sys
+
+from django.db.models import Count
+
+from bill.models import *
+
+# Count bills per congress/type/status combination: each row returned is an aggregate whose tally is in rec["count"].
+data = Bill.objects.filter(congress__gte=93).values("congress", "bill_type", "current_status").annotate(count=Count('id'))
+data = list(data) # fetch all
+
+# Replace numeric bill type and status with enum value and get the domain of statuses.
+all_statuses = set()
+all_bill_types = set()
+for rec in data:
+  rec["bill_type"] = BillType.by_value(rec["bill_type"])
+  rec["current_status"] = BillStatus.by_value(rec["current_status"])
+  all_statuses.add(rec["current_status"])
+  all_bill_types.add(rec["bill_type"])
+
+# Sort statuses in our canonical order.
+all_statuses = sorted(all_statuses, key = lambda status : status.sort_order)
+
+# Form a matrix of bill counts keyed by (congress, bill type, status). Add the aggregated count rather than 1, since a row can stand for many bills.
+matrix = defaultdict(lambda : 0)
+for rec in data:
+  matrix[(rec["congress"], rec["bill_type"], rec["current_status"])] += rec["count"]
+
+# Output one CSV row per congress and bill type, with one column per status.
+W = csv.writer(sys.stdout)
+W.writerow(["congress", "bill type"] + [status.key for status in all_statuses])
+for congress in range(min(rec["congress"] for rec in data), max(rec["congress"] for rec in data)+1):
+  for bill_type in all_bill_types:
+    W.writerow([congress, bill_type.label] + [matrix[(congress, bill_type, status)] for status in all_statuses])
diff --git a/analysis/count_pages_of_bills.py b/analysis/count_pages_of_bills.py
@@ -1,6 +1,6 @@
 #!script
 
-import sys, tqdm
+import sys, csv
 from collections import defaultdict
 
 from django.db.models import Count
@@ -10,30 +10,60 @@
 from bill.models import Bill, BillStatus
 from bill.billtext import load_bill_text
 
-def doit(congress):
-	all = defaultdict(lambda : 0)
-	enacted = defaultdict(lambda : 0)
+from us import get_congress_years
+
+from numpy import median
+
+W = csv.writer(sys.stdout)
+W.writerow([
+	"congress",
+	"years",
+	"bills",
+	"pages",
+	"words",
+	"median pages per bill",
+	"median words per bill",
+	"bills_with_missing_text",
+])
+
+def count_pages_of_bills(congress):
+	"""Write one CSV row of page/word totals and medians for the enacted bills of one Congress."""
+	counters = defaultdict(lambda : [])
 	missing_text = 0
 
-	qs = Bill.objects.filter(congress=congress)
-	for b in tqdm.tqdm(qs, total=qs.count()):
-		try:
-			pp = load_bill_text(b, None, mods_only=True).get("numpages")
-		except IOError:
-			missing_text += 1
-			continue
-		wds = len(load_bill_text(b, None, plain_text=True).split(" "))
+	qs = Bill.objects.filter(congress=congress).filter(current_status__in=BillStatus.final_status_enacted_bill)
+	for b in qs:
+		plain_text = load_bill_text(b, None, plain_text=True)
+		# NOTE(review): the load above is outside the try below; if it can raise IOError for missing text the whole run aborts -- confirm.
+		if congress >= 103:
+			# Bills since 1993 have GPO MODS XML metadata with page counts.
+			try:
+				pp = load_bill_text(b, None, mods_only=True).get("numpages")
+			except IOError:
+				missing_text += 1
+				continue
+			if pp is None:
+				missing_text += 1
+				continue
+		else:
+			# For historical statutes we only have plain text from the
+			# Statutes at Large, extracted from PDFs. We can get page
+			# counts by looking for our replacement of the form feed
+			# character put in by pdftotext. We only have that when
+			# we extracted text from PDFs, which we only did for
+			# the Statutes at Large. We can't do this on modern bills
+			# where the text came from GPO plain text format.
+			pp = len([pgtext for pgtext in plain_text.split("\n=============================================\n") if pgtext.strip() != ""])
 
-		all["count"] += 1
-		all["pages"] += pp
-		all["words"] += wds
-		if b.current_status in BillStatus.final_status_enacted_bill:
-			enacted["count"] += 1
-			enacted["pages"] += pp
-			enacted["words"] += wds
+		wds = len(plain_text.split(" "))
 
+		counters["pages"].append(pp)
+		counters["words"].append(wds)
 
-	print(congress, all["count"], all["pages"], all["words"], enacted["count"], enacted["pages"], enacted["words"])
-	print("\t", missing_text, "missing text")
+	# numpy's median of an empty list is NaN, which int(round(...)) rejects, so leave the median cells blank when no bill was counted.
+	medians = [int(round(median(counters[k]))) if counters[k] else "" for k in ("pages", "words")]
+	W.writerow([congress, "{}-{}".format(*get_congress_years(congress)),
+		len(counters["pages"]), sum(counters["pages"]), sum(counters["words"])] + medians + [missing_text])
 
-doit(114)
+for c in range(82, CURRENT_CONGRESS+1):
+	count_pages_of_bills(c)
diff --git a/analysis/find_recursive_acronyms.py b/analysis/find_recursive_acronyms.py
@@ -0,0 +1,38 @@
+import glob
+import json
+import re
+# Print every bill title whose leading all-caps word could be a recursive acronym of the title itself.
+# Iterate over all bills whose congress directory name starts with "11"...
+for fn in glob.glob("data/congress/11*/bills/*/*/data.json"):
+  with open(fn) as f:
+    bill = json.load(f)
+  for title in bill.get("titles", []):
+    title = title["title"]
+
+    # Okay, now the fun part...
+
+    # Does it start with a run of two or more capital letters (the candidate acronym)? A trailing " Act" / " Act of YYYY" is split off separately.
+    m = re.match(r"^([A-Z]{2,})(.*?)( Act(?: of \d\d\d\d)?)?$", title)
+    if not m: continue
+    acronym, remainder, act_of_year = m.groups()
+    remainder = remainder.strip()
+
+    # The remainder must be at least as long as the acronym (after the first letter).
+    if len(remainder) <= len(acronym)-1: continue
+
+    # Does the potential acronym match the remainder of the title?
+    # Each letter in the acronym, after the first (which matches the
+    # acronym itself, if it's recursive) must match another letter
+    # in the title. Normally it must match on capital letters, but
+    # that's too strict. Every capital letter in the title must match,
+    # and other lowercase letters and the "A" in "Act (of YYYY)" may
+    # also be used to match.
+    remainder_re = re.split("([A-Z])", remainder)
+    remainder_re = [r for r in remainder_re if len(r.strip()) > 0]
+    remainder_re = "".join(
+      r if re.match("[A-Z]$", r)
+      else "[" + "".join(re.escape(c) for c in r if c != " ") + "]*"
+      for r in remainder_re
+    )
+    if re.match("^" + remainder_re + "A?$", acronym[1:], re.I):
+      print(title)
diff --git a/analysis/sponsor_counts.py b/analysis/sponsor_counts.py
@@ -0,0 +1,18 @@
+#!script
+# Count the bills introduced by each sponsor within a date window and print a CSV of sponsor and count, highest count first.
+from django.db.models import Count
+from bill.models import Bill
+import csv, sys
+# NOTE(review): only("sponsor") presumably loads just the key, so b.sponsor may issue one query per bill -- confirm.
+counts = { }
+for b in Bill.objects.filter(
+    #introduced_date__gte="2019-01-01"
+    introduced_date__gte="2017-01-01", introduced_date__lte="2017-08-06"
+  ).only("sponsor"):
+  counts[b.sponsor] = counts.get(b.sponsor, 0) + 1
+
+counts = sorted([[v, k] for (k, v) in counts.items()], key = lambda kv : -kv[0])
+
+W = csv.writer(sys.stdout)
+for count, person in counts:
+  W.writerow([person, count])
diff --git a/templates/bill/bill_statistics.html b/templates/bill/bill_statistics.html
@@ -86,6 +86,7 @@ <h1>Statistics and Historical Comparison</h1>
 	<h2>Bills by Final Status</h2>
 
 		<p>This table breaks down the bills and resolutions introduced in each two-year Congress by their final status. Note that the current Congress is not yet finished.</p>
+		<p>Since World War II (the earliest period for which we have data), Congress has typically enacted 4-6 million words of new law in each two-year Congress. However, those words have been enacted in fewer but larger bills. Therefore, the generally decreasing number of bills enacted into law does not mean that less legislative work is occurring.</p>
 
 		<p style="text-align: center">
 			<a href="#" onclick="return show_stats_style(true, true)">Counts &amp; Percents</a>

diff --git a/us.py b/us.py
@@ -54,6 +54,12 @@ def get_congress_dates(congressnumber):
         CONGRESS_DATES.update(cd)
     return CONGRESS_DATES[congressnumber]
 
+def get_congress_years(congressnumber):
+    # Years spanned by a Congress as (first, last); an end date that falls in January is credited to the prior year.
+    dates = get_congress_dates(congressnumber)
+    last_year = dates[1].year - 1 if dates[1].month == 1 else dates[1].year
+    return (dates[0].year, last_year)
+
 def get_session_from_date(when, allow_start_date=True, allow_end_date=True, congress=None):
     global SESSION_DATES
     if SESSION_DATES == [ ]: