From 92e1e42eab6be3b3c9a76b630ab4d1884d967be8 Mon Sep 17 00:00:00 2001
From: Joshua Tauberer <tauberer@govtrack.us>
Date: Wed, 1 Apr 2020 15:11:40 +0000
Subject: [PATCH] new script to get total number of pages/words enacted by
 Congress, plus adding a note to the statistics page about how counting bills
 isn't interesting

---
 analysis/bill_status_totals.py      | 37 +++++++++++++++
 analysis/count_pages_of_bills.py    | 74 ++++++++++++++++++++---------
 analysis/find_recursive_acronyms.py | 38 +++++++++++++++
 analysis/sponsor_counts.py          | 18 +++++++
 templates/bill/bill_statistics.html |  1 +
 us.py                               |  6 +++
 6 files changed, 152 insertions(+), 22 deletions(-)
 create mode 100755 analysis/bill_status_totals.py
 create mode 100644 analysis/find_recursive_acronyms.py
 create mode 100755 analysis/sponsor_counts.py

diff --git a/analysis/bill_status_totals.py b/analysis/bill_status_totals.py
new file mode 100755
index 00000000..1fdc9eb5
--- /dev/null
+++ b/analysis/bill_status_totals.py
@@ -0,0 +1,37 @@
+#!script
+
+from collections import defaultdict
+import csv
+import sys
+
+from django.db.models import Count
+
+from bill.models import *
+
+# Collect congress/type/status groups, each with its bill count aggregated by the database.
+data = Bill.objects.filter(congress__gte=93).values("congress", "bill_type", "current_status").annotate(count=Count('id'))
+data = list(data) # fetch all
+
+# Replace numeric bill type and status with enum value and get the domain of statuses.
+all_statuses = set()
+all_bill_types = set()
+for rec in data:
+  rec["bill_type"] = BillType.by_value(rec["bill_type"])
+  rec["current_status"] = BillStatus.by_value(rec["current_status"])
+  all_statuses.add(rec["current_status"])
+  all_bill_types.add(rec["bill_type"])
+
+# Sort statuses in our canonical order.
+all_statuses = sorted(all_statuses, key = lambda status : status.sort_order)
+
+# Form a matrix, adding each record's aggregated bill count (not 1 per grouped record).
+matrix = defaultdict(int)
+for rec in data:
+  matrix[(rec["congress"], rec["bill_type"], rec["current_status"])] += rec["count"]
+
+# Output one row per congress and bill type, with one column per status (0 where no bills).
+W = csv.writer(sys.stdout)
+W.writerow(["congress", "bill type"] + [status.key for status in all_statuses])
+for congress in range(min(rec["congress"] for rec in data), max(rec["congress"] for rec in data)+1):
+  for bill_type in all_bill_types:
+    W.writerow([congress, bill_type.label] + [matrix[(congress, bill_type, status)] for status in all_statuses])
diff --git a/analysis/count_pages_of_bills.py b/analysis/count_pages_of_bills.py
index 2b3bd2ab..e9027a7a 100755
--- a/analysis/count_pages_of_bills.py
+++ b/analysis/count_pages_of_bills.py
@@ -1,6 +1,6 @@
 #!script
 
-import sys, tqdm
+import sys, csv
 from collections import defaultdict
 
 from django.db.models import Count
@@ -10,30 +10,60 @@
 from bill.models import Bill, BillStatus
 from bill.billtext import load_bill_text
 
-def doit(congress):
-	all = defaultdict(lambda : 0)
-	enacted = defaultdict(lambda : 0)
+from us import get_congress_years
+
+from numpy import median
+
+W = csv.writer(sys.stdout)
+W.writerow([
+	"congress",
+	"years",
+	"bills",
+	"pages",
+	"words",
+	"median pages per bill",
+	"median words per bill",
+	"bills_with_missing_text",
+])
+# Write one CSV row of page/word totals and medians for the enacted bills of a Congress.
+def count_pages_of_bills(congress):
+	counters = defaultdict(list)
 	missing_text = 0
 
-	qs = Bill.objects.filter(congress=congress)
-	for b in tqdm.tqdm(qs, total=qs.count()):
-		try:
-			pp = load_bill_text(b, None, mods_only=True).get("numpages")
-		except IOError:
-			missing_text += 1
-			continue
-		wds = len(load_bill_text(b, None, plain_text=True).split(" "))
+	qs = Bill.objects.filter(congress=congress)\
+		.filter(current_status__in=BillStatus.final_status_enacted_bill)
+	for b in qs:
+		# Either load may raise IOError when a bill's text is unavailable;
+		# count such bills as missing instead of aborting the whole run.
+		try:
+			plain_text = load_bill_text(b, None, plain_text=True)
+			if congress >= 103:
+				# Bills since 1993 have GPO MODS XML metadata with page counts.
+				pp = load_bill_text(b, None, mods_only=True).get("numpages")
+			else:
+				# For historical statutes we only have plain text from the
+				# Statutes at Large, extracted from PDFs. We can get page
+				# counts by looking for our replacement of the form feed
+				# character put in by pdftotext. We only have that when
+				# we extracted text from PDFs, which we only did for
+				# the Statutes at Large. We can't do this on modern bills
+				# where the text came from GPO plain text format.
+				pp = len([pgtext for pgtext in plain_text.split("\n=============================================\n") if pgtext.strip() != ""])
+		except IOError:
+			pp = None
+		if pp is None:
+			missing_text += 1
+			continue
 
-		all["count"] += 1
-		all["pages"] += pp
-		all["words"] += wds
-		if b.current_status in BillStatus.final_status_enacted_bill:
-			enacted["count"] += 1
-			enacted["pages"] += pp
-			enacted["words"] += wds
+		wds = len(plain_text.split(" "))
 
+		counters["pages"].append(pp)
+		counters["words"].append(wds)
 
-	print(congress, all["count"], all["pages"], all["words"], enacted["count"], enacted["pages"], enacted["words"])
-	print("\t", missing_text, "missing text")
+	medians = [int(round(median(counters[k]))) if counters[k] else "" for k in ("pages", "words")] # median([]) is NaN
+	W.writerow([congress, "{}-{}".format(*get_congress_years(congress)),
+		len(counters["pages"]), sum(counters["pages"]), sum(counters["words"])]
+		+ medians + [missing_text])
 
-doit(114)
+for c in range(82, CURRENT_CONGRESS+1):
+	count_pages_of_bills(c)
diff --git a/analysis/find_recursive_acronyms.py b/analysis/find_recursive_acronyms.py
new file mode 100644
index 00000000..009ac2a1
--- /dev/null
+++ b/analysis/find_recursive_acronyms.py
@@ -0,0 +1,38 @@
+import glob
+import json
+import re
+
+# Iterate over the data.json file of every bill in Congress directories matching "11*"...
+for fn in glob.glob("data/congress/11*/bills/*/*/data.json"):
+  with open(fn) as f:
+    bill = json.load(f)
+  for title in bill.get("titles", []):
+    title = title["title"]
+
+    # Okay, now the fun part...
+
+    # Does it start with a run of two or more capital letters? (A trailing " Act" or " Act of YYYY" is split off.)
+    m = re.match(r"^([A-Z]{2,})(.*?)( Act(?: of \d\d\d\d)?)?$", title)
+    if not m: continue
+    acronym, remainder, act_of_year = m.groups() # act_of_year is not used below
+    remainder = remainder.strip()
+
+    # The remainder must be longer than the acronym minus its first letter.
+    if len(remainder) <= len(acronym)-1: continue
+
+    # Does the potential acronym match the remainder of the title?
+    # Each letter in the acronym, after the first (which matches the
+    # acronym itself, if it's recursive) must match another letter
+    # in the title. Normally it must match on capital letters, but
+    # that's too strict. Every capital letter in the title must match,
+    # and other lowercase letters and the "A" in "Act (of YYYY)" may
+    # also be used to match.
+    remainder_re = re.split("([A-Z])", remainder) # the capturing group keeps each capital letter as its own item
+    remainder_re = [r for r in remainder_re if len(r.strip()) > 0]
+    remainder_re = "".join(
+      r if re.match("[A-Z]$", r)
+      else "[" + "".join(re.escape(c) for c in r if c != " ") + "]*" # optional run drawn from these characters
+      for r in remainder_re
+    )
+    if re.match("^" + remainder_re + "A?$", acronym[1:], re.I):
+      print(title)
diff --git a/analysis/sponsor_counts.py b/analysis/sponsor_counts.py
new file mode 100755
index 00000000..cf1faef0
--- /dev/null
+++ b/analysis/sponsor_counts.py
@@ -0,0 +1,18 @@
+#!script
+
+from django.db.models import Count
+from bill.models import Bill
+import csv, sys
+
+# Tally how many bills each sponsor introduced within the date window.
+tally = { }
+for bill in Bill.objects.filter(
+    #introduced_date__gte="2019-01-01"
+    introduced_date__gte="2017-01-01", introduced_date__lte="2017-08-06"
+  ).only("sponsor"):
+  tally[bill.sponsor] = tally.get(bill.sponsor, 0) + 1
+
+# Emit one CSV row per sponsor, largest tally first; ties keep tally order.
+writer = csv.writer(sys.stdout)
+for person, n in sorted(tally.items(), key = lambda item : item[1], reverse=True):
+  writer.writerow([person, n])
diff --git a/templates/bill/bill_statistics.html b/templates/bill/bill_statistics.html
index 7b8cf9ca..eceab534 100644
--- a/templates/bill/bill_statistics.html
+++ b/templates/bill/bill_statistics.html
@@ -86,6 +86,7 @@ <h1>Statistics and Historical Comparison</h1>
 	<h2>Bills by Final Status</h2>
 	
 		<p>This table breaks down the bills and resolutions introduced in each two-year Congress by their final status. Note that the current Congress is not yet finished.</p>
+		<p>Since World War II (the earliest period for which we have data), Congress has typically enacted 4-6 million words of new law in each two-year Congress. However, those words have been enacted in fewer but larger bills. Therefore, the generally decreasing number of bills enacted into law does not mean that less legislative work is occurring.</p>
 
 		<p style="text-align: center">
 			<a href="#" onclick="return show_stats_style(true, true)">Counts &amp; Percents</a>
diff --git a/us.py b/us.py
index 10d56782..5872b5f1 100644
--- a/us.py
+++ b/us.py
@@ -54,6 +54,12 @@ def get_congress_dates(congressnumber):
         CONGRESS_DATES.update(cd)
     return CONGRESS_DATES[congressnumber]
 
+def get_congress_years(congressnumber):
+    """Return (first_year, last_year) for a Congress; an end date in January counts as the prior year."""
+    dates = get_congress_dates(congressnumber)
+    last_year = dates[1].year - 1 if dates[1].month == 1 else dates[1].year
+    return (dates[0].year, last_year)
+
 def get_session_from_date(when, allow_start_date=True, allow_end_date=True, congress=None):
     global SESSION_DATES
     if SESSION_DATES == [ ]: