Skip to content

Commit

Permalink
new script to get total number of pages/words enacted by Congress, pl…
Browse files Browse the repository at this point in the history
…us adding a note to the statistics page about how counting bills isn't interesting
  • Loading branch information
JoshData committed Apr 1, 2020
1 parent 0bdb9ad commit 92e1e42
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 22 deletions.
37 changes: 37 additions & 0 deletions analysis/bill_status_totals.py
@@ -0,0 +1,37 @@
#!script

from collections import defaultdict
import csv
import sys

from django.db.models import Count

from bill.models import *

# Collect per-(congress, bill type, status) bill counts. The aggregation is
# done by the database: each record carries a "count" of matching bills.
data = Bill.objects.filter(congress__gte=93).values("congress", "bill_type", "current_status").annotate(count=Count('id'))
data = list(data) # fetch all

# Replace numeric bill type and status with enum value and get the domain of statuses.
all_statuses = set()
all_bill_types = set()
for rec in data:
    rec["bill_type"] = BillType.by_value(rec["bill_type"])
    rec["current_status"] = BillStatus.by_value(rec["current_status"])
    all_statuses.add(rec["current_status"])
    all_bill_types.add(rec["bill_type"])

# Sort statuses in our canonical order.
all_statuses = sorted(all_statuses, key = lambda status : status.sort_order)

# Form a matrix keyed by (congress, bill type, status). Add the aggregated
# count from each record (not 1 per record), since a record stands for a
# whole group of bills. Missing combinations read as zero.
matrix = defaultdict(int)
for rec in data:
    matrix[(rec["congress"], rec["bill_type"], rec["current_status"])] += rec["count"]

# Output: a header row, then one row per congress and bill type with one
# column per status.
W = csv.writer(sys.stdout)
W.writerow(["congress", "bill type"] + [status.key for status in all_statuses])
if data:
    # min()/max() raise ValueError on an empty sequence, so only compute the
    # congress range when the query returned something.
    congresses = [rec["congress"] for rec in data]
    for congress in range(min(congresses), max(congresses)+1):
        for bill_type in all_bill_types:
            W.writerow([congress, bill_type.label] + [matrix[(congress, bill_type, status)] for status in all_statuses])
74 changes: 52 additions & 22 deletions analysis/count_pages_of_bills.py
@@ -1,6 +1,6 @@
#!script

import sys, tqdm
import sys, csv
from collections import defaultdict

from django.db.models import Count
Expand All @@ -10,30 +10,60 @@
from bill.models import Bill, BillStatus
from bill.billtext import load_bill_text

def doit(congress):
all = defaultdict(lambda : 0)
enacted = defaultdict(lambda : 0)
from us import get_congress_years

from numpy import median

# The report is emitted as CSV on standard output; the header row comes first.
W = csv.writer(sys.stdout)
W.writerow((
    "congress", "years",
    "bills", "pages", "words",
    "median pages per bill", "median words per bill",
    "bills_with_missing_text",
))

def count_pages_of_bills(congress):
    """Write one CSV row summarizing the enacted bills of one Congress.

    The row holds the Congress number, its year range, the number of
    enacted bills whose text could be measured, their total pages and words,
    the median pages and words per bill, and the number of enacted bills
    skipped because their text or page count was unavailable.

    congress -- the number of the Congress to report on.

    The row is written with the module-level CSV writer ``W``; nothing is
    returned.
    """
    pages = []
    words = []
    missing_text = 0

    qs = Bill.objects.filter(congress=congress)\
        .filter(current_status__in=BillStatus.final_status_enacted_bill)
    for b in qs:
        # Both text loads sit inside the try so that a bill with no stored
        # text is tallied as missing instead of aborting the whole report.
        # NOTE(review): assumes the plain-text load signals missing text with
        # IOError like the MODS load does -- confirm against load_bill_text.
        try:
            plain_text = load_bill_text(b, None, plain_text=True)

            if congress >= 103:
                # Bills since 1993 have GPO MODS XML metadata with page counts.
                pp = load_bill_text(b, None, mods_only=True).get("numpages")
            else:
                # For historical statutes we only have plain text from the
                # Statutes at Large, extracted from PDFs. We can get page
                # counts by looking for our replacement of the form feed
                # character put in by pdftotext. We only have that when
                # we extracted text from PDFs, which we only did for
                # the Statutes at Large. We can't do this on modern bills
                # where the text came from GPO plain text format.
                pp = len([pgtext for pgtext in plain_text.split("\n=============================================\n") if pgtext.strip() != ""])
        except IOError:
            missing_text += 1
            continue

        # The metadata may lack a page count; such bills are also skipped.
        if pp is None:
            missing_text += 1
            continue

        # NOTE(review): split(" ") only breaks on single spaces, so words
        # separated by newlines are counted together and runs of spaces add
        # empty tokens. Kept as-is so previously published totals remain
        # comparable; consider split() if the methodology is revisited.
        wds = len(plain_text.split(" "))

        pages.append(pp)
        words.append(wds)

    # The median of an empty list is NaN, which int(round(...)) cannot
    # convert. Leave the median cells blank when no bill could be measured.
    median_pages = int(round(median(pages))) if pages else ""
    median_words = int(round(median(words))) if words else ""

    W.writerow([congress, "{}-{}".format(*get_congress_years(congress)),
        len(pages), sum(pages), sum(words),
        median_pages, median_words,
        missing_text])

# Emit one report row for each Congress from the 82nd through the current one.
# (The earlier one-off call to doit(114) is gone: that function was replaced
# by count_pages_of_bills.)
for c in range(82, CURRENT_CONGRESS+1):
    count_pages_of_bills(c)
38 changes: 38 additions & 0 deletions analysis/find_recursive_acronyms.py
@@ -0,0 +1,38 @@
import glob
import json
import re

# A title of interest starts with a run of two or more capital letters (the
# candidate acronym) and may end with " Act" or " Act of YYYY".
TITLE_RE = re.compile(r"^([A-Z]{2,})(.*?)( Act(?: of \d\d\d\d)?)?$")


def is_recursive_acronym(title):
    """Return True if the title looks like a recursive acronym.

    The title must start with a two-or-more capital letter sequence. The
    first letter of that acronym is taken to stand for the acronym itself;
    the remaining letters must be spelled out by the rest of the title.

    title -- the bill title as a string.
    """
    m = TITLE_RE.match(title)
    if not m:
        return False
    acronym, remainder, act_of_year = m.groups()
    remainder = remainder.strip()

    # The remainder must be at least as long as the acronym (after the first letter).
    if len(remainder) <= len(acronym)-1:
        return False

    # Does the potential acronym match the remainder of the title?
    # Each letter in the acronym, after the first (which matches the
    # acronym itself, if it's recursive) must match another letter
    # in the title. Normally it must match on capital letters, but
    # that's too strict. Every capital letter in the title must match,
    # and other lowercase letters and the "A" in "Act (of YYYY)" may
    # also be used to match.
    pieces = [p for p in re.split("([A-Z])", remainder) if len(p.strip()) > 0]
    remainder_re = "".join(
        # A capital letter is required verbatim; any other run of characters
        # becomes an optional character class (spaces dropped).
        p if re.match("[A-Z]$", p)
        else "[" + "".join(re.escape(c) for c in p if c != " ") + "]*"
        for p in pieces
    )
    return re.match("^" + remainder_re + "A?$", acronym[1:], re.I) is not None


# Iterate over all bills, in sorted path order so the output is reproducible
# from run to run.
for fn in sorted(glob.glob("data/congress/11*/bills/*/*/data.json")):
    # JSON is UTF-8; don't depend on the platform's default encoding.
    with open(fn, encoding="utf-8") as f:
        bill = json.load(f)
    for title in bill.get("titles", []):
        title = title["title"]

        # Okay, now the fun part...
        if is_recursive_acronym(title):
            print(title)
18 changes: 18 additions & 0 deletions analysis/sponsor_counts.py
@@ -0,0 +1,18 @@
#!script

from django.db.models import Count
from bill.models import Bill
import csv, sys

# Tally the bills introduced in the date window by sponsor.
bills_in_window = Bill.objects.filter(
    # Alternate window: introduced_date__gte="2019-01-01"
    introduced_date__gte="2017-01-01", introduced_date__lte="2017-08-06"
    ).only("sponsor")

counts = { }
for b in bills_in_window:
    sponsor = b.sponsor
    counts.setdefault(sponsor, 0)
    counts[sponsor] += 1

# Rank as [count, sponsor] pairs, largest count first. The sort is stable, so
# sponsors with equal counts stay in first-seen order.
counts = sorted(
    ([total, sponsor] for sponsor, total in counts.items()),
    key = lambda pair : pair[0],
    reverse = True)

# Write "sponsor, count" rows as CSV on standard output.
W = csv.writer(sys.stdout)
W.writerows([person, count] for count, person in counts)
1 change: 1 addition & 0 deletions templates/bill/bill_statistics.html
Expand Up @@ -86,6 +86,7 @@ <h1>Statistics and Historical Comparison</h1>
<h2>Bills by Final Status</h2>

<p>This table breaks down the bills and resolutions introduced in each two-year Congress by their final status. Note that the current Congress is not yet finished.</p>
<p>Since World War II (the earliest period for which we have data), Congress has typically enacted 4-6 million words of new law in each two-year Congress. However, those words have been enacted in fewer but larger bills. Therefore, the generally decreasing number of bills enacted into law does not mean that less legislative work is occurring.</p>

<p style="text-align: center">
<a href="#" onclick="return show_stats_style(true, true)">Counts &amp; Percents</a>
Expand Down
6 changes: 6 additions & 0 deletions us.py
Expand Up @@ -54,6 +54,12 @@ def get_congress_dates(congressnumber):
CONGRESS_DATES.update(cd)
return CONGRESS_DATES[congressnumber]

def get_congress_years(congressnumber):
    """Return the (first year, last year) pair covered by a Congress.

    The years come from the start and end dates given by
    get_congress_dates(). When the end date falls in January, the last year
    is reported as the preceding calendar year.
    """
    span = get_congress_dates(congressnumber)
    begins, ends = span[0], span[1]
    # A January end date is attributed to the year before.
    last_year = ends.year - 1 if ends.month == 1 else ends.year
    return (begins.year, last_year)

def get_session_from_date(when, allow_start_date=True, allow_end_date=True, congress=None):
global SESSION_DATES
if SESSION_DATES == [ ]:
Expand Down

0 comments on commit 92e1e42

Please sign in to comment.