/
count_pages_of_bills.py
executable file
·69 lines (55 loc) · 1.92 KB
/
count_pages_of_bills.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!script
import sys, csv
from collections import defaultdict
from django.db.models import Count
from settings import CURRENT_CONGRESS
from bill.models import Bill, BillStatus
from bill.billtext import load_bill_text
from us import get_congress_years
from numpy import median
W = csv.writer(sys.stdout)
W.writerow([
"congress",
"years",
"bills",
"pages",
"words",
"median pages per bill",
"median words per bill",
"bills_with_missing_text",
])
def count_pages_of_bills(congress):
counters = defaultdict(lambda : [])
missing_text = 0
qs = Bill.objects.filter(congress=congress)\
.filter(current_status__in=BillStatus.final_status_enacted_bill)
for b in qs:
plain_text = load_bill_text(b, None, plain_text=True)
if congress >= 103:
# Bills since 1993 have GPO MODS XML metadata with page counts.
try:
pp = load_bill_text(b, None, mods_only=True).get("numpages")
except IOError:
missing_text += 1
continue
if pp is None:
missing_text += 1
continue
else:
# For historical statutes we only have plain text from the
# Statutes at Large, extracted from PDFs. We can get page
# counts by looking for our replacement of the form feed
# character put in by pdftotext. We only have that when
# we extracted text from PDFs, which we only did for
# the Statutes at Large. We can't do this on modern bills
# where the text came from GPO plain text format.
pp = len([pgtext for pgtext in plain_text.split("\n=============================================\n") if pgtext.strip() != ""])
wds = len(plain_text.split(" "))
counters["pages"].append(pp)
counters["words"].append(wds)
W.writerow([congress, "{}-{}".format(*get_congress_years(congress)),
len(counters["pages"]), sum(counters["pages"]), sum(counters["words"]),
int(round(median(counters["pages"]))), int(round(median(counters["words"]))),
missing_text])
for c in range(82, CURRENT_CONGRESS+1):
count_pages_of_bills(c)