-
Notifications
You must be signed in to change notification settings - Fork 16
/
convert_rtf.py
156 lines (127 loc) · 5.73 KB
/
convert_rtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import sys
import subprocess
import logging
import logging.config
import sqlalchemy as sa
import datetime
import signal
import os
from django.core.management.base import BaseCommand
from django.conf import settings
from django.db.models import Max
from councilmatic_core.models import Bill
logging.config.dictConfig(settings.LOGGING)
logging.getLogger("requests").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
DB_CONN = 'postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'
engine = sa.create_engine(DB_CONN.format(**settings.DATABASES['default']),
convert_unicode=True,
server_side_cursors=True)
class Command(BaseCommand):
help = 'Converts rtf-formatted legislative text to valid html'
def add_arguments(self, parser):
parser.add_argument(
'--update_all',
default=False,
action='store_true',
help='Update html_text in all bills.')
parser.add_argument(
'--update_empty',
default=False,
action='store_true',
help='Update bills that currently do not have html_text.')
def handle(self, *args, **options):
self.connection = engine.connect()
self.update_all = options['update_all']
self.update_empty = options['update_empty']
'''
This command converts RTF from Legistar into valid HTML.
The conversion employs "unoconv" - a CLI tool that imports and exports documents in LibreOffice. We run unoconv as a daemon process and kill it when the conversions finish.
Three steps occur: (1) querying the database for bill full_text (i.e., the RTF from Legistar),
(2) iteration over the query results, conversion to html, and creation of an inserts string,
(3) updating the full_text field with new html.
'''
listener = subprocess.Popen(['unoconv', '--listener'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
try:
self.add_html()
finally:
listener.terminate()
def get_rtf(self):
self.connection.execute("SET local timezone to '{}'".format(settings.TIME_ZONE))
with engine.begin() as connection:
'''
The script, by default, only converts the most recently updated bills:
the `max_updated` timestamp and the third (last) query option
helps accomplish this.
Specifically, the query determines which bills have been
recently updated (or created) in the Councilmatic database,
by looking for bills with an `updated_at` (i.e., max_updated)
timestamp of equal or greater value.
'''
max_updated = Bill.objects.all().aggregate(Max('updated_at'))['updated_at__max']
if max_updated is None or self.update_all:
query = '''
SELECT ocd_id, full_text
FROM councilmatic_core_bill
WHERE full_text is not null
ORDER BY updated_at DESC
'''
elif self.update_empty:
query = '''
SELECT ocd_id, full_text
FROM councilmatic_core_bill
WHERE html_text is null
AND full_text is not null
ORDER BY updated_at DESC
'''
else:
query = '''
SELECT ocd_id, full_text
FROM councilmatic_core_bill
WHERE updated_at >= :max_updated
AND full_text is not null
ORDER BY updated_at DESC
'''
result = connection.execution_options(stream_results=True).execute(sa.text(query), max_updated=max_updated)
yield from result
def convert_rtf(self):
rtf_results = self.get_rtf()
logger.info('Converting RTF to HTML....')
inserts = ''
for bill_data in rtf_results:
ocd_id = bill_data['ocd_id']
rtf_string = bill_data['full_text']
try:
# For Python 3.4 and below
process = subprocess.Popen(['unoconv', '--stdin', '--stdout', '-f', 'html'], preexec_fn=os.setsid, stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL)
html_data, stderr_data = process.communicate(input=rtf_string.encode(), timeout=15)
html = html_data.decode('utf-8')
except subprocess.TimeoutExpired as e:
os.killpg(os.getpgid(process.pid), signal.SIGTERM)
logger.error(e)
logger.error('Look at bill {}'.format(ocd_id))
continue
logger.info('Successful conversion of {}'.format(ocd_id))
yield {'html': html, 'ocd_id': ocd_id}
def add_html(self):
html_results = self.convert_rtf()
self.connection.execute("SET local timezone to '{}'".format(settings.TIME_ZONE))
query = '''
UPDATE councilmatic_core_bill as bills
SET html_text = :html
WHERE bills.ocd_id = :ocd_id
'''
chunk = []
# for html, ocd_id in html_results:
for bill_dict in html_results:
chunk.append(bill_dict)
if len(chunk) == 20:
with self.connection.begin() as trans:
self.connection.execute(sa.text(query), chunk)
chunk = []
# Update bills when less than 1,000 elements in a chunk.
if chunk:
with self.connection.begin() as trans:
self.connection.execute(sa.text(query), chunk)
logger.info('Bills have valid, viewable HTML!')