"""
Scrapes the website of the Federal Reserve for meetings related to the
implementation of Dodd-Frank. It outputs data in a format that is more
easily consumable for our Dodd-Frank Tracker[1]. The data provided here
is still pretty raw, but easily consumable by other tools.
[1] http://reporting.sunlightfoundation.com/doddfrank/
"""
import re
from collections import defaultdict
from datetime import datetime

import lxml.etree
import scraperwiki
from bs4 import BeautifulSoup
BASE = "http://www.federalreserve.gov"
#Maps each meeting category to the URL path of its Fed meetings page:
FED_MEETINGS = {
"Systemic Designations, Enhanced Prudential Standards, and Banking Supervision and Regulation": "/newsevents/reform_systemic.htm",
"Derivatives Markets and Products": "/newsevents/reform_derivatives.htm",
"Interchange Fees": "/newsevents/reform_interchange.htm",
"Payments, Settlement and Clearing Activities and Utilities": "/newsevents/reform_payments.htm",
"Consumer Financial Protection": "/newsevents/reform_consumer.htm",
"Resolution Framework": "/newsevents/reform_resolution.htm"
}
CompanySuffixPattern1 = re.compile(r', (LLC|LLP|MLP|Corp|Corporation|Inc)[.]?', re.IGNORECASE)
# The (?!\w) guard keeps '& Co' from matching inside a longer word like '& Cohen'.
CompanySuffixPattern2 = re.compile(r'& Co(mpany|\.)?(?!\w)', re.IGNORECASE)
def fix_company_suffixes(s):
t = CompanySuffixPattern1.sub(r' \1', s)
u = CompanySuffixPattern2.sub(r'', t)
return u
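# Illustrative behavior (hypothetical inputs, not from the scraped data):
#   fix_company_suffixes('Acme, Inc.')          -> 'Acme Inc'
#   fix_company_suffixes('Goldman Sachs & Co.') -> 'Goldman Sachs '
# Note that removing '& Co.' leaves a trailing space on the result.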
def taketokens(tokens, oftype=None):
    """Destructively pops and returns the leading run of tokens, optionally
    restricted to tokens of a single type."""
    def gen():
        while tokens:
            (toktype, text) = tokens[0]
            if oftype is None or oftype == toktype:
                yield tokens.pop(0)
            else:
                return
    return list(gen())
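# Example of the destructive pop semantics (hypothetical token list):
#   toks = [('name', 'A'), ('name', 'B'), ('adornment', 'Acme')]
#   taketokens(toks, oftype='name')  -> [('name', 'A'), ('name', 'B')]
#   toks is now [('adornment', 'Acme')]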
def tokenize(elem, url=''):
def stage1(elem):
"""Yields text fragments, accounting for the .text/.tail etree quirk."""
for d in elem.iterdescendants():
for text in [d.text, d.tail]:
if text is not None and len(text.strip()) > 0:
# This used to split text on semicolons (;). I don't even remember
# why it did so, but it was causing 'Michael William' and 'Annette Nazareth'
# to be merged in this doc:
# http://www.federalreserve.gov/newsevents/files/creditsuisse_meeting__20101108b.pdf
yield text
def stage2(tokens):
"""Finds section markers."""
for text in tokens:
            colon_offset = text.find(':')
            # Don't treat a colon that starts a URL scheme ("://") as a
            # section delimiter.
            if colon_offset == -1 or text[colon_offset:].startswith('://'):
                if text == 'Summary':
                    yield ('section_marker', text)
                else:
                    yield ('text', text)
else:
stripped = text.strip()
if stripped.endswith('):'):
yield ('text', stripped[:-1])
elif stripped.lower().endswith('footnote:'):
# Documents like this one have footnotes that mimic section markers:
# http://www.federalreserve.gov/newsevents/files/credit_suisse_meeting__20100920.pdf
yield ('text', text)
else:
prefix = text[:colon_offset].strip()
rest = text[colon_offset+1:].strip()
if prefix:
yield ('section_marker', prefix)
if rest:
yield ('text', rest)
def stage3(tokens):
"""Differentiates section content vs the prelude, accounting for special sections."""
section_name = None
for (toktype, text) in tokens:
if toktype == 'text':
if section_name is None:
yield ('prelude', text)
elif section_name in ('Summary', 'Meeting Summary'):
yield ('summary_text', text)
else:
# Some PDFs have URLs embedded in the list of names
# E.g. http://www.federalreserve.gov/newsevents/files/CAC_Meeting_20110615.pdf
text = URLPattern.sub('', text)
yield ('section_content', text)
elif toktype == 'section_marker':
section_name = text
yield (toktype, text)
else:
yield (toktype, text)
    def participants_style(tokens):
        """
        Most PDFs use a participant list style like:
            first_name1 last_name1 (org_name1), first_name2 last_name2 (org_name2)
        Others, though, list one participant per line:
            first_name1 last_name1, title1, org_name1
            first_name2 last_name2, title2, org_name2
        Examples of the latter style:
            http://www.federalreserve.gov/newsevents/files/CUNA_meeting_20110218.pdf
            http://www.federalreserve.gov/newsevents/files/TCH_meeting_20110204.pdf
        This one erroneously triggers the parsing logic for the latter style, but
        returns proper results because its Participants section is technically empty:
            http://www.federalreserve.gov/newsevents/files/GE_Capital_meeting_20101217.pdf
        TODO: These PDFs thwart the current logic:
            http://www.federalreserve.gov/newsevents/files/commerce_meeting_102710.pdf
            http://www.federalreserve.gov/newsevents/files/shadow-financial-regulatory-committee-20120212.pdf
        Puzzling cases that merely refer to attached attendee lists:
            http://www.federalreserve.gov/newsevents/files/odfr-meeting-20111004.pdf (See attached two Attendee Lists)
            http://www.federalreserve.gov/newsevents/files/SIFMA_Meeting_20110510.pdf (See attached)
            http://www.federalreserve.gov/newsevents/files/OTC_industry_meeting_20110127.pdf (See attached participant list)
            http://www.federalreserve.gov/newsevents/files/ODRF_meeting_20101102.pdf (See attached three Attendee Lists)
        """
section_name = ''
has_participants_section = False
participants_section_empty = True
and_word_seen = False
open_paren_seen = False
closing_paren_seen = False
for (toktype, text) in tokens:
if toktype == 'section_marker':
section_name = text
if section_name == 'Participants':
has_participants_section = True
elif toktype == 'section_content' and section_name == 'Participants':
                if participants_section_empty:
                    participants_section_empty = len(text.strip()) == 0
                if not open_paren_seen:
                    open_paren_seen = '(' in text
                if not closing_paren_seen:
                    closing_paren_seen = ')' in text
                if not and_word_seen:
                    and_word_seen = re.search(r'\band\b', text) is not None
        return 'b' if (has_participants_section
                       and not participants_section_empty
                       and not open_paren_seen
                       and not closing_paren_seen
                       and not and_word_seen) else 'a'
    def stage4_a(tokens):
        """Collapses section content."""
        accum = []
        for (toktype, text) in tokens:
            if toktype == 'section_marker':
                joined = ' '.join(accum)
                accum = []
                yield ('section_content', joined)
                yield (toktype, text)
            elif toktype == 'section_content':
                accum.append(text.strip())
            else:
                yield (toktype, text)
        # Flush trailing content so the final section is not silently dropped
        # when no further section marker follows it.
        if accum:
            yield ('section_content', ' '.join(accum))
    def stage4_b(tokens):
        """Collapses section content."""
        accum = []
        section_name = ''
        for (toktype, text) in tokens:
            if toktype == 'section_marker':
                section_name = text
                joined = ' '.join(accum)
                accum = []
                yield ('section_content', joined)
                yield (toktype, text)
            elif toktype == 'section_content' and section_name != 'Participants':
                accum.append(text.strip())
            else:
                yield (toktype, text)
        # Flush trailing content, as in stage4_a.
        if accum:
            yield ('section_content', ' '.join(accum))
def stage5_a(tokens):
"""Identifies organization names that adorn names in the name sections."""
for (toktype, text) in tokens:
if toktype == 'section_content':
if text:
names = splitnames(text)
for name in names:
(name, adornment) = extract_adornment(name)
if name:
yield ('name', name)
if adornment:
adornment = fix_company_suffixes(adornment)
if adornment:
yield ('adornment', adornment)
else:
yield (toktype, text)
def stage5_b(tokens):
"""Identifies organization names that adorn names in the name sections."""
for (toktype, text) in tokens:
if toktype == 'section_content':
if text:
parts = text.split(',')
yield ('name', parts[0].strip())
if len(parts) > 1:
adornment = fix_company_suffixes(parts[-1].strip())
if adornment:
yield ('adornment', adornment)
else:
yield (toktype, text)
    # early_stage is a list, so it can be re-iterated by the stage4 pass.
    early_stage = list(stage3(stage2(stage1(elem))))
    if participants_style(early_stage) == 'a':
        return stage5_a(stage4_a(early_stage))
    else:
        print "Using parsing path B for {url}".format(url=url)
        return stage5_b(stage4_b(early_stage))
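# Sketch of the token stream tokenize() yields for a hypothetical path-A page
# whose first page reads "Participants: John Smith (Acme Corp) and Jane Doe":
#   ('section_marker', 'Participants'), ('name', 'John Smith'),
#   ('adornment', 'Acme Corp'), ('name', 'Jane Doe')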
# The mr|mrs|ms honorifics get swallowed by the split as a work-around for
# Python's lack of variable-width negative look-behinds. Example document:
# http://www.federalreserve.gov/newsevents/files/Meeting-between-Governor-Tarullo-and-Paul-Volcker-20111020.pdf
# This PDF requires the 'including' separator:
# http://www.federalreserve.gov/newsevents/files/CME_meeting_20100824.pdf
# 'PC' stands for "professional corporation", as seen here:
# http://www.federalreserve.gov/newsevents/files/financial-services-roundtable-meeting-20111120.pdf
NameSplitPattern = re.compile(r'(\([^)]*\(.*?\).*?\))|(\(.*?\))|[;]|(?:(?:mr|mrs|ms)\.)|(?<!\b[A-Z])\. |,(?! (?:I{1,3}\b|jr|sr|esq|inc|corp|llc|p\.?c\.?))|(?:, )?\b(?:and|including)\b', re.IGNORECASE)
def splitnames(names):
names = NameSplitPattern.split(names)
names = [n1
for n1 in
(n.strip() for n in names if n and n.strip())
if n1 != '.'] # We want to keep trailing periods on abbreviations but not sentence enders.
return names
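# Illustrative split (hypothetical input):
#   splitnames('John Smith (Acme Corp), Jane Doe and Bob Lee')
#   -> ['John Smith', '(Acme Corp)', 'Jane Doe', 'Bob Lee']
# Parenthesized fragments survive as their own entries; extract_adornment()
# below turns them into organization adornments.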
class ParseError(Exception):
    """Raised when a PDF's first page doesn't match the expected layout."""
def find_section_marker(tokens):
while True:
try:
token = tokens.pop(0)
(toktype, text) = token
if toktype == 'section_marker':
return text
except IndexError:
raise ParseError('Unexpected end of input while searching for section marker')
OrgAdornmentSuffixPattern = re.compile(ur'([^()]*? )?\((.*?)\)$', re.UNICODE)
def extract_adornment(text):
m = OrgAdornmentSuffixPattern.match(text)
if m is None:
return (text, None)
else:
# Some documents quote the organization name. E.g.
# http://www.federalreserve.gov/newsevents/files/NAREIT-meeting-120611.pdf
return (m.group(1), m.group(2).strip('"').strip("'"))
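# Illustrative behavior (hypothetical inputs):
#   extract_adornment('Jane Doe (Widget LLC)') -> ('Jane Doe ', 'Widget LLC')
#   extract_adornment('(Widget LLC)')          -> (None, 'Widget LLC')
#   extract_adornment('Jane Doe')              -> ('Jane Doe', None)
# Note the trailing space on a captured name; callers only test truthiness.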
def is_plural(text):
lc_text = text.lower()
if lc_text.endswith(' staff'):
return True
if lc_text.endswith(' board'):
return True
if lc_text.endswith('s') and not lc_text.endswith('ss'):
return True
return False
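# Illustrative behavior (hypothetical inputs):
#   is_plural('Board Members')         -> True   (ends in 's')
#   is_plural('Federal Reserve Board') -> True   (' board' suffix)
#   is_plural('Congress')              -> False  ('ss' endings are excluded)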
def parse(tokens, merge_sections=True):
    """Groups the token stream into (section_or_org_name, names) pairs, plus
    an optional ('Summary', text) entry."""
    groups = []
    taketokens(tokens, oftype='prelude')
    while tokens:
        section_name = find_section_marker(tokens)
        if section_name == 'Summary':
            summary_text = ' '.join(text.strip()
                                    for (toktype, text) in
                                    taketokens(tokens, oftype='summary_text'))
            groups.append(('Summary', summary_text))
            break
        else:
            while True:
                names = [text for (toktype, text) in taketokens(tokens, oftype='name')]
                if not names:
                    break
                adornments = taketokens(tokens, oftype='adornment')
                if not adornments:
                    groups.append((section_name, names))
                else:
                    (_, orgname) = adornments[-1]
                    if is_plural(orgname) or len(names) == 1 or section_name == 'Participants':
                        groups.append((orgname, names))
                    else:
                        groups.append((section_name, names[:-1]))
                        groups.append((orgname, [names[-1]]))
    if merge_sections:
        merged = defaultdict(list)
        for (section_name, section_content) in groups:
            if section_name == 'Summary':
                merged['Summary'] = section_content
            else:
                merged[section_name].extend(section_content)
        return merged.items()
    return groups
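# Sketch of parse() on a hypothetical token stream (as produced by tokenize):
#   tokens = [('section_marker', 'Participants'),
#             ('name', 'John Smith'), ('adornment', 'Acme Corp'),
#             ('section_marker', 'Summary'),
#             ('summary_text', 'Discussed swaps.')]
#   parse(tokens) -> [('Acme Corp', ['John Smith']),
#                     ('Summary', 'Discussed swaps.')]
# Item order may vary because merged sections pass through a dict.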
def main():
for category, url in FED_MEETINGS.iteritems():
print "Scraping category {0}".format(category)
scrape_page(BASE+url, category)
def get_pdf_text(root):
    '''
    Recursively find all the text below the root node of an XML-parsed PDF.
    '''
    text = ""
    if root.text:
        text += " " + root.text
    for element in root:
        text += get_pdf_text(element)
        # Include text trailing a child element (the etree .text/.tail quirk
        # that stage1 in tokenize() also works around).
        if element.tail:
            text += " " + element.tail
    return text
def scrape_pdf(url):
'''
Scrape data from PDF at URL.
'''
    try:
        pdf_data = scraperwiki.scrape(url)
    except Exception:
        # A failed download shouldn't abort the whole category scrape.
        return (None, None, None)
    pdf_xml = scraperwiki.pdftoxml(pdf_data)
root = lxml.etree.fromstring(pdf_xml)
page0 = root.find('page')
try:
content = dict(parse(list(tokenize(page0, url))))
except ParseError:
content = None
full_text = get_pdf_text(root)
return pdf_xml, full_text, content
SubCategoryPrefixPattern = re.compile(r'^[a-z]\. ')
def scrape_page(url, category):
'''
Scrapes a single page of the Fed meeting logs.
url: the page url
category: String for the category of all meetings on page.
'''
raw_page = scraperwiki.scrape(url)
page_soup = BeautifulSoup(raw_page)
table = page_soup.find('table', {"class":"earegulate"})
    #Find the table and get the entry for each row.
all_entries = []
all_rows = table.find_all("tr")
attendees = []
subcategory = ""
for row in all_rows:
new_entry = {}
cols = row.find_all('td')
        if len(cols) != 3:
            header = row.find('th')
            # Guard against header-less filler rows.
            if header is not None and "Attachment" not in header.get_text():
                subcategory = SubCategoryPrefixPattern.sub('', header.get_text().strip())
                print "Subcategory: {0!r} vs {1!r}".format(subcategory, header.get_text().strip())
            continue
new_entry['type'] = cols[2].get_text().strip()
if new_entry['type'].lower() not in ('meeting', 'communication', 'presentation'):
continue
new_entry['category'] = category
new_entry['subcategory'] = subcategory
new_entry['fullname'] = cols[0].get_text().strip()
new_entry['name'] = (new_entry['fullname'].replace("(PDF)", "")
.replace("Meeting", "")
.strip())
        link = cols[0].find('a').get('href')
        if not link.startswith('http'):
            link = BASE + link
new_entry['link'] = link
pdf_xml, pdf_text, content = scrape_pdf(link)
#new_entry['pdftext'] = pdf_text
#new_entry['pdfxml'] = pdf_xml
date_raw = cols[1].get_text().strip()
        try:
            date = datetime.strptime(date_raw, "%m/%d/%Y").date()
        except ValueError:
            # Fall back for date cells that hold a range
            # (e.g. "10/01/2010-10/02/2010"); keep the first date.
            date = datetime.strptime(date_raw.split('-')[0], "%m/%d/%Y").date()
print new_entry['name'] + ": " + date_raw
new_entry['date'] = date
all_entries.append(new_entry)
if content:
# This PDF is erroneously skipped by this logic:
# http://www.federalreserve.gov/newsevents/files/ALTA_Meeting_20113131.pdf
summary = content.get('Summary')
if summary is not None:
new_entry['summary'] = summary
for (section, names) in content.iteritems():
if section != 'Summary':
for name in names:
attendees.append({'category': new_entry['category'],
'name': new_entry['name'],
'date': new_entry['date'],
'attendee_name': name,
'affiliation': section})
else:
new_entry['summary'] = None
print "Skipping participant extract for {type} on {date} because the first PDF page has no summary section ({link}).".format(**new_entry)
else:
new_entry['summary'] = None
for meeting in all_entries:
for (k, v) in meeting.iteritems():
if isinstance(v, unicode):
meeting[k] = v.replace(u'\xa0', ' ')
#Update database:
scraperwiki.sqlite.save(['category', 'name', 'date'], all_entries, table_name="MeetingTable1")
scraperwiki.sqlite.save(['category', 'name', 'date', 'attendee_name'], attendees, table_name="AttendeeTable1")
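# The pattern below appears to be based on John Gruber's "liberal, accurate"
# URL-matching regex. tokenize() references it at call time, so defining it
# down here still works as long as it is bound before main() runs.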
URLPattern = re.compile(ur'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')
if __name__ == "scraper":
main()