-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.py
104 lines (80 loc) · 3.43 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# coding=utf-8
# This is a morph.io scraper
import scraperwiki
import lxml.html
import re
import urlparse
def scrape_url(url):
    """Scrape one member search results page and process each member on it.

    The region name is the text after the last ':' in the page title
    element. Each member link's href is handed to scrape_person together
    with that region name.
    """
    page = lxml.html.fromstring(scraperwiki.scrape(url))
    heading = page.cssselect('#ctl00_PlaceHolderMain_pageTitle')[0]
    region_name = heading.text.split(':')[-1].strip()
    member_links = page.cssselect('.memberContainer h2 a')
    print('{} members found for {}'.format(len(member_links), region_name))
    for link in member_links:
        scrape_person(link.get('href'), region_name)
# Splits a display name into the main name (group 1) and an optional
# alternative name given in trailing parentheses (group 2).
# re.UNICODE makes \w and \s cover non-ASCII letters and whitespace; without
# it, Python 2 stops matching a unicode name at its first accented character.
name_re = re.compile(r'([\w\s-]*\w)\s*(?:\(([\w\s]*)\))?', re.UNICODE)
# Names seen so far in this run, used to warn about duplicates.
names = set()
def scrape_person(url, region_name):
    """Scrape one member's profile page and save it as a database row.

    Args:
        url: Address of the member's profile page. The text after the last
            '=' is stored as the member's id.
        region_name: Name of the region whose listing linked to this
            member; used as the region part of the area id.

    The saved row is keyed on 'name' and may contain: href, id, name,
    other_name, en_title, group, en_party_name, en_constituency_name,
    en_region_name, area, area_id, post, image, twitter and email.
    Fields whose label is absent from the page are simply not set.
    """
    am = {}
    am['href'] = url
    am['id'] = url.rsplit('=', 1)[-1]

    am_html = scraperwiki.scrape(url)
    am_root = lxml.html.fromstring(am_html)

    name = am_root.cssselect('h1')[0].text_content().strip()
    print('Processing {}'.format(name))
    # We don't need the 'AM' suffix - they all have that.
    if name.endswith(' AM'):
        name = name[:-3]
    name, other_name = name_re.match(name).groups()
    am['name'] = name
    am['other_name'] = other_name

    # Rows are saved with 'name' as the unique key, so a repeated name
    # would replace the earlier row; make that visible in the log.
    if name in names:
        print('WARNING: duplicate name {}'.format(name))
    names.add(name)

    # Sidebar entries are "<span class=mgLabel>Label:</span> value" pairs:
    # the label is the span's text and the value is the span's tail.
    # lxml gives None for missing text/tail, so fall back to ''.
    for span in am_root.cssselect('div.mgUserSideBar p span.mgLabel'):
        span_text = (span.text or '').strip()
        span_tail = (span.tail or '').strip()
        if span_text == 'Title:':
            am['en_title'] = span_tail
        elif span_text == 'Party:':
            am['group'] = am['en_party_name'] = span_tail
        elif span_text == 'Constituency:':
            am['en_constituency_name'] = span_tail
        elif span_text == 'Region:':
            am['en_region_name'] = span_tail

    constituency = am.get('en_constituency_name')
    am['area'] = constituency or am.get('en_region_name')

    area_id = 'ocd-division/country:gb-wls/region:%s' % region_name
    if constituency:
        area_id = area_id + '/constituency:%s' % constituency
    am['area_id'] = area_id.replace(' ', '_').lower()

    if 'en_title' in am:
        title = am['en_title']
        group = am.get('group')
        # Commissioners are qualified by their party. Read the party from
        # the record rather than a loop-local variable so that a page with
        # no Party label cannot raise a NameError; in that case the plain
        # title is used.
        if title == 'Commissioner' and group is not None:
            am['post'] = 'Commissioner-{}'.format(group)
        else:
            am['post'] = title

    am['image'] = urlparse.urljoin(
        url,
        am_root.cssselect('div.mgBigPhoto img')[0].attrib.get('src'),
    )

    # Contact details: the value is the href of the first link that
    # shares a parent with the label span.
    for span in am_root.cssselect('div.mgUserBody p span.mgLabel'):
        span_text = span.text or ''
        if 'Twitter' in span_text:
            am['twitter'] = span.getparent().find('a').get('href')
        elif 'Email' in span_text:
            am['email'] = span.getparent().find('a').get('href').replace('mailto:', '')

    scraperwiki.sqlite.save(unique_keys=['name'], data=am)
# Scrape the member search results for region=1 through region=5.
url_template = 'http://www.assembly.wales/en/memhome/Pages/membersearchresults.aspx?region={}'
for n in range(1, 6):
    url = url_template.format(n)
    scrape_url(url)
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
# You don't have to use the ScraperWiki and lxml libraries. You can use whatever libraries are installed
# on Morph for Python (https://github.com/openaustralia/morph-docker-python/blob/master/pip_requirements.txt). All that matters
# is that your final data is written to an SQLite database called data.sqlite in the current working directory, which
# has at least a table called data.