Skip to content

Commit

Permalink
Split email finding to a function, and cope if it's missing.
Browse files Browse the repository at this point in the history
  • Loading branch information
Duncan Parkes committed Jun 8, 2015
1 parent a516d48 commit 5f39b3c
Showing 1 changed file with 31 additions and 12 deletions.
43 changes: 31 additions & 12 deletions scraper.py
Expand Up @@ -16,6 +16,22 @@
)


def unjs_email(script):
    """Recover the email address produced by a JavaScript mangling script.

    The script text is expected to wrap its statements in an HTML-style
    comment (``<!-- ... //-->``) and to declare a variable named
    ``addy<digits>``.  The statements that do not start with ``document``
    are wrapped in an immediately-invoked function that returns that
    variable, evaluated with ``execjs``, and the result is passed
    through ``unescape``.

    Raises AttributeError if either pattern is not found in ``script``
    (the failed ``re.search`` returns None).
    """
    # Everything between the comment markers is the JavaScript source.
    commented_body = re.search(r'<!--(.*)//-->', script, re.M | re.S).group(1)

    # Keep only the statements that aren't fiddling with the DOM.
    statements = []
    for raw_line in commented_body.strip().splitlines():
        line = raw_line.strip()
        if line.startswith('document'):
            continue
        statements.append(line)

    # The name of the variable holding the address varies from script to
    # script, so find it by regex and make it the function's return value.
    address_var = re.search(r'var (addy\d+)', script).group(1)
    statements.append('return {}'.format(address_var))

    wrapped = '(function() {{{}}})()'.format(' '.join(statements))

    # NOTE(review): unescape is presumably an HTML-entity decoder imported
    # at the top of the file -- confirm.
    return unescape(execjs.eval(wrapped))


# Module-level accumulators, both starting empty.
# NOTE(review): presumably these are the objects handed to handle_chamber()
# as its ``data`` and ``term_data`` arguments -- confirm against the calls,
# which are outside this excerpt.
data = {}
term_data = []

Expand Down Expand Up @@ -67,22 +83,25 @@ def handle_chamber(chamber_name, source_url, data, term_data):
# http://www.parliament.gov.na/index.php?option=com_contact&view=category&id=104&Itemid=1479&limitstart=40
member['party'] = ''

# .jsn-table-column-email contains the email address, but only with
# javascript turned on.

mailto_script = tr.cssselect('.jsn-table-column-email')[0].getchildren()[0].text_content()
try:
script = tr.cssselect('.jsn-table-column-email')[0].getchildren()[0].text_content()
except (AttributeError, IndexError):
# No email for this person.
script = None
else:
member['email'] = unjs_email(script)


# Get hold of the lines of javascript which aren't fiddling with the DOM
jslines = [x.strip() for x in re.search(r'<!--(.*)//-->', mailto_script, re.M | re.S).group(1).strip().splitlines() if not x.strip().startswith('document')]
# # Get hold of the lines of javascript which aren't fiddling with the DOM
# jslines = [x.strip() for x in re.search(r'<!--(.*)//-->', mailto_script, re.M | re.S).group(1).strip().splitlines() if not x.strip().startswith('document')]

# The name of the variable containing the email address
# varies, so find it by regex.
varname = re.search(r'var (addy\d+)', mailto_script).group(1)
jslines.append('return {}'.format(varname))
# # The name of the variable containing the variable containing the email address
# # varies, so find it by regex.
# varname = re.search(r'var (addy\d+)', mailto_script).group(1)
# jslines.append('return {}'.format(varname))

js = '(function() {{{}}})()'.format(' '.join(jslines))
member['email'] = unescape(execjs.eval(js))
# js = '(function() {{{}}})()'.format(' '.join(jslines))
# member['email'] = unescape(execjs.eval(js))

details_resp = requests.get(details_url)
details_root = lxml.html.fromstring(details_resp.text)
Expand Down

0 comments on commit 5f39b3c

Please sign in to comment.