Skip to content

Commit

Permalink
Spot duplicate (name/term) pairs and warn.
Browse files (browse the repository at this point in the history)
  • Loading branch information
Duncan Parkes committed May 18, 2015
1 parent 5d8aaed commit 217249d
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,7 +14,7 @@
terms = [(x.find('span').text.strip(), urljoin(source_url, x.get('href')))
for x in root.cssselect('.menu-treemenu')[0].cssselect('a')]

data = []
data = {}

for term_name, term_url in terms:
while term_url:
Expand Down Expand Up @@ -47,14 +47,18 @@
# .jsn-table-column-email contains the email address, but only with
# javascript turned on.

details_resp = requests.get(details_url)
details_root = lxml.html.fromstring(details_resp.text)
# details_resp = requests.get(details_url)
# details_root = lxml.html.fromstring(details_resp.text)
# import pdb;pdb.set_trace()

data.append(member)
key = (member['name'], member['term'])
if key in data:
print "Duplicate (name, term) pair ignored: ({}, {})".format(*key)
else:
data[key] = member

next_links = term_root.cssselect('a[title=Next]')
term_url = urljoin(term_url, next_links[0].get('href')) if next_links else None

scraperwiki.sqlite.save(unique_keys=['name', 'term'], data=data)
scraperwiki.sqlite.save(unique_keys=['name', 'term'], data=data.values())

0 comments on commit 217249d

Please sign in to comment.