Commit 7801124 (parent 9d117ca)

drkane committed Jun 3, 2014
Showing 1 changed file with 50 additions and 0 deletions.
scraper.py

@@ -21,3 +21,53 @@
# on Morph for Python (https://github.com/openaustralia/morph-docker-python/blob/master/pip_requirements.txt) and all that matters
# is that your final data is written to an Sqlite database called data.sqlite in the current working directory which
# has at least a table called data.

###############################################################################
# CASC scraper
###############################################################################

import scraperwiki
from BeautifulSoup import BeautifulSoup
import time

# retrieve a page
starting_url = 'http://www.hmrc.gov.uk/casc/clubs.htm'
html = scraperwiki.scrape(starting_url)
#print html
soup = BeautifulSoup(html)

# use BeautifulSoup to find the centre column, then follow each single-letter
# index link (the A-Z pages listing clubs by initial)
paras = soup.findAll(id='centre_col')
for a in paras:
    anchors = a.findAll('a')
    for b in anchors:
        try:
            length = len(b.text)
        except ValueError:
            length = 0
        if length == 1:
            page_url = 'http://www.hmrc.gov.uk/casc/' + b['href']
            page_html = scraperwiki.scrape(page_url)
            #print page_url
            page_soup = BeautifulSoup(page_html)
            # each table row on the index page holds one club:
            # name, address and postcode in successive <td> cells
            trs = page_soup.findAll('tr')
            for tr in trs:
                tds = tr.findAll('td')
                if len(tds) == 0:
                    continue
                try:
                    name = tds[0].contents[0]
                except IndexError:
                    name = ""
                try:
                    address = tds[1].contents[0]
                except IndexError:
                    address = ""
                try:
                    postcode = tds[2].contents[0]
                except IndexError:
                    postcode = ""
                print name, address, postcode
                record = {"name": name, "address": address, "postcode": postcode}
                scraperwiki.sqlite.save(["name"], record)
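
A minimal sketch (not part of this commit) of how the saved records could be read back: scraperwiki.sqlite.save() writes to a table called "data" by default, so the resulting data.sqlite can be queried with scraperwiki.sqlite.select().

    import scraperwiki

    # assumes the scraper above has already populated the default "data" table
    rows = scraperwiki.sqlite.select("* from data limit 5")
    for row in rows:
        print row["name"], row["postcode"]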
