Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 47 lines (35 sloc) 1.33 KB
#!/usr/bin/env python
"""Crawl the RDFa on data-gov-uk dataset pages into a BerkeleyDB triplestore.

The script walks a paged dataset listing, parses every dataset page it
discovers as RDFa into a persistent rdflib graph (the 'Sleepycat' store kept
in ./store), drops the stylesheet link triples, and finally dumps the whole
graph to data.rdf.
"""

import re
import urllib

import rdflib

# set the user agent so that data-gov-uk will know who we are
# NOTE(review): the contact URL between the angle brackets is empty -- it
# looks like it was lost; restore it before running against the live site.
rdflib.parser.headers = {"User-agent": "data-gov-uk-harvester <>"}

# Open (creating it on first use) the on-disk store that backs the graph.
graph = rdflib.ConjunctiveGraph('Sleepycat')
graph.open('store', create=True)

# the paged package listing that we will crawl to discover dataset urls
# NOTE(review): the template is empty. It has to be the listing URL with
# exactly one %-placeholder for the page number; as it stands
# `page_url_tmpl % page` raises TypeError on the first iteration.
page_url_tmpl = ""
page = 0

# extract rdf from each dataset html/rdfa page
while True:
    page += 1
    page_url = page_url_tmpl % page
    print("fetching list of datasets from %s" % page_url)
    html = urllib.urlopen(page_url).read()
    found = 0
    # NOTE(review): the pattern has lost the dataset URL prefix that should
    # precede `*?` inside the group; as written `re` rejects it ("nothing to
    # repeat"). Restore the prefix so only quoted dataset links are captured.
    for dataset_url in re.findall(r'"(*?)"', html):
        found += 1
        print("fetching dataset %s" % dataset_url)
        try:
            graph.parse(location=dataset_url, format='rdfa', lax=True)
        except Exception as e:
            # Best effort: one unparsable page must not abort the whole crawl.
            print(e)
    if found == 0:
        # A listing page without dataset links means we ran past the last page.
        break

# no sense in keeping tons of css stylesheet assertions is there?
# NOTE(review): the predicate IRI was empty, which matches nothing. It is
# restored here to the XHTML vocabulary term that RDFa assigns to
# rel="stylesheet" -- confirm this is the predicate the parser emits.
stylesheet = rdflib.URIRef('http://www.w3.org/1999/xhtml/vocab#stylesheet')
# Materialise the matches first so the store is not modified mid-iteration.
for t in list(graph.triples((None, stylesheet, None))):
    graph.remove(t)

# Close the output file deterministically instead of leaking the handle.
with open('data.rdf', 'w') as rdf_file:
    graph.serialize(rdf_file)

# Release the persistent store so its contents are flushed to disk.
graph.close()