#!/usr/bin/env python
"""Downloads the Kasabi dataset snapshots."""
import os
import sys
import boto
import gzip
import rdflib
import logging
import requests
from boto.s3.key import Key
class Dataset(object):
    """A Kasabi dataset snapshot that can be downloaded and archived.

    Each dataset is described by a name, a license and a download URL (one
    row of ``datasets.csv``).  The snapshot is saved locally as a gzip file,
    optionally accompanied by its named graph serialized as N-Triples, and
    both files can be uploaded to the ``kasabi`` Internet Archive bucket.

    NOTE(review): the copy of this file that was reviewed had lost
    indentation, decorators and several statements.  Statements marked
    "reconstructed" below were inferred from the surrounding code and should
    be confirmed against the upstream file.  Members that the code reads
    without calling (``download_file``, ``ia_name``, ``downloaded``,
    ``archived``, ``named_graph`` ...) are properties; ``triples_size``,
    ``size`` and ``ia_size`` are presumed to be properties as well -- confirm.
    """

    # Internet Archive bucket shared by all instances; set by connect_ia().
    _ia_bucket = None

    # NOTE(review): both URL literals were empty in the reviewed copy (they
    # look like they were stripped in transit).  They are kept empty here
    # rather than guessed; set them to the real values before use.
    # Prefix prepended to ``ia_name`` to build the archived file's URL.
    IA_DOWNLOAD_BASE = ""
    # Prefix a graph URI must start with to be accepted by named_graph_url.
    NAMED_GRAPH_PREFIX = ""

    def __init__(self, name, license, url):
        """Store the dataset's name, license and snapshot URL."""
        self.name = name
        self.license = license
        self.url = url

    def __str__(self):
        return "%s <%s>" % (self.name, self.url)

    @classmethod
    def all(klass):
        """Yield one dataset per non-blank row of ``datasets.csv``.

        Rows are split on commas and passed positionally to the constructor,
        so each row must hold exactly name, license and url.
        """
        with open("datasets.csv") as rows:
            for line in rows:
                line = line.strip()
                if not line:
                    # A blank row would otherwise become Dataset('') and
                    # fail with a TypeError.
                    continue
                yield klass(*line.split(","))

    @property
    def triples_size(self):
        """Number of lines in the local snapshot (downloads it if needed)."""
        return sum(1 for _ in self.triples())

    @property
    def size(self):
        """Size in bytes of the remote snapshot, from its Content-Length.

        Raises KeyError when the server sends no ``content-length`` header.
        """
        r = requests.head(self.url)
        return int(r.headers['content-length'])

    @property
    def ia_size(self):
        """Size in bytes of the archived snapshot, from its Content-Length.

        Raises KeyError when the server sends no ``content-length`` header.
        """
        url = self.IA_DOWNLOAD_BASE + self.ia_name
        r = requests.head(url, allow_redirects=True)
        return int(r.headers['content-length'])

    @property
    def download_file(self):
        """Local path of the gzipped snapshot."""
        return self.name + ".gz"

    @property
    def named_graph_file(self):
        """Local path of the N-Triples named graph file."""
        return self.name + ".nt"

    @property
    def ia_name(self):
        """Key name of the snapshot in the Internet Archive bucket."""
        return self.download_file.replace("dataset/", "")

    @property
    def ia_named_graph_name(self):
        """Key name of the named graph file in the Internet Archive bucket."""
        return self.download_file.replace("dataset/", "").replace(".gz", ".nt")

    @property
    def downloaded(self):
        """True when the snapshot file exists locally."""
        # The original read the module-level loop variable ``ds`` here,
        # which only worked when run as a script.
        return os.path.isfile(self.download_file)

    @property
    def archived(self):
        """True when the snapshot already has a key in the bucket."""
        return self.get_ia_key() is not None

    @property
    def named_graph_url(self):
        """Guess the named graph URI for the dataset.

        Takes the second-to-last space-separated token of each snapshot
        line (the graph position of a quad), strips the angle brackets and
        returns the first one that starts with ``NAMED_GRAPH_PREFIX``.
        Returns None when no line matches.
        """
        for quad in self.triples():
            if isinstance(quad, bytes):
                # gzip yields bytes on Python 3; splitting on a str needs text.
                quad = quad.decode("utf-8")
            parts = quad.split(" ")
            if len(parts) < 2:
                # Blank or malformed line: there is no graph position to read.
                continue
            u = parts[-2].strip("<").strip(">")
            if u.startswith(self.NAMED_GRAPH_PREFIX):
                return u
        return None

    @property
    def named_graph(self):
        """Fetch and parse the named graph; None if unavailable.

        Returns None when no graph URI could be guessed or when fetching or
        parsing fails (the failure is logged).
        """
        g = rdflib.Graph()
        u = self.named_graph_url
        # apparently not all datasets are quads :-/
        if not u:
            return None
        try:
            return g.parse(u)
        except Exception as e:
            logging.error("unable to get named graph url %s: %s", u, e)
            return None

    def get_ia_key(self):
        """Return the bucket key for the snapshot, or None if not archived."""
        # Connect on demand: nothing visible in the reviewed copy calls
        # connect_ia(), and the bucket is None until it has run.
        self.connect_ia()
        return Dataset._ia_bucket.get_key(self.ia_name)

    def get_ia_named_graph_key(self):
        """Return the bucket key for the named graph file, or None."""
        self.connect_ia()
        # The original spelled this ``_oa_bucket.get_eky`` and always raised.
        return Dataset._ia_bucket.get_key(self.ia_named_graph_name)

    def triples(self):
        """Yield the lines of the gzipped snapshot, downloading it first
        when it is not present locally."""
        if not os.path.isfile(self.download_file):
            # reconstructed: the body of this ``if`` was missing.
            self.download()
        # reconstructed: the iterable after ``for line in`` was missing;
        # gzip is imported by the file and is not used anywhere else.
        with gzip.open(self.download_file) as snapshot:
            for line in snapshot:
                yield line

    def download(self):
        """Fetch the snapshot from ``self.url`` into ``download_file``."""
        logging.info("downloading: %s", self.download_file)
        r = requests.get(self.url)
        with open(self.download_file, "wb") as f:
            for buff in r.iter_content(chunk_size=65536):
                # reconstructed: the loop body was missing.
                f.write(buff)

    def download_named_graph(self):
        """Save the named graph as N-Triples in ``named_graph_file``.

        Returns None, and writes nothing, when no named graph is available.
        """
        g = self.named_graph
        if not g:
            logging.info("unable to download named graph for %s", self)
            return None
        logging.info("saving named graph for %s as %s", self,
                     self.named_graph_file)
        # Binary mode: rdflib serializers have historically written encoded
        # bytes to the destination stream.
        with open(self.named_graph_file, "wb") as out:
            g.serialize(out, format="nt")

    def archive(self):
        """Upload both the snapshot and its named graph."""
        # reconstructed: the body was missing; these are the two archive_*
        # helpers defined on this class.
        self.archive_dataset()
        self.archive_named_graph()

    def archive_dataset(self):
        """Upload the snapshot unless the bucket already holds it."""
        if self.get_ia_key() is not None:
            return
        k = Key(Dataset._ia_bucket)
        k.name = self.ia_name
        # reconstructed: the upload call was missing -- confirm.
        k.set_contents_from_filename(self.download_file)

    def archive_named_graph(self):
        """Upload the named graph file, saving it locally first if needed."""
        if not os.path.isfile(self.named_graph_file):
            self.download_named_graph()
        if not os.path.isfile(self.named_graph_file):
            logging.warning("no named graph for %s", self)
            return
        logging.info("archiving named graph %s as %s", self,
                     self.ia_named_graph_name)
        self.connect_ia()
        k = Key(Dataset._ia_bucket)
        k.name = self.ia_named_graph_name
        # reconstructed: the upload call was missing -- confirm.
        k.set_contents_from_filename(self.named_graph_file)

    def connect_ia(self):
        """Open the shared ``kasabi`` bucket once, creating it if absent."""
        if Dataset._ia_bucket is not None:
            return
        ia = boto.connect_ia()
        # reconstructed: the try/except around these two assignments was
        # missing; without it the bucket would be created unconditionally.
        try:
            Dataset._ia_bucket = ia.get_bucket("kasabi")
        except boto.exception.S3ResponseError:
            # The original assigned to ``_ia__bucket`` (typo), so a freshly
            # created bucket was never stored.
            Dataset._ia_bucket = ia.create_bucket("kasabi")
# Script entry point: download every dataset listed in datasets.csv that is
# not yet on disk, then archive every dataset that is not yet in the bucket.
if __name__ == "__main__":
    # Only errors reach stderr at this level; the info messages below are
    # emitted for anyone who lowers it.
    logging.basicConfig(stream=sys.stderr, level=logging.ERROR)
    for ds in Dataset.all():
        if not ds.downloaded:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("downloading %s" % ds)
            # NOTE(review): reconstructed -- the call that performs the
            # download was missing from the reviewed copy.
            ds.download()
        if not ds.archived:
            print("archiving %s" % ds)
            logging.info("archiving: %s", ds)
            # NOTE(review): reconstructed -- the call that performs the
            # upload was missing from the reviewed copy.
            ds.archive()