Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Uri Laserson January 02, 2013
file 32 lines (28 sloc) 0.967 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
import sys
import random
import subprocess

GB = 1024 ** 3  # bytes per gibibyte

def du():
    """Return the total size in bytes of /ngrams on HDFS.

    Runs `hadoop fs -du -s /ngrams` and parses the first
    whitespace-separated field of its output (the aggregate byte count).
    """
    # Pass argv as a list (no shell needed for a fixed command) and use
    # communicate() so the child is actually waited on -- the original
    # read stdout but never reaped the process.
    p = subprocess.Popen(['hadoop', 'fs', '-du', '-s', '/ngrams'],
                         stdout=subprocess.PIPE)
    out, _ = p.communicate()
    return int(out.split()[0])

# generate list of URLs for the Google Books ngram dataset:
# %igram is the ngram order n, and %i is the shard index 0..num_files-1
base_url = 'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-%igram-20090715-%i.csv.zip'
# (n, number of shard files published for that n)
sizes = [(2, 100), (3, 200), (4, 400), (5, 800)]
# NOTE: the original used xrange, which is Python 2-only; range iterates
# identically here and keeps the script runnable on Python 3 as well.
ngram_urls = [base_url % (n, i)
              for n, num_files in sizes
              for i in range(num_files)]

# download data directly into HDFS: stream each zip through funzip so the
# uncompressed csv is piped straight into `hadoop fs -put` (never hits local disk)
stream_cmd = 'curl "%s" | funzip | hadoop fs -put - /ngrams/%s'
random.shuffle(ngram_urls)
# Keep pulling shuffled URLs until ~20 GB is on HDFS -- or the list runs
# out. The original `while not finished` loop raised IndexError from
# .pop() if the URLs were exhausted before the size threshold was hit.
while ngram_urls:
    url = ngram_urls.pop()
    # drop the trailing ".zip", e.g. "googlebooks-...-0.csv"
    filename = '.'.join(url.split('/')[-1].split('.')[:-1])
    sys.stdout.write("%s\n" % filename)
    sys.stdout.flush()
    # shell=True is required here: the command is a curl|funzip|hadoop pipeline
    rc = subprocess.Popen(stream_cmd % (url, filename), shell=True).wait()
    if rc != 0:
        # best-effort: report the failed shard but keep downloading others
        sys.stderr.write("pipeline exited %d for %s\n" % (rc, url))
    if du() > 20 * GB:
        break
Something went wrong with that request. Please try again.