In [1]:

# bulkdownload.py
#
# Downloads all eBooks from a mirror of Project Gutenberg's website, for a specific language.
#
# Software by Michiel Overtoom, motoom@xs4all.nl, July 2009, March 2012. Adapted in 2016 for mirrors.

'''
Scraping eBooks from Gutenberg's web site isn't allowed anymore.
Instead, look in http://www.gutenberg.org/MIRRORS.ALL for a mirror near you.
You might want to choose an HTTP mirror, because FTP mirrors are slow with urllib.urlretrieve (but FTP mirrors are OK if you can use wget).
Choose a suitable mirror URL and put it in the MIRROR variable below.

The program then fetches {MIRROR}/GUTINDEX.ZIP, which is the compressed book index.
This zip contains a text file called GUTINDEX.ALL, in which every eBook is listed starting at the
beginning of a line, followed by lines of attributes:

    Zur Psychopathologie des Alltagslebens, by Sigmund Freud                 24429
      [Subtitle: Uber Vergessen, Versprechen, Vergreifen, Aberglaube und Irrtum]
      [Language: German]
    Hempfield, by David Grayson                                              33251
     [Subtitle: A Novel]
     [Illustrator: Thomas Fogarty]
    De slavernij in Suriname, by Julien Wolbers                              31060
     [Subtitle: of dezelfde gruwelen der slavernij, die in de 'Negerhut'
      geschetst zijn, bestaan ook in onze West-Indische Kolonien]
     [Language: Dutch]
    De schipbreuk van de "Berlin" 21 Februari 1907, by Jean Louis Pisuisse   33254
     [Subtitle: Volledig verhaal van de scheepsramp
      aan den Hoek van Holland]
     [Illustrator: Louis Raemaekers]
     [Language: Dutch]

The first line has a title and an eBook id number ("De slavernij in Suriname, by J.W.  31060").
Now, where to find the eBook text 31060?
For that, the program fetches {MIRROR}/ls-lR.gz, which contains the compressed directory & file index
in a textfile called 'ls-lR'. It contains chunks like:

    ./3/1/0/6/31060:
    total 156
    -rw-rw-r-- 1 gbnewby pg 77617 Jan 24  2010 31060-8.txt
    -rw-rw-r-- 1 gbnewby pg 29926 Jan 24  2010 31060-8.zip
    drwxrwxr-x 3 gbnewby pg  4096 Jan 24  2010 31060-h
    -rw-rw-r-- 1 gbnewby pg 35794 Jan 24  2010 31060-h.zip

We're interested in the file '31060-0.zip', '31060-8.zip' or '31060.zip'.
From the chunk above we learn it can be found in the directory /3/1/0/6/31060, thus:

    {MIRROR}/3/1/0/6/31060/31060-8.zip

This file is downloaded in the directory 'ebooks-zipped', and contains the eBook text '31060-8.txt',
which is eventually extracted into 'ebooks-unzipped'. Other programs take it from there.

'''


import urllib
import re
import os
import zipfile
import gzip
import datetime
import codecs
import glob
import shutil

# Base URL of the Project Gutenberg mirror. File and directory names are
# appended to it by plain string concatenation, so it must end with a slash.
MIRROR = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/"
# Language to download, spelled as in the "[Language: ...]" attribute lines of
# GUTINDEX.ALL. It is also used as the name of the extraction sub-directory.
LANGUAGE = "English"


def older(a, b):
    '''Return True if file 'a' is older than (or exactly as old as) file 'b'.

    Age is judged by modification time. Returns False when either file
    does not exist.
    '''
    if not os.path.exists(a) or not os.path.exists(b):
        return False
    # Compare modification times only. Comparing the complete os.stat()
    # results would order the files by mode, inode and device number first.
    return os.path.getmtime(a) <= os.path.getmtime(b)


def fetch(mirrorurl, filename, outputfilename):
    '''Fetch a file from a gutenberg mirror, if it hasn't been fetched earlier today.

    mirrorurl      -- base URL of the mirror; 'filename' is appended to it as-is.
    filename       -- name of the file on the mirror.
    outputfilename -- local path the downloaded file is written to.

    The local copy counts as up-to-date when its modification date is today.
    '''
    # urlretrieve lives in urllib on Python 2 and in urllib.request on Python 3.
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    mustdownload = True
    # The freshness check must look at the local copy (outputfilename);
    # 'filename' is only the name of the file on the mirror.
    if os.path.exists(outputfilename):
        st = os.stat(outputfilename)
        modified = datetime.date.fromtimestamp(st.st_mtime)
        if modified == datetime.date.today():
            print("%s exists, and is up-to-date. No need to download it." % filename)
            mustdownload = False
        else:
            print("%s exists, but is out of date. Downloading..." % filename)
    else:
        print("%s not found, downloading..." % filename)

    if mustdownload:
        urlretrieve(mirrorurl + filename, outputfilename)


# Ensure directories exist.
# os.makedirs also creates missing parent directories (for instance "../data"),
# which the single-level os.mkdir cannot do.
for dirname in ("../data/Gutenberg/indexes",
                "../data/Gutenberg/ebooks-zipped",
                "../data/Gutenberg/ebooks-unzipped/" + LANGUAGE):
    if not os.path.isdir(dirname):
        os.makedirs(dirname)

# Download the book index, and unzip it.
gutindex_zip = "../data/Gutenberg/indexes/GUTINDEX.zip"
gutindex_all = "../data/Gutenberg/indexes/GUTINDEX.ALL"
fetch(MIRROR, "GUTINDEX.zip", gutindex_zip)
# older() returns False for a missing file, hence the separate existence test.
if not os.path.exists(gutindex_all) or older(gutindex_all, gutindex_zip):
    print("Extracting GUTINDEX.ALL from GUTINDEX.zip...")
    with zipfile.ZipFile(gutindex_zip) as archive:
        archive.extractall("../data/Gutenberg/indexes/")


# Download the file index, and gunzip it.
lslr_gz = "../data/Gutenberg/indexes/ls-lR.gz"
lslr = "../data/Gutenberg/indexes/ls-lR"
fetch(MIRROR, "ls-lR.gz", lslr_gz)
if not os.path.exists(lslr) or older(lslr, lslr_gz):
    print("Extracting ls-lR from ls-lR.gz...")
    # Stream the decompressed data in chunks instead of holding the whole
    # listing in memory; both files are closed even when an error occurs.
    with gzip.open(lslr_gz, "rb") as inf, open(lslr, "wb") as outf:
        shutil.copyfileobj(inf, outf)


# Parse the file index
# Builds two maps keyed by eBook number: 'mirrordir' gives the directory on the
# mirror that holds the book's zip file, 'mirrorname' gives the zip's file name.
print("Parsing file index...")
mirrordir = {}
mirrorname = {}
re_txt0file = re.compile(r".*? (\d+\-0\.zip)") # UTF-8 encoded (?)
re_txt8file = re.compile(r".*? (\d+\-8\.zip)") # latin-8 encoded (?)
re_txtfile = re.compile(r".*? (\d+\.zip)") # ascii encoded 
lastseendir = None # Directory of the chunk being read; None while in a skipped chunk.
with open("../data/Gutenberg/indexes/ls-lR") as lsfile:
    for line in lsfile:
        if line.startswith("./"):
            # Chunk header, such as "./3/1/0/6/31060:".
            line = line[2:].strip()
            if line.endswith(":"):
                line = line[:-1]
            if line.endswith("old") or "-" in line:
                # Skipped chunk. Forget the previous directory, otherwise the
                # files listed in this chunk would be attributed to it.
                lastseendir = None
            else:
                lastseendir = line
            continue
        if lastseendir is None:
            continue
        # Try the three file name variants in the original order: -0, -8, plain.
        m = re_txt0file.match(line) or re_txt8file.match(line) or re_txtfile.match(line)
        if not m:
            continue
        filename = m.group(1)
        # The eBook number is the run of digits before the first "-" or ".",
        # as in '12104-0.zip' or '32901.zip'.
        ebookno = int(re.split(r"[-.]", filename, maxsplit=1)[0])
        if ebookno not in mirrordir:
            # The first zip seen for a book wins.
            mirrordir[ebookno] = lastseendir
            mirrorname[ebookno] = filename


# Parse the GUTINDEX.ALL file and extract all language-specific titles from it.
print("Parsing book index...")
inpreamble = True
ebooks = {} # number -> title
ebookslanguage = {} # number -> language
ebookno = None # Number of the latest valid title line; None when there is none.
langre = re.compile(r"\[Language: (\w+)\]")
with codecs.open("../data/Gutenberg/indexes/GUTINDEX.ALL", encoding="utf8") as indexfile:
    for line in indexfile:
        line = line.replace(u"\xA0", u" ") # Convert non-breaking spaces to ordinary spaces.

        if inpreamble: # Skip the explanation at the start of the file.
            if "TITLE and AUTHOR" in line and "ETEXT NO." in line:
                inpreamble = False
            else:
                continue

        if not line.strip():
            continue # Ignore empty lines.

        if line.startswith("<==End of GUTINDEX.ALL"):
            break # Done.

        if line.startswith((u" ", u"\t", u"[")):
            # Attribute line; see if it specifies the language. It only counts
            # when it follows a valid title line.
            m = langre.search(line)
            if m and ebookno is not None:
                ebookslanguage[ebookno] = m.group(1)
            continue

        # Possibly title line: "The German Classics     51389"
        parts = line.strip().rsplit(" ", 1)
        if len(parts) < 2:
            continue
        title, number = parts
        if number.endswith(("B", "C")):
            number = number[:-1]
        try:
            ebookno = int(number)
        except ValueError:
            # Missing or invalid ebook number. Forget the current book so that
            # the attribute lines which follow are not filed under a bogus key.
            ebookno = None
            continue
        # It's a genuine title.
        ebooks[ebookno] = title.strip()

# English is the default language: every eBook without an explicit language
# attribute is recorded as English.
for ebook_id in ebooks:
    ebookslanguage.setdefault(ebook_id, "English")

if 0:
    # Disabled: list every eBook found for LANGUAGE, with its zip file name
    # and mirror directory, followed by the total count.
    found = 0
    for ebookno in sorted(ebooks):
        if ebookslanguage[ebookno] == LANGUAGE:
            asciititle = ebooks[ebookno].encode("ascii", "replace")
            zipname = mirrorname.get(ebookno, "UNKNOWN")
            zipdir = mirrordir.get(ebookno, "UNKNOWN")
            print("%d. %s (%s in %s)" % (ebookno, asciititle, zipname, zipdir))
            found += 1
    print("%d ebooks found for language %s" % (found, LANGUAGE))

# Fetch the eBook zips.
total = len(ebooks)
for nr, ebookno in enumerate(sorted(ebooks.keys())):
    if ebookslanguage[ebookno] != LANGUAGE: # Only fetch books for specified language.
        continue
    filedir = mirrordir.get(ebookno)
    filename = mirrorname.get(ebookno)
    if not filedir or not filename:
        continue # No zip file known for this book.
    url = MIRROR + filedir + "/" + filename
    fn = os.path.join("../data/Gutenberg/ebooks-zipped", filename)
    if os.path.exists(fn):
        print("(%d/%d) %s exists, download not necessary" % (nr, total, fn))
        continue

    print("(%d/%d) downloading %s..." % (nr, total, fn))
    # Download under a temporary name and rename on success. An interrupted
    # download then never leaves a truncated file under the final name, where
    # the next run would mistake it for a complete one.
    partfn = fn + ".part"
    try:
        # Slow with FTP mirrors; prefer a HTTP mirror.
        urllib.urlretrieve(url, partfn)
    except IOError as err:
        # One failed book should not abort the whole bulk download.
        print("(%d/%d) download of %s failed: %s" % (nr, total, url, err))
        if os.path.exists(partfn):
            os.remove(partfn)
        continue
    os.rename(partfn, fn)

    # Fast, but requires external wget utility.
    # cmd = "wget -O %s %s" % (fn, url)
    # os.system(cmd)



GUTINDEX.zip not found, downloading...
ls-lR.gz not found, downloading...
Parsing file index...
Parsing book index...
(2053/54439) downloading ../data/Gutenberg/ebooks-zipped/2054-8.zip...
(2144/54439) downloading ../data/Gutenberg/ebooks-zipped/2146-8.zip...
(2172/54439) downloading ../data/Gutenberg/ebooks-zipped/2174-8.zip...
(2185/54439) downloading ../data/Gutenberg/ebooks-zipped/2187-8.zip...
(2186/54439) downloading ../data/Gutenberg/ebooks-zipped/2188-8.zip...
(2187/54439) downloading ../data/Gutenberg/ebooks-zipped/2189-8.zip...
(2188/54439) downloading ../data/Gutenberg/ebooks-zipped/2190-8.zip...
(2226/54439) downloading ../data/Gutenberg/ebooks-zipped/2228-8.zip...
(2227/54439) downloading ../data/Gutenberg/ebooks-zipped/2229-8.zip...
(2228/54439) downloading ../data/Gutenberg/ebooks-zipped/2230-8.zip...
(2310/54439) downloading ../data/Gutenberg/ebooks-zipped/2312-8.zip...
(2311/54439) downloading ../data/Gutenberg/ebooks-zipped/2313-8.zip...
(2312/54439) downloading ../da

KeyboardInterrupt: 

In [2]:
# Unzip them.
errors = []
unzipdir = "../data/Gutenberg/ebooks-unzipped/"+LANGUAGE
for fn in glob.glob("../data/Gutenberg/ebooks-zipped/*.zip"):
    print("extracting %s" % fn)
    try:
        # The with-statement closes the archive again, also when extraction fails.
        with zipfile.ZipFile(fn) as archive:
            archive.extractall(unzipdir)
    except zipfile.BadZipfile:
        errors.append("Error: can't unzip %s" % fn) # Some files in the Gutenberg archive are damaged.

# Some extracted files will end up in a subdirectory. Move them up into 'ebooks-unzipped' and remove the empty subdirectory.
for dirn in glob.glob(unzipdir+"/*"):
    if not os.path.isdir(dirn):
        continue
    print("moving %s" % dirn)
    for fn in glob.glob(os.path.join(dirn, "*")):
        ofn = os.path.join(unzipdir, os.path.basename(fn))
        if os.path.exists(ofn):
            os.unlink(ofn) # Replace a file of the same name extracted earlier.
        shutil.move(fn, ofn)
    os.rmdir(dirn)

if errors:
    print("Errors:")
    for error in errors:
        print(error)


extracting ../data/Gutenberg/ebooks-zipped/10906-8.zip
extracting ../data/Gutenberg/ebooks-zipped/4771-8.zip
extracting ../data/Gutenberg/ebooks-zipped/10346-8.zip
extracting ../data/Gutenberg/ebooks-zipped/8560-8.zip
extracting ../data/Gutenberg/ebooks-zipped/7770-8.zip
extracting ../data/Gutenberg/ebooks-zipped/2314-8.zip
extracting ../data/Gutenberg/ebooks-zipped/11037-8.zip
extracting ../data/Gutenberg/ebooks-zipped/5781-8.zip
extracting ../data/Gutenberg/ebooks-zipped/3000-0.zip
extracting ../data/Gutenberg/ebooks-zipped/4565-8.zip
extracting ../data/Gutenberg/ebooks-zipped/801-8.zip
extracting ../data/Gutenberg/ebooks-zipped/10061-8.zip
extracting ../data/Gutenberg/ebooks-zipped/10680-8.zip
extracting ../data/Gutenberg/ebooks-zipped/11300-8.zip
extracting ../data/Gutenberg/ebooks-zipped/2409-8.zip
extracting ../data/Gutenberg/ebooks-zipped/8946-8.zip
extracting ../data/Gutenberg/ebooks-zipped/8186.zip
extracting ../data/Gutenberg/ebooks-zipped/5711-8.zip
extracting ../data/Gutenb

In [5]:
# Create the "../data/test" directory when it is not there yet.
test_dir = "../data/test"
if not os.path.exists(test_dir):
    os.mkdir(test_dir)