Merge pull request #23 from deborahgu/path-paremeters

Takes more parameters for more modular layout and complex items
deborahgu · Mar 16, 2018 · 928ebd1 · 928ebd1
2 parents 8b4b1d2 + a308d57
commit 928ebd1
Show file tree

Hide file tree

Showing 9 changed files with 151 additions and 54 deletions.
diff --git a/README.rst b/README.rst
@@ -59,15 +59,17 @@ From the shell:
 
 The available command line arguments are:
 
-..code:: bash
+.. code:: bash 
 
     usage: abbyy2epub [-h] [-d] [--epubcheck] [--ace] item_dir item_identifier item_bookpath
 
     Process an ABBYY file into an EPUB
     
     positional arguments:
-      docname      A directory containing all the necessary files. See the README
-                   for details.
+      item_dir         The file path where this item's files are kept.
+      item_identifier  The unique ID of this item.
+      item_bookpath    The prefix to a specific book within an item. In a simple
+                   book, usually the same as the item_identifier.
     
     optional arguments:
       -h, --help   show this help message and exit
@@ -133,17 +135,23 @@ subdirectory.
 Assumptions
 ===================
 
-This application assumes you are working in a directory which contains a
-subdirectory for the document and a specific set of files. If the document is
-named ``docname``, the directory structure assumed is:
+An item may contain 1 or more books. In order to accommodate this subtlety and
+delineate between books, an `item_dir` and `item_identifier` are not sufficient
+to isolate a specific book. To circumvent this limitation, we require another
+identifier called the `item_bookpath` which acts as a prefix to the files of a
+specific book. Given a datanode and an `item_dir` of an item, all the
+constituent files for a book can be constructed using `item_identifier` and
+`item_bookpath`, as described below.
 
-.. code:: bash 
+In order to access the files of an item, you need to know:
 
-    docname/
-        docname_abbyy.gz
-        docname_meta.xml
-        docname_jp2.zip
+#. The `item_identifier` (the unique ID of this item)
+#. The `item_dir` is the file path where this item's files are kept
+#. The `item_bookpath` is the name of the particular book file, often the same as `item_identifier`
 
+The structure is assumed to be:
+
+* ``scandata.xml`` describes the structure of the book (metadata, page numbers)
 * ``docname_abbyy.gz`` unzips to ``docname_abbyy``, an XML file generated by
   ABBYY.
 * ``docname_jp2.zip`` unzips to a directory called ``docname_jp2``, which
@@ -152,7 +160,10 @@ named ``docname``, the directory structure assumed is:
   * ``docname_0000.jp2`` is scanner calibration.
   * ``docname_0001.jp2`` is the cover image and the first image reference in the
     ABBYY.
-
+* There is a single global metadata manifest file for the entire
+  item named ``{item_identifier}_meta.xml``.
+* All of the other book specific files follow the form
+  ``{item_bookpath}_{file}``. e.g. ``{item_bookpath}_abbyy.gz``
 
 Further Reading
 ===============

diff --git a/abbyy_to_epub3/commandline.py b/abbyy_to_epub3/commandline.py
@@ -23,23 +23,33 @@
 
 logger = logging.getLogger(__name__)
 
-usage = (
-    "A directory containing all the necessary files.\n"
-    "See README at https://github.com/deborahgu/abbyy-to-epub3 for details."
-)
-
 
 def main():
     parser = argparse.ArgumentParser(
-        description='Process an ABBYY file into an EPUB'
+        description=(
+            'Process an ABBYY file into an EPUB.\n'
+            "See README at https://github.com/deborahgu/abbyy-to-epub3 "
+            "for details."
+        )
     )
     parser.add_argument(
         '-d',
         '--debug',
         action='store_true',
         help='Show debugging information',
     )
-    parser.add_argument('docname', help=usage)
+    parser.add_argument(
+        'item_dir', help="The file path where this item\'s files are kept.",
+    )
+    parser.add_argument(
+        'item_identifier', help="The unique ID of this item.",
+    )
+    parser.add_argument(
+        'item_bookpath', help=(
+            "The prefix to a specific book within an item."
+            "In a simple book, usually the same as the item_identifier."
+        ),
+    )
     parser.add_argument(
         '--epubcheck',
         default=False,
@@ -59,10 +69,15 @@ def main():
         if debug:
             logger.addHandler(logging.StreamHandler())
             logger.setLevel(logging.DEBUG)
-        docname = args.docname
         book = create_epub.Ebook(
-            docname,
+            args.item_dir,
+            args.item_identifier,
+            args.item_bookpath,
             debug=debug,
             args=args,
         )
         book.craft_epub()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/abbyy_to_epub3/create_epub.py b/abbyy_to_epub3/create_epub.py
@@ -47,14 +47,79 @@
 config.read(configfile)
 
 
-class Ebook(object):
+class ArchiveBookItem(object):
+    """Archive.org is a website which contains an archive of items
+    composed of archived digital content. Archive.org items are
+    distributed across a cluster of machines called datanodes. In
+    order to access the files of an item, you need to know 4 things:
+
+    a) The Archive.org `item_identifier` (the unique ID of this item)
+       e.g. https://archive.org/details/{item_identifier}
+    b) the datanode server address which hosts this item
+    c) the `item_dir` which is the file path on this datanode where
+       this item's files are kept
+    d) the name of the files within this `item_dir`
+
+    Certain archive.org items are specifically structured (file
+    organizations, contents, names) to store and play Books. Every
+    Archive Book Item contains the following files:
+    - a jp2.zip containing all the scanned images of the book
+    - an abbyy file containing the OCR'd plaintext of these scans
+    - scandata.xml whose metadata describes the structure of the book
+      (metadata, page numbers)
+    - meta.xml which describes the entire archive.org *item*
+
+    A complication is that Archive.org Book Items may contain 1 or
+    more books. In order to accommodate this subtlety and delineate
+    between books, an `item_dir` and `item_identifier` are not
+    sufficient to isolate a specific book. To circumvent this
+    limitation, we require another identifier called the
+    `item_bookpath` which acts as a prefix to the files of a specific
+    book. Given a datanode and an `item_dir` of an Archive Book Item,
+    all the constituent files for a book can be constructed using
+    `item_identifier` and `item_bookpath` in the following ways:
+
+    - There is a single global metadata manifest file for the entire
+      Archive Item named `{item_identifier}_meta.xml`.
+    - All of the other book specific files follow the form
+      `{item_bookpath}_{file}`. e.g. `{item_bookpath}_abbyy.gz`
+
     """
-    The Ebook object.
+    def __init__(self, item_dir, item_identifier, item_bookpath):
+        self.item_dir = item_dir
+        self.item_identifier = item_identifier
+        self.item_bookpath = item_bookpath
+
+        # Guarantee all input files exist
+        # These members will be set as self.`name`_`ext`, e.g. self.meta_xml
+        input_files = [
+            # prefix, name, ext
+            (item_identifier, 'meta', 'xml'),
+            (item_bookpath, 'abbyy', 'gz'),
+            (item_bookpath, 'scandata', 'xml'),
+            (item_bookpath, 'jp2', 'zip')]
+        for (subdir, name, ext) in input_files:
+            dependency = os.path.abspath(
+                os.path.join(item_dir, '%s_%s.%s' % (subdir, name, ext)))
+            if not os.path.exists(dependency):
+                self.logger.debug(
+                    "Invalid path to %s.%s: %s" % (name, ext, dependency)
+                )
+                raise OSError(
+                    "Invalid path to %s.%s: %s" % (name, ext, dependency)
+                )
+            setattr(self, '%s_%s' % (name, ext), dependency)
+
 
+class Ebook(ArchiveBookItem):
+    """
+    Ebook is a utility for generating epub3 files based on Archive.org items.
     Holds extracted information about a book & the ebooklib EPUB object.
     """
+    def __init__(
+        self, item_dir, item_identifier, item_bookpath, debug=False, args=False
+    ):
 
-    def __init__(self, base, debug=False, args=False):
         self.logger = logging.getLogger(__name__)
         if debug:
             self.logger.addHandler(logging.StreamHandler())
@@ -63,7 +128,6 @@ def __init__(self, base, debug=False, args=False):
         # Initialize all the book's variables cleanly
         self.debug = debug
         self.args = args
-        self.base = base       # the book's identifier, used in many filename
         self.metadata = {}     # the book's metadata
         self.blocks = []       # all <blocks> with contents, attributes
         self.paragraphs = {}   # paragraph style info
@@ -99,16 +163,22 @@ def __init__(self, base, debug=False, args=False):
         except (FileNotFoundError, subprocess.CalledProcessError) as e:
             self.image_processor = "pillow"
 
+        super(Ebook, self).__init__(item_dir, item_identifier, item_bookpath)
+
+        self.tmpdir = tempfile.TemporaryDirectory()
+        self.cover_img = '{}/cover.png'.format(self.tmpdir)
+        self.abbyy_file = "{tmp}/{item_identifier}_abbyy".format(
+            tmp=self.tmpdir, item_identifier=self.item_identifier)
+        self.logger.debug("Temp directory: {}\nidentifier: {}".format(
+            self.tmpdir, self.item_identifier))
+
     def load_scandata_pages(self):
         """
         Parse the page-by-page scandata file. This stores page size,
         right or left leaf, and page type (eg copyright, color card, etc).
         """
-        self.scandata = "{base}/{base}_scandata.xml".format(base=self.base)
-
-        # parse the scandata
         parser = ScandataParser(
-            self.scandata,
+            self.scandata_xml,
             self.pages,
             debug=self.debug,
         )
@@ -228,19 +298,17 @@ def extract_images(self):
         higher premium than disk space, so unzip the entire scan file into temp
         directory, instead of extracting only the needed images.
         """
-        images_zipped = "{base}/{base}_jp2.zip".format(base=self.base)
-        cover_file = "{tmp}/{base}_jp2/{base}_0001.jp2".format(
-            tmp=self.tmpdir, base=self.base
+        cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_0001.jp2".format(
+            tmp=self.tmpdir, item_bookpath=self.item_bookpath
         )
         try:
-            with ZipFile(images_zipped) as f:
+            with ZipFile(self.jp2_zip) as f:
                 f.extractall(self.tmpdir)
         except BadZipFile as e:
             self.logger.error(
-                "extraction problem with {}".format(images_zipped)
+                "extraction problem with {}".format(self.jp2_zip)
             )
             raise BadZipFile
-
         # convert the JP2K file into a usable format for the cover
         f, e = os.path.splitext(os.path.basename(cover_file))
         imageobj = ImageFactory(self.image_processor)
@@ -270,9 +338,9 @@ def make_image(self, block):
             return
 
         # pad out the filename to four digits
-        origfile = '{dir}/{base}_jp2/{base}_{page:0>4}.jp2'.format(
+        origfile = '{dir}/{item_bookpath}_jp2/{item_bookpath}_{page:0>4}.jp2'.format(
             dir=self.tmpdir,
-            base=self.base,
+            item_bookpath=self.item_bookpath,
             page=block['page_no']
         )
         basefile = 'img_{:0>4}.bmp'.format(self.picnum)
@@ -843,24 +911,20 @@ def craft_html(self):
                     )
                 )
 
-    def craft_epub(self):
+    def craft_epub(self, epub_outfile="out.epub"):
         """ Assemble the extracted metadata & text into an EPUB  """
 
-        # document files and directories
-        abbyy_file_zipped = "{base}/{base}_abbyy.gz".format(base=self.base)
-        metadata_file = "{base}/{base}_meta.xml".format(base=self.base)
-
         # Even if we clean up properly afterwards, using TemporaryDirectory
         # outside of a context manager seems to cause a resource leak
         with tempfile.TemporaryDirectory() as self.tmpdir:
             self.cover_img = '{}/cover.bmp'.format(self.tmpdir)
             self.abbyy_file = "{tmp}/{base}_abbyy".format(
-                tmp=self.tmpdir, base=self.base
+                tmp=self.tmpdir, base=self.item_identifier
             )
             self.logger.debug("Temp directory: {}\nidentifier: {}".format(
-                self.tmpdir, self.base))
+                self.tmpdir, self.item_identifier))
             # Unzip ABBYY file to disk. (Might be too huge to hold in memory.)
-            with gzip.open(abbyy_file_zipped, 'rb') as infile:
+            with gzip.open(self.abbyy_gz, 'rb') as infile:
                 with open(self.abbyy_file, 'wb') as outfile:
                     for line in infile:
                         outfile.write(line)
@@ -874,7 +938,7 @@ def craft_epub(self):
             # parse the ABBYY
             parser = AbbyyParser(
                 self.abbyy_file,
-                metadata_file,
+                self.meta_xml,
                 self.metadata,
                 self.paragraphs,
                 self.blocks,
@@ -966,14 +1030,17 @@ def craft_epub(self):
         )
         self.book.add_item(css_file)
 
-        epub_filename = '{base}/{base}.epub'.format(base=self.base)
-        epub.write_epub(epub_filename, self.book, {})
+        if epub_outfile.endswith('.epub'):
+            epub_outfile = epub_outfile
+        else:
+            epub_outfile = '%s.epub' % epub_outfile
+        epub.write_epub(epub_outfile, self.book, {})
 
         # run checks
         verifier = EpubVerify(self.debug)
         if self.args and self.args.epubcheck:
-            self.logger.info("Running EpubCheck on {}".format(epub_filename))
-            verifier.run_epubcheck(epub_filename)
+            self.logger.info("Running EpubCheck on {}".format(epub_outfile))
+            verifier.run_epubcheck(epub_outfile)
         if self.args and self.args.ace:
-            self.logger.info("Running DAISY Ace on {}".format(epub_filename))
-            verifier.run_ace(epub_filename)
+            self.logger.info("Running DAISY Ace on {}".format(epub_outfile))
+            verifier.run_ace(epub_outfile)
diff --git a/abbyy_to_epub3/image_processing.py b/abbyy_to_epub3/image_processing.py
@@ -113,7 +113,12 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
                 try:
                     i = Image.open(origfile)
                 except IOError as e:
-                    print("Can't open image {}: {}".format(origfile, e))
+                    self.logger.error(
+                        "Can't open image {}: {}".format(origfile, e)
+                    )
+                    raise Exception(
+                        "Can't open image {}: {}".format(origfile, e)
+                    )
                 try:
                     i.crop(dim).save(outfile)
                 except IOError as e:

diff --git a/abbyy_to_epub3/settings.py b/abbyy_to_epub3/settings.py
@@ -19,7 +19,6 @@
 import os
 
 BASE_DIR = os.getcwd()
-#BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 APP_DIR = BASE_DIR + '/{}'.format('abbyy_to_epub3')
 DOC_DIR = BASE_DIR + '/{}'.format('docs')
 TEST_DIR = APP_DIR + '/{}'.format('tests')
diff --git a/abbyy_to_epub3/tests/item_dir/item_bookpath_abbyy.gz b/abbyy_to_epub3/tests/item_dir/item_bookpath_abbyy.gz
diff --git a/abbyy_to_epub3/tests/item_dir/item_bookpath_jp2.zip b/abbyy_to_epub3/tests/item_dir/item_bookpath_jp2.zip
diff --git a/abbyy_to_epub3/tests/item_dir/item_bookpath_scandata.xml b/abbyy_to_epub3/tests/item_dir/item_bookpath_scandata.xml
diff --git a/abbyy_to_epub3/tests/item_dir/item_identifier_meta.xml b/abbyy_to_epub3/tests/item_dir/item_identifier_meta.xml