Skip to content

Commit

Permalink
Merge pull request #23 from deborahgu/path-paremeters
Browse files Browse the repository at this point in the history
Takes more parameters for more modular layout and complex items
  • Loading branch information
deborahgu committed Mar 16, 2018
2 parents 8b4b1d2 + a308d57 commit 928ebd1
Show file tree
Hide file tree
Showing 9 changed files with 151 additions and 54 deletions.
35 changes: 23 additions & 12 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,17 @@ From the shell:
The available command line arguments are:

..code:: bash
.. code:: bash
usage: abbyy2epub [-h] [-d] [--epubcheck] [--ace] docname
Process an ABBYY file into an EPUB
positional arguments:
docname A directory containing all the necessary files. See the README
for details.
item_dir The file path where this item's files are kept.
item_identifier The unique ID of this item.
item_bookpath The prefix to a specific book within an item. In a simple
book, usually the same as the item_identifier.
optional arguments:
-h, --help show this help message and exit
Expand Down Expand Up @@ -133,17 +135,23 @@ subdirectory.
Assumptions
===================

This application assumes you are working in a directory which contains a
subdirectory for the document and a specific set of files. If the document is
named ``docname``, the directory structure assumed is:
An item may contain 1 or more books. In order to accommodate this subtlety and
delineate between books, an `item_dir` and `item_identifier` are not sufficient
to isolate a specific book. To circumvent this limitation, we require another
identifier called the `item_bookpath` which acts as a prefix to the files of a
specific book. Given a datanode and an `item_dir` of an item, all the
constituent files for a book can be constructed using `item_identifier` and
`item_bookpath` in the following ways:

.. code:: bash
In order to access the files of an item, you need to know:

docname/
docname_abbyy.gz
docname_meta.xml
docname_jp2.zip
#. The `item_identifier` (the unique ID of this item)
#. The `item_dir` is the file path where this item's files are kept
#. The `item_bookpath` is the name of the particular book file, often the same as `item_identifier`

The structure is assumed to be:

- ``scandata.xml`` describes the structure of the book (metadata, page numbers)
* ``docname_abbyy.gz`` unzips to ``docname_abbyy``, an XML file generated by
ABBYY.
* ``docname_jp2.zip`` unzips to a directory called ``docname_jp2``, which
Expand All @@ -152,7 +160,10 @@ named ``docname``, the directory structure assumed is:
* ``docname_0000.jp2`` is scanner calibration.
* ``docname_0001.jp2`` is the cover image and the first image reference in the
ABBYY.

* There is a single global metadata manifest file for the entire
item named ``{item_identifier}_meta.xml``.
* All of the other book specific files follow the form
``{item_bookpath}_{file}``. e.g. ``{item_bookpath}_abbyy.gz``

Further Reading
===============
Expand Down
33 changes: 24 additions & 9 deletions abbyy_to_epub3/commandline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,33 @@

logger = logging.getLogger(__name__)

usage = (
"A directory containing all the necessary files.\n"
"See README at https://github.com/deborahgu/abbyy-to-epub3 for details."
)


def main():
parser = argparse.ArgumentParser(
description='Process an ABBYY file into an EPUB'
description=(
'Process an ABBYY file into an EPUB.\n'
"See README at https://github.com/deborahgu/abbyy-to-epub3 "
"for details."
)
)
parser.add_argument(
'-d',
'--debug',
action='store_true',
help='Show debugging information',
)
parser.add_argument('docname', help=usage)
parser.add_argument(
'item_dir', help="The file path where this item\'s files are kept.",
)
parser.add_argument(
'item_identifier', help="The unique ID of this item.",
)
parser.add_argument(
'item_bookpath', help=(
"The prefix to a specific book within an item."
"In a simple book, usually the same as the item_identifier."
),
)
parser.add_argument(
'--epubcheck',
default=False,
Expand All @@ -59,10 +69,15 @@ def main():
if debug:
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
docname = args.docname
book = create_epub.Ebook(
docname,
args.item_dir,
args.item_identifier,
args.item_bookpath,
debug=debug,
args=args,
)
book.craft_epub()


if __name__ == "__main__":
main()
129 changes: 98 additions & 31 deletions abbyy_to_epub3/create_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,79 @@
config.read(configfile)


class Ebook(object):
class ArchiveBookItem(object):
"""Archive.org is a website which contains an archive of items
composed of archived digital content. Archive.org items are
distributed across a cluster of machines called datanodes. In
order to access the files of an item, you need to know 4 things:
a) The Archive.org `item_identifier` (the unique ID of this item)
e.g. https://archive.org/details/{item_identifier}
b) the datanode server address which hosts this item
c) the `item_dir` which is the file path on this datanode where
this item's files are kept
d) the name of the files within this `item_dir`
Certain archive.org items are specifically structured (file
organizations, contents, names) to store and play Books. Every
Archive Book Item contains the following files:
- a jp2.zip containing all the scanned images of the book
- an abbyy file containing the OCR'd plaintext of these scans
- scandata.xml whose metadata describes the structure of the book
(metadata, page numbers)
- meta.xml which describes the entire archive.org *item*
A complication is that Archive.org Book Items may contain 1 or
more books. In order to accommodate this subtlety and delineate
between books, an `item_dir` and `item_identifier` are not
sufficient to isolate a specific book. To circumvent this
limitation, we require another identifier called the
`item_bookpath` which acts as a prefix to the files of a specific
book. Given a datanode and an `item_dir` of an Archive Book Item,
all the constituent files for a book can be constructed using
`item_identifier` and `item_bookpath` in the following ways:
- There is a single global metadata manifest file for the entire
Archive Item named `{item_identifier}_meta.xml`.
- All of the other book specific files follow the form
`{item_bookpath}_{file}`. e.g. `{item_bookpath}_abbyy.gz`
"""
The Ebook object.
def __init__(self, item_dir, item_identifier, item_bookpath):
self.item_dir = item_dir
self.item_identifier = item_identifier
self.item_bookpath = item_bookpath

# Guarantee all input files exist
# These members will be set as self.`name`_`ext`, e.g. self.meta_xml
input_files = [
# prefix, name, ext
(item_identifier, 'meta', 'xml'),
(item_bookpath, 'abbyy', 'gz'),
(item_bookpath, 'scandata', 'xml'),
(item_bookpath, 'jp2', 'zip')]
for (subdir, name, ext) in input_files:
dependency = os.path.abspath(
os.path.join(item_dir, '%s_%s.%s' % (subdir, name, ext)))
if not os.path.exists(dependency):
self.logger.debug(
"Invalid path to %s.%s: %s" % (name, ext, dependency)
)
raise OSError(
"Invalid path to %s.%s: %s" % (name, ext, dependency)
)
setattr(self, '%s_%s' % (name, ext), dependency)


class Ebook(ArchiveBookItem):
"""
Ebook is a utility for generating epub3 files based on Archive.org items.
Holds extracted information about a book & the ebooklib EPUB object.
"""
def __init__(
self, item_dir, item_identifier, item_bookpath, debug=False, args=False
):

def __init__(self, base, debug=False, args=False):
self.logger = logging.getLogger(__name__)
if debug:
self.logger.addHandler(logging.StreamHandler())
Expand All @@ -63,7 +128,6 @@ def __init__(self, base, debug=False, args=False):
# Initialize all the book's variables cleanly
self.debug = debug
self.args = args
self.base = base # the book's identifier, used in many filename
self.metadata = {} # the book's metadata
self.blocks = [] # all <blocks> with contents, attributes
self.paragraphs = {} # paragraph style info
Expand Down Expand Up @@ -99,16 +163,22 @@ def __init__(self, base, debug=False, args=False):
except (FileNotFoundError, subprocess.CalledProcessError) as e:
self.image_processor = "pillow"

super(Ebook, self).__init__(item_dir, item_identifier, item_bookpath)

self.tmpdir = tempfile.TemporaryDirectory()
self.cover_img = '{}/cover.png'.format(self.tmpdir)
self.abbyy_file = "{tmp}/{item_identifier}_abbyy".format(
tmp=self.tmpdir, item_identifier=self.item_identifier)
self.logger.debug("Temp directory: {}\nidentifier: {}".format(
self.tmpdir, self.item_identifier))

def load_scandata_pages(self):
"""
Parse the page-by-page scandata file. This stores page size,
right or left leaf, and page type (eg copyright, color card, etc).
"""
self.scandata = "{base}/{base}_scandata.xml".format(base=self.base)

# parse the scandata
parser = ScandataParser(
self.scandata,
self.scandata_xml,
self.pages,
debug=self.debug,
)
Expand Down Expand Up @@ -228,19 +298,17 @@ def extract_images(self):
higher premium than disk space, so unzip the entire scan file into temp
directory, instead of extracting only the needed images.
"""
images_zipped = "{base}/{base}_jp2.zip".format(base=self.base)
cover_file = "{tmp}/{base}_jp2/{base}_0001.jp2".format(
tmp=self.tmpdir, base=self.base
cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_0001.jp2".format(
tmp=self.tmpdir, item_bookpath=self.item_bookpath
)
try:
with ZipFile(images_zipped) as f:
with ZipFile(self.jp2_zip) as f:
f.extractall(self.tmpdir)
except BadZipFile as e:
self.logger.error(
"extraction problem with {}".format(images_zipped)
"extraction problem with {}".format(self.jp2_zip)
)
raise BadZipFile

# convert the JP2K file into a usable format for the cover
f, e = os.path.splitext(os.path.basename(cover_file))
imageobj = ImageFactory(self.image_processor)
Expand Down Expand Up @@ -270,9 +338,9 @@ def make_image(self, block):
return

# pad out the filename to four digits
origfile = '{dir}/{base}_jp2/{base}_{page:0>4}.jp2'.format(
origfile = '{dir}/{item_bookpath}_jp2/{item_bookpath}_{page:0>4}.jp2'.format(
dir=self.tmpdir,
base=self.base,
item_bookpath=self.item_bookpath,
page=block['page_no']
)
basefile = 'img_{:0>4}.bmp'.format(self.picnum)
Expand Down Expand Up @@ -843,24 +911,20 @@ def craft_html(self):
)
)

def craft_epub(self):
def craft_epub(self, epub_outfile="out.epub"):
""" Assemble the extracted metadata & text into an EPUB """

# document files and directories
abbyy_file_zipped = "{base}/{base}_abbyy.gz".format(base=self.base)
metadata_file = "{base}/{base}_meta.xml".format(base=self.base)

# Even if we clean up properly afterwards, using TemporaryDirectory
# outside of a context manager seems to cause a resource leak
with tempfile.TemporaryDirectory() as self.tmpdir:
self.cover_img = '{}/cover.bmp'.format(self.tmpdir)
self.abbyy_file = "{tmp}/{base}_abbyy".format(
tmp=self.tmpdir, base=self.base
tmp=self.tmpdir, base=self.item_identifier
)
self.logger.debug("Temp directory: {}\nidentifier: {}".format(
self.tmpdir, self.base))
self.tmpdir, self.item_identifier))
# Unzip ABBYY file to disk. (Might be too huge to hold in memory.)
with gzip.open(abbyy_file_zipped, 'rb') as infile:
with gzip.open(self.abbyy_gz, 'rb') as infile:
with open(self.abbyy_file, 'wb') as outfile:
for line in infile:
outfile.write(line)
Expand All @@ -874,7 +938,7 @@ def craft_epub(self):
# parse the ABBYY
parser = AbbyyParser(
self.abbyy_file,
metadata_file,
self.meta_xml,
self.metadata,
self.paragraphs,
self.blocks,
Expand Down Expand Up @@ -966,14 +1030,17 @@ def craft_epub(self):
)
self.book.add_item(css_file)

epub_filename = '{base}/{base}.epub'.format(base=self.base)
epub.write_epub(epub_filename, self.book, {})
if epub_outfile.endswith('.epub'):
epub_outfile = epub_outfile
else:
epub_outfile = '%s.epub' % epub_outfile
epub.write_epub(epub_outfile, self.book, {})

# run checks
verifier = EpubVerify(self.debug)
if self.args and self.args.epubcheck:
self.logger.info("Running EpubCheck on {}".format(epub_filename))
verifier.run_epubcheck(epub_filename)
self.logger.info("Running EpubCheck on {}".format(epub_outfile))
verifier.run_epubcheck(epub_outfile)
if self.args and self.args.ace:
self.logger.info("Running DAISY Ace on {}".format(epub_filename))
verifier.run_ace(epub_filename)
self.logger.info("Running DAISY Ace on {}".format(epub_outfile))
verifier.run_ace(epub_outfile)
7 changes: 6 additions & 1 deletion abbyy_to_epub3/image_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,12 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
try:
i = Image.open(origfile)
except IOError as e:
print("Can't open image {}: {}".format(origfile, e))
self.logger.error(
"Can't open image {}: {}".format(origfile, e)
)
raise Exception(
"Can't open image {}: {}".format(origfile, e)
)
try:
i.crop(dim).save(outfile)
except IOError as e:
Expand Down
1 change: 0 additions & 1 deletion abbyy_to_epub3/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import os

BASE_DIR = os.getcwd()
#BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
APP_DIR = BASE_DIR + '/{}'.format('abbyy_to_epub3')
DOC_DIR = BASE_DIR + '/{}'.format('docs')
TEST_DIR = APP_DIR + '/{}'.format('tests')
Empty file.
Empty file.
Empty file.
Empty file.

0 comments on commit 928ebd1

Please sign in to comment.