Merge pull request #32 from deborahgu/31-addtoaccessformats

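Fixes #31. Uses scandata to ignore pages with addToAccessFormats=false. Also: zero-pads the cover leaf number to four digits; makes crop_image raise RuntimeError on failure, with callers logging the error so EPUB creation continues; treats an unset page type as 'Normal'; and skips Picture blocks on cover pages.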
deborahgu · Apr 3, 2018 · eac106c
2 parents 61d0b5f + 1bc0de1
commit eac106c
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 19 deletions.
diff --git a/abbyy_to_epub3/constants.py b/abbyy_to_epub3/constants.py
@@ -27,9 +27,13 @@
 }
 
 # Some page types should always be skipped
+# Pages whose scandata sets "addToAccessFormats" to false are given the
+# custom pagetype 'skippable', which avoids complicating the data
+# structure or adding extra logic on each block.
 skippable_pages = [
     'cover',
     'copyright',
     'color card',
+    'skippable',
     'title',
 ]
diff --git a/abbyy_to_epub3/create_epub.py b/abbyy_to_epub3/create_epub.py
@@ -323,7 +323,8 @@ def extract_images(self):
             self.logger.error(e)
             raise RuntimeError(e)
 
-        cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_000{num}.jp2".format(
+        # zero-pad the leaf number in the filename to four digits
+        cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_{num:0>4}.jp2".format(
             tmp=self.tmpdir, item_bookpath=self.item_bookpath, num=cover_leaf
         )
         try:
@@ -337,7 +338,11 @@ def extract_images(self):
         # convert the JP2K file into a usable format for the cover
         f, e = os.path.splitext(os.path.basename(cover_file))
         imageobj = ImageFactory(self.image_processor)
-        imageobj.crop_image(cover_file, self.cover_img)
+        try:
+            imageobj.crop_image(cover_file, self.cover_img)
+        except RuntimeError as e:
+            # for failed image creation, keep processing the epub
+            self.logger.error(e)
 
     def image_dim(self, block):
         """
@@ -394,7 +399,12 @@ def make_image(self, block):
 
         # make the image:
         imageobj = ImageFactory(self.image_processor)
-        imageobj.crop_image(origfile, outfile, dim=box, pagedim=pagedim)
+        try:
+            imageobj.crop_image(origfile, outfile, dim=box, pagedim=pagedim)
+        except RuntimeError as e:
+            # for failed image creation, keep processing the epub
+            self.logger.error(e)
+            return ''
         epubimage = epub.EpubImage()
         epubimage.file_name = in_epub_imagefile
         with open(outfile, 'rb') as f:
@@ -775,6 +785,9 @@ def craft_html(self):
             ):
                 prev_pagetype = pagetype
                 pagetype = self.pages[block['page_no']]
+            else:
+                # Treat it as Normal if it's not set
+                pagetype = 'Normal'
             if pagetype in skippable_pages:
                 continue
 
@@ -918,7 +931,10 @@ def craft_html(self):
                     chapter.content += ebooklib_utils.create_pagebreak(
                         str(block['text'])
                     )
-            elif block['type'] == 'Picture':
+            elif (
+                block['type'] == 'Picture' and
+                pagetype != 'Cover'
+            ):
                 # Image
                 content = self.make_image(block)
                 if content:

diff --git a/abbyy_to_epub3/image_processing.py b/abbyy_to_epub3/image_processing.py
@@ -73,15 +73,13 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
                             cmd, stdout=subprocess.DEVNULL, check=True
                         )
                     except subprocess.CalledProcessError as e:
-                        self.logger.warning(
+                        raise RuntimeError(
                             "Can't save cropped image: {}".format(e)
                         )
-                        return
                 else:
-                    self.logger.warning(
+                    raise RuntimeError(
                         "Can't crop in Kakadu without page dimensions"
                     )
-                    return
             else:
                 # without dimensions, save the entire uncropped image
                 cmd = [
@@ -94,10 +92,9 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
                         cmd, stdout=subprocess.DEVNULL, check=True
                     )
                 except subprocess.CalledProcessError as e:
-                    self.logger.warning(
-                        "Can't save uncropped image: {}".format(e)
+                    raise RuntimeError(
+                        "Can't open image {}: {}".format(origfile, e)
                     )
-                    return
 
     class PillowProcessor(ImageProcessor):
 
@@ -113,16 +110,12 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
                 try:
                     i = Image.open(origfile)
                 except IOError as e:
-                    self.logger.error(
-                        "Can't open image {}: {}".format(origfile, e)
-                    )
-                    raise Exception(
-                        "Can't open image {}: {}".format(origfile, e)
-                    )
+                    raise RuntimeError(
+                        "Can't open image {}: {}".format(origfile, e))
                 try:
                     i.crop(dim).save(outfile)
                 except IOError as e:
-                    self.logger.warning(
+                    raise RuntimeError(
                         "Can't crop image {} & save to {}: {}".format(
                             origfile, outfile, e
                         )
@@ -132,7 +125,7 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
                 try:
                     Image.open(origfile).save(outfile)
                 except IOError as e:
-                    self.logger.warning(
+                    raise RuntimeError(
                         "Cannot create cover file: {}".format(e)
                     )
 

diff --git a/abbyy_to_epub3/parse_scandata.py b/abbyy_to_epub3/parse_scandata.py
@@ -46,4 +46,10 @@ def parse_scandata(self):
             num = page.get('leafNum')
             # In case contributors use inconsistent case, lowercase pageType
             pagetype = page.find('pageType').text.lower()
+            # Instead of complicating the data structure and adding extra logic
+            # on each block, just use a custom pagetype for anything where
+            # "addToAccessFormats" is set to false.
+            access_formats = page.find('addToAccessFormats').text.lower()
+            if access_formats == 'false':
+                pagetype = "skippable"
             self.pages[int(num)] = pagetype