Skip to content

Commit

Permalink
Merge pull request #32 from deborahgu/31-addtoaccessformats
Browse files Browse the repository at this point in the history
FIXES #31. Uses Scandata to ignore pages w/ addToAccessFormats=False. Also:
  • Loading branch information
deborahgu committed Apr 3, 2018
2 parents 61d0b5f + 1bc0de1 commit eac106c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 19 deletions.
4 changes: 4 additions & 0 deletions abbyy_to_epub3/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,13 @@
}

# Page types that are always skipped.
# Rather than complicating the page data structure and adding extra logic
# on each block, any page whose "addToAccessFormats" is set to false is
# given the custom page type 'skippable', which is listed here alongside
# the ordinary page types that are skipped.
skippable_pages = [
    'cover',
    'copyright',
    'color card',
    'skippable',
    'title',
]
24 changes: 20 additions & 4 deletions abbyy_to_epub3/create_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,8 @@ def extract_images(self):
self.logger.error(e)
raise RuntimeError(e)

cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_000{num}.jp2".format(
# zero-pad the leaf number to four digits when building the filename
cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_{num:0>4}.jp2".format(
tmp=self.tmpdir, item_bookpath=self.item_bookpath, num=cover_leaf
)
try:
Expand All @@ -337,7 +338,11 @@ def extract_images(self):
# convert the JP2K file into a usable format for the cover
f, e = os.path.splitext(os.path.basename(cover_file))
imageobj = ImageFactory(self.image_processor)
imageobj.crop_image(cover_file, self.cover_img)
try:
imageobj.crop_image(cover_file, self.cover_img)
except RuntimeError as e:
# for failed image creation, keep processing the epub
self.logger.error(e)

def image_dim(self, block):
"""
Expand Down Expand Up @@ -394,7 +399,12 @@ def make_image(self, block):

# make the image:
imageobj = ImageFactory(self.image_processor)
imageobj.crop_image(origfile, outfile, dim=box, pagedim=pagedim)
try:
imageobj.crop_image(origfile, outfile, dim=box, pagedim=pagedim)
except RuntimeError as e:
# for failed image creation, keep processing the epub
self.logger.error(e)
return ''
epubimage = epub.EpubImage()
epubimage.file_name = in_epub_imagefile
with open(outfile, 'rb') as f:
Expand Down Expand Up @@ -775,6 +785,9 @@ def craft_html(self):
):
prev_pagetype = pagetype
pagetype = self.pages[block['page_no']]
else:
# Treat it as Normal if it's not set
pagetype = 'Normal'
if pagetype in skippable_pages:
continue

Expand Down Expand Up @@ -918,7 +931,10 @@ def craft_html(self):
chapter.content += ebooklib_utils.create_pagebreak(
str(block['text'])
)
elif block['type'] == 'Picture':
elif (
block['type'] == 'Picture' and
pagetype != 'Cover'
):
# Image
content = self.make_image(block)
if content:
Expand Down
23 changes: 8 additions & 15 deletions abbyy_to_epub3/image_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,13 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
cmd, stdout=subprocess.DEVNULL, check=True
)
except subprocess.CalledProcessError as e:
self.logger.warning(
raise RuntimeError(
"Can't save cropped image: {}".format(e)
)
return
else:
self.logger.warning(
raise RuntimeError(
"Can't crop in Kakadu without page dimensions"
)
return
else:
# without dimensions, save the entire uncropped image
cmd = [
Expand All @@ -94,10 +92,9 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
cmd, stdout=subprocess.DEVNULL, check=True
)
except subprocess.CalledProcessError as e:
self.logger.warning(
"Can't save uncropped image: {}".format(e)
raise RuntimeError(
"Can't open image {}: {}".format(origfile, e)
)
return

class PillowProcessor(ImageProcessor):

Expand All @@ -113,16 +110,12 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
try:
i = Image.open(origfile)
except IOError as e:
self.logger.error(
"Can't open image {}: {}".format(origfile, e)
)
raise Exception(
"Can't open image {}: {}".format(origfile, e)
)
raise RuntimeError(
"Can't open image {}: {}".format(origfile, e))
try:
i.crop(dim).save(outfile)
except IOError as e:
self.logger.warning(
raise RuntimeError(
"Can't crop image {} & save to {}: {}".format(
origfile, outfile, e
)
Expand All @@ -132,7 +125,7 @@ def crop_image(self, origfile, outfile, dim=False, pagedim=False):
try:
Image.open(origfile).save(outfile)
except IOError as e:
self.logger.warning(
raise RuntimeError(
"Cannot create cover file: {}".format(e)
)

Expand Down
6 changes: 6 additions & 0 deletions abbyy_to_epub3/parse_scandata.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,10 @@ def parse_scandata(self):
num = page.get('leafNum')
# In case contributors use inconsistent case, lowercase pageType
pagetype = page.find('pageType').text.lower()
# Instead of complicating the data structure and adding extra logic
# on each block, just use a custom pagetype for anything where
# "addToAccessFormats" is set to false.
access_formats = page.find('addToAccessFormats').text.lower()
if access_formats == 'false':
pagetype = "skippable"
self.pages[int(num)] = pagetype

0 comments on commit eac106c

Please sign in to comment.