Skip to content

Commit

Permalink
Merge pull request #29 from deborahgu/25-max-recursion-depth
Browse files Browse the repository at this point in the history
Max recursion depth and EOL hyphens.
  • Loading branch information
deborahgu committed Mar 27, 2018
2 parents b8e75d9 + 5500486 commit f24ecfa
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 10 deletions.
1 change: 1 addition & 0 deletions abbyy_to_epub3/create_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,7 @@ def craft_epub(self, epub_outfile="out.epub"):

# make the HTML chapters
self.craft_html()
self.logger.debug("Done assembling the HTML")

# Set the book's cover
self.book.set_cover(
Expand Down
27 changes: 17 additions & 10 deletions abbyy_to_epub3/parse_abbyy.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,20 @@ def add_last_text(blocks, page):
Given a list of blocks and the page number of the last page in the list,
mark up the last text block for that page in the list, if it exists.
"""
elem = blocks[-1]
if 'page_no' not in elem:
# On a page_no element, so at end of previous page
return
if elem['page_no'] == page:
if 'type' in elem and elem['type'] == 'text':
elem['last'] = True
elif len(blocks) > 1:
add_last_text(blocks[:-1], page)
while len(blocks) >= 1:
# Look for a page number in the last block of our list
elem = blocks[-1]
# If page_no isn't here, we're at end of previous page
if 'page_no' not in elem:
return
# If page_no is here and matches, set elem to 'last'
if elem['page_no'] == page:
if 'type' in elem and elem['type'] == 'text':
elem['last'] = True
return
# redo loop with the list truncated by final element
blocks = blocks[:-1]
continue


class AbbyyParser(object):
Expand Down Expand Up @@ -331,10 +336,12 @@ def parse_content(self):
continue

# This is a good text chunk. Instantiate the block.
# The modern ABBYY parser is consistent in its handling
# of EOL hyphens, making it safe to strip them.
d = {
'type': 'Text',
'page_no': page_no,
'text': sanitize_xml(text),
'text': sanitize_xml(text).replace(\n', ''),
'role': role,
'style': self.paragraphs[para_id]
}
Expand Down

0 comments on commit f24ecfa

Please sign in to comment.