Skip to content

Commit

Permalink
Merge pull request #30 from deborahgu/27-first-page-info-from-scandata
Browse files Browse the repository at this point in the history
Fixes #27 -- Use scandata for cover image
  • Loading branch information
deborahgu committed Mar 30, 2018
2 parents f24ecfa + fdc1041 commit 61d0b5f
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 32 deletions.
5 changes: 1 addition & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,7 @@ The structure is assumed to be:
ABBYY.
* ``docname_jp2.zip`` unzips to a directory called ``docname_jp2``, which
includes a number of documents in the format ``docname_####.jp2``.

* ``docname_0000.jp2`` is scanner calibration.
* ``docname_0001.jp2`` is the cover image and the first image reference in the
ABBYY.
* The scandata should mark one leaf as 'Cover'; that leaf is used as the cover
image. If no leaf is marked 'Cover', the first leaf marked 'Title' is used,
and failing that, the first leaf marked 'Normal'.
* There is a single global metadata manifest file for the entire
item named ``{item_identifier}_meta.xml``.
* All of the other book specific files follow the form
Expand Down
8 changes: 4 additions & 4 deletions abbyy_to_epub3/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@

# Some page types should always be skipped
skippable_pages = [
'Cover',
'Copyright',
'Color Card',
'Title',
'cover',
'copyright',
'color card',
'title',
]
66 changes: 58 additions & 8 deletions abbyy_to_epub3/create_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def __init__(
self.progression = '' # page direction
self.firsts = {} # all first lines per-page
self.lasts = {} # all last lines per-page
self.pages = dict() # page-by-page information from scandata
self.pages = OrderedDict() # page-by-page information from scandata

# are there headers, footers, or page numbers?
self.headers_present = False
Expand Down Expand Up @@ -298,8 +298,33 @@ def extract_images(self):
higher premium than disk space, so unzip the entire scan file into temp
directory, instead of extracting only the needed images.
"""
cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_0001.jp2".format(
tmp=self.tmpdir, item_bookpath=self.item_bookpath

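# Try to find a cover image. If nothing is tagged as 'Cover', use
# the first page tagged 'Title'. If nothing is tagged as 'Title'
# either, use the first page tagged 'Normal'. self.pages is an
# OrderedDict, so break as soon as something useful is found rather
# than searching the whole set of pages.
# NOTE(review): the loop below stops at the first leaf whose type is
# any of the three, so a 'title' or 'normal' leaf that precedes the
# 'cover' leaf would be chosen instead -- confirm this is intended.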

pages_iter = iter(self.pages)
for p in pages_iter:
if self.pages[p] == 'cover':
cover_leaf = p
break
elif self.pages[p] == 'title':
cover_leaf = p
break
elif self.pages[p] == 'normal':
cover_leaf = p
break
try:
cover_leaf
except NameError:
e = "No pages in scandata marked as Cover, Title, or Normal"
self.logger.error(e)
raise RuntimeError(e)

cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_000{num}.jp2".format(
tmp=self.tmpdir, item_bookpath=self.item_bookpath, num=cover_leaf
)
try:
with ZipFile(self.jp2_zip) as f:
Expand Down Expand Up @@ -671,6 +696,8 @@ def craft_html(self):
self.picnum = 1
blocks_index = -1
self.last_row = False
pagetype = ''
prev_pagetype = ''

# Look for headers and page numbers
# FR10 has markup but isn't reliable so look there as well
Expand Down Expand Up @@ -738,14 +765,20 @@ def craft_html(self):
for block in self.blocks:
blocks_index += 1

# Skip pages that we don't want to include
if 'type' not in block:
continue
# Get the pageType from scandata
if (
'page_no' in block and
block['page_no'] in self.pages and
self.pages[block['page_no']] in skippable_pages
block['page_no'] in self.pages
):
prev_pagetype = pagetype
pagetype = self.pages[block['page_no']]
if pagetype in skippable_pages:
continue

# set the block style, if there is one
if (
'style' in block and
'fontstyle' in block['style']
Expand All @@ -770,6 +803,23 @@ def craft_html(self):
)
else:
fstyling = ''

# Make chapters for certain page types, for accessible navigation
pagetypes = {
'contents': 'Table of Contents',
'contributions': 'Contributions',
'copyright': 'Copyright Page',
'glossary': 'Glossary',
'index': 'Index',
'introduction': 'Introduction',
'preface': 'Preface',
'reference': 'Reference',
'title': 'Title Page',
}
if (pagetype in pagetypes and pagetype != prev_pagetype):
chapter_no += 1
chapter = self.make_chapter(pagetypes[pagetype], chapter_no)

if block['type'] == 'Text':
text = block['text']
role = block['role']
Expand Down Expand Up @@ -928,12 +978,12 @@ def craft_epub(self, epub_outfile="out.epub"):
for line in infile:
outfile.write(line)

# Extract the page images and create the cover file
self.extract_images()

# read in the page-by-page scandata file
self.load_scandata_pages()

# Extract the page images and create the cover file
self.extract_images()

# parse the ABBYY
parser = AbbyyParser(
self.abbyy_file,
Expand Down
5 changes: 4 additions & 1 deletion abbyy_to_epub3/parse_abbyy.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,6 @@ def parse_content(self):
d = {'page_no': page_no}

self.pages[0].clear() # clear the memory first
self.pages.pop(0) # ignore the calibration page
for page in self.pages:
pagewidth = page.get('width')
pageheight = page.get('height')
Expand Down Expand Up @@ -483,3 +482,7 @@ def parse_metadata(self):
self.metadata[term.tag].append(term.text)
else:
self.metadata[term.tag] = [term.text, ]

# if the language isn't explicitly set, assume English
if 'language' not in self.metadata:
self.metadata['language'] = 'eng'
3 changes: 2 additions & 1 deletion abbyy_to_epub3/parse_scandata.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,6 @@ def parse_scandata(self):
pagelist = self.tree.findall("./pageData/page")
for page in pagelist:
num = page.get('leafNum')
pagetype = page.find('pageType').text
# In case contributors use inconsistent case, lowercase pageType
pagetype = page.find('pageType').text.lower()
self.pages[int(num)] = pagetype
32 changes: 20 additions & 12 deletions abbyy_to_epub3/tests/test_create_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,18 @@ def paragraphs(self):

@pytest.fixture
def book(self):
book = Ebook(base='testing')
book = Ebook(
item_identifier="item_identifier",
item_dir="{}/item_dir".format(TEST_DIR),
item_bookpath="item_bookpath"
)
return book

def test_create_Ebook(self, book):
""" Instantiate an Ebook object. """
assert book.base == 'testing'
assert book.item_identifier == 'item_identifier'
assert book.item_dir == '{}/item_dir'.format(TEST_DIR)
assert book.item_bookpath == 'item_bookpath'

def test_create_accessibility_metadata(self, book):
""" Set the accessibility metadata of a default book. """
Expand All @@ -68,9 +74,9 @@ def test_create_accessibility_metadata(self, book):
assert accessibility_metadata[0][1] == OrderedDict(
[('property', 'schema:accessibilitySummary')])

def test_set_metadata(self, metadata, book, monkeypatch):
def test_set_metadata(self, metadata, book):
""" Verifies metadata from the dict parsed from the metadata file """
monkeypatch.setattr(Ebook, 'metadata', metadata)
book.metadata = metadata
book.set_metadata()

assert book.book.title == 'Fire'
Expand All @@ -83,9 +89,9 @@ def test_craft_html_chapters(
self, blocks, metadata, pages, book, monkeypatch
):
""" tests chapters created from the parsed blocks """
monkeypatch.setattr(Ebook, 'metadata', metadata)
monkeypatch.setattr(Ebook, 'blocks', blocks)
monkeypatch.setattr(Ebook, 'pages', pages)
book.metadata = metadata
book.blocks = blocks
book.pages = pages
monkeypatch.setattr(Ebook, 'make_image', lambda Ebook, str: '<img />')
book.craft_html()

Expand All @@ -97,15 +103,17 @@ def test_craft_html_chapters(
) in book.chapters[1].content
assert book.chapters[1].file_name == 'chap_0002.xhtml'

def test_make_chapter(self, metadata, book, monkeypatch):
def test_make_chapter(self, metadata, book):
"""
create a chapter. By default the book has an opening section.
create a chapter.
"""
monkeypatch.setattr(Ebook, 'metadata', metadata)
book.make_chapter("Chapter name", 3)
book.metadata = metadata
book.make_chapter("Chapter One", 1)
book.make_chapter("Chapter Two", 2)
book.make_chapter("Chapter Three", 3)

assert len(book.chapters) == 3
assert len(book.book.items) == 3
assert book.chapters[2].title == "Chapter name"
assert book.chapters[2].title == "Chapter Three"
assert book.chapters[2].content == u''
assert book.chapters[2].file_name == 'chap_0003.xhtml'
2 changes: 0 additions & 2 deletions abbyy_to_epub3/tests/test_parse_abbyy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ebooklib import epub
import pytest

from abbyy_to_epub3.parse_abbyy import AbbyyParser, sanitize_xml
from abbyy_to_epub3.settings import TEST_DIR
from abbyy_to_epub3 import constants


class TestAbbyyParser(object):
Expand Down

0 comments on commit 61d0b5f

Please sign in to comment.