Merge pull request #30 from deborahgu/27-first-page-info-from-scandata

Fixes #27 -- Use scandata for cover image
deborahgu · Mar 30, 2018 · 61d0b5f · 61d0b5f
2 parents f24ecfa + fdc1041
commit 61d0b5f
Show file tree

Hide file tree

Showing 7 changed files with 89 additions and 32 deletions.
diff --git a/README.rst b/README.rst
@@ -156,10 +156,7 @@ The structure is assumed to be:
   ABBYY.
 * ``docname_jp2.zip`` unzips to a directory called ``docname_jp2``, which
   includes a number of documents in the format ``docname_####.jp2``. 
-
-  * ``docname_0000.jp2`` is scanner calibration.
-  * ``docname_0001.jp2`` is the cover image and the first image reference in the
-    ABBYY.
+* The scandata should mark one leaf as 'Cover'. If no leaf is marked 'Cover', we use the first leaf marked 'Title'; failing that, the first leaf marked 'Normal'.
 * There is a single global metadata manifest file for the entire
   item named ``{item_identifier}_meta.xml``.
 * All of the other book specific files follow the form

diff --git a/abbyy_to_epub3/constants.py b/abbyy_to_epub3/constants.py
@@ -28,8 +28,8 @@
 
 # Some page types should always be skipped
 skippable_pages = [
-    'Cover',
-    'Copyright',
-    'Color Card',
-    'Title',
+    'cover',
+    'copyright',
+    'color card',
+    'title',
 ]
diff --git a/abbyy_to_epub3/create_epub.py b/abbyy_to_epub3/create_epub.py
@@ -138,7 +138,7 @@ def __init__(
         self.progression = ''  # page direction
         self.firsts = {}       # all first lines per-page
         self.lasts = {}        # all last lines per-page
-        self.pages = dict()    # page-by-page information from scandata
+        self.pages = OrderedDict()    # page-by-page information from scandata
 
         # are there headers, footers, or page numbers?
         self.headers_present = False
@@ -298,8 +298,33 @@ def extract_images(self):
         higher premium than disk space, so unzip the entire scan file into temp
         directory, instead of extracting only the needed images.
         """
-        cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_0001.jp2".format(
-            tmp=self.tmpdir, item_bookpath=self.item_bookpath
+
+        # Try to find a cover image. If nothing is tagged as 'Cover', use
+        # the first page tagged 'Title'. If nothing is tagged as 'Title',
+        # either, use the first page tagged 'Normal'. self.pages is an
+        # OrderedDict so break as soon as you find something useful, and don't
+        # search the whole set of pages.
+
+        pages_iter = iter(self.pages)
+        for p in pages_iter:
+            if self.pages[p] == 'cover':
+                cover_leaf = p
+                break
+            elif self.pages[p] == 'title':
+                cover_leaf = p
+                break
+            elif self.pages[p] == 'normal':
+                cover_leaf = p
+                break
+        try:
+            cover_leaf
+        except NameError:
+            e = "No pages in scandata marked as Cover, Title, or Normal"
+            self.logger.error(e)
+            raise RuntimeError(e)
+
+        cover_file = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_000{num}.jp2".format(
+            tmp=self.tmpdir, item_bookpath=self.item_bookpath, num=cover_leaf
         )
         try:
             with ZipFile(self.jp2_zip) as f:
@@ -671,6 +696,8 @@ def craft_html(self):
         self.picnum = 1
         blocks_index = -1
         self.last_row = False
+        pagetype = ''
+        prev_pagetype = ''
 
         # Look for headers and page numbers
         # FR10 has markup but isn't reliable so look there as well
@@ -738,14 +765,20 @@ def craft_html(self):
         for block in self.blocks:
             blocks_index += 1
 
+            # Skip pages that we don't want to include
             if 'type' not in block:
                 continue
+            # Get the pageType from scandata
             if (
                 'page_no' in block and
-                block['page_no'] in self.pages and
-                self.pages[block['page_no']] in skippable_pages
+                block['page_no'] in self.pages
             ):
+                prev_pagetype = pagetype
+                pagetype = self.pages[block['page_no']]
+            if pagetype in skippable_pages:
                 continue
+
+            # set the block style, if there is one
             if (
                 'style' in block and
                 'fontstyle' in block['style']
@@ -770,6 +803,23 @@ def craft_html(self):
                 )
             else:
                 fstyling = ''
+
+            # Make chapters for certain page types, for accessible navigation
+            pagetypes = {
+                'contents': 'Table of Contents',
+                'contributions': 'Contributions',
+                'copyright': 'Copyright Page',
+                'glossary': 'Glossary',
+                'index': 'Index',
+                'introduction': 'Introduction',
+                'preface': 'Preface',
+                'reference': 'Reference',
+                'title': 'Title Page',
+            }
+            if (pagetype in pagetypes and pagetype != prev_pagetype):
+                chapter_no += 1
+                chapter = self.make_chapter(pagetypes[pagetype], chapter_no)
+
             if block['type'] == 'Text':
                 text = block['text']
                 role = block['role']
@@ -928,12 +978,12 @@ def craft_epub(self, epub_outfile="out.epub"):
                     for line in infile:
                         outfile.write(line)
 
-            # Extract the page images and create the cover file
-            self.extract_images()
-
             # read in the page-by-page scandata file
             self.load_scandata_pages()
 
+            # Extract the page images and create the cover file
+            self.extract_images()
+
             # parse the ABBYY
             parser = AbbyyParser(
                 self.abbyy_file,

diff --git a/abbyy_to_epub3/parse_abbyy.py b/abbyy_to_epub3/parse_abbyy.py
@@ -290,7 +290,6 @@ def parse_content(self):
         d = {'page_no': page_no}
 
         self.pages[0].clear()    # clear the memory first
-        self.pages.pop(0)    # ignore the calibration page
         for page in self.pages:
             pagewidth = page.get('width')
             pageheight = page.get('height')
@@ -483,3 +482,7 @@ def parse_metadata(self):
                 self.metadata[term.tag].append(term.text)
             else:
                 self.metadata[term.tag] = [term.text, ]
+
+        # if the language isn't explicitly set, assume English
+        if 'language' not in self.metadata:
+            self.metadata['language'] = 'eng'
diff --git a/abbyy_to_epub3/parse_scandata.py b/abbyy_to_epub3/parse_scandata.py
@@ -44,5 +44,6 @@ def parse_scandata(self):
         pagelist = self.tree.findall("./pageData/page")
         for page in pagelist:
             num = page.get('leafNum')
-            pagetype = page.find('pageType').text
+            # In case contributors use inconsistent case, lowercase pageType
+            pagetype = page.find('pageType').text.lower()
             self.pages[int(num)] = pagetype
diff --git a/abbyy_to_epub3/tests/test_create_epub.py b/abbyy_to_epub3/tests/test_create_epub.py
@@ -51,12 +51,18 @@ def paragraphs(self):
 
     @pytest.fixture
     def book(self):
-        book = Ebook(base='testing')
+        book = Ebook(
+            item_identifier="item_identifier",
+            item_dir="{}/item_dir".format(TEST_DIR),
+            item_bookpath="item_bookpath"
+        )
         return book
 
     def test_create_Ebook(self, book):
         """ Instantiate an Ebook object. """
-        assert book.base == 'testing'
+        assert book.item_identifier == 'item_identifier'
+        assert book.item_dir == '{}/item_dir'.format(TEST_DIR)
+        assert book.item_bookpath == 'item_bookpath'
 
     def test_create_accessibility_metadata(self, book):
         """ Set the accessibility metadata of a default book. """
@@ -68,9 +74,9 @@ def test_create_accessibility_metadata(self, book):
         assert accessibility_metadata[0][1] == OrderedDict(
             [('property', 'schema:accessibilitySummary')])
 
-    def test_set_metadata(self, metadata, book, monkeypatch):
+    def test_set_metadata(self, metadata, book):
         """ Verifies metadata from the dict parsed from the metadata file """
-        monkeypatch.setattr(Ebook, 'metadata', metadata)
+        book.metadata = metadata
         book.set_metadata()
 
         assert book.book.title == 'Fire'
@@ -83,9 +89,9 @@ def test_craft_html_chapters(
         self, blocks, metadata, pages, book, monkeypatch
     ):
         """ tests chapters created from the parsed blocks """
-        monkeypatch.setattr(Ebook, 'metadata', metadata)
-        monkeypatch.setattr(Ebook, 'blocks', blocks)
-        monkeypatch.setattr(Ebook, 'pages', pages)
+        book.metadata = metadata
+        book.blocks = blocks
+        book.pages = pages
         monkeypatch.setattr(Ebook, 'make_image', lambda Ebook, str: '<img />')
         book.craft_html()
 
@@ -97,15 +103,17 @@ def test_craft_html_chapters(
         ) in book.chapters[1].content
         assert book.chapters[1].file_name == 'chap_0002.xhtml'
 
-    def test_make_chapter(self, metadata, book, monkeypatch):
+    def test_make_chapter(self, metadata, book):
         """
-        create a chapter. By default the book has an opening section.
+        create a chapter.
         """
-        monkeypatch.setattr(Ebook, 'metadata', metadata)
-        book.make_chapter("Chapter name", 3)
+        book.metadata = metadata
+        book.make_chapter("Chapter One", 1)
+        book.make_chapter("Chapter Two", 2)
+        book.make_chapter("Chapter Three", 3)
 
         assert len(book.chapters) == 3
         assert len(book.book.items) == 3
-        assert book.chapters[2].title == "Chapter name"
+        assert book.chapters[2].title == "Chapter Three"
         assert book.chapters[2].content == u''
         assert book.chapters[2].file_name == 'chap_0003.xhtml'
diff --git a/abbyy_to_epub3/tests/test_parse_abbyy.py b/abbyy_to_epub3/tests/test_parse_abbyy.py
@@ -17,12 +17,10 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-from ebooklib import epub
 import pytest
 
 from abbyy_to_epub3.parse_abbyy import AbbyyParser, sanitize_xml
 from abbyy_to_epub3.settings import TEST_DIR
-from abbyy_to_epub3 import constants
 
 
 class TestAbbyyParser(object):