Merge f905a8a into f39a393

felipeochoa · Dec 22, 2018 · e166d8c · e166d8c
2 parents f39a393 + f905a8a
commit e166d8c
Show file tree

Hide file tree

Showing 3 changed files with 173 additions and 32 deletions.
diff --git a/minecart/content.py b/minecart/content.py
@@ -26,7 +26,11 @@
 
 import six
 
+from pdfminer.pdftypes import PDFObjRef
+from pdfminer.psparser import PSLiteral
 from pdfminer.psparser import LIT
+from . import color
+
 JPEG_FILTERS = (LIT('DCTDecode'), LIT('DCT'), LIT('JPXDecode'))
 
 
@@ -252,31 +256,79 @@ def __init__(self, ctm, obj):
     def get_bbox(self):
         return self.bbox
 
-    def as_pil(self):
-        """
-        Return the image data in a `PIL.Image` object.
+    def _decode_dct(self, lti, cspace, image_data):
+        '''
+        For an image that's an embedded JPEG (e.g. packed with the `DCTDecode` filter),
+        extract the associated colorspace (if any), apply it, open the image stream as a
+        buffer, and return an associated PIL image.
+        '''
+        import PIL.Image
+        import PIL.ImageCms
+
+        icc_profile = None
+
+        # Embedded color profile.
+        # JPEG has its own embedded colorspace otherwise.
+        if isinstance(cspace, PDFObjRef):
+            resolved = cspace.resolve()
+
+            # Note: The colorspace flag is a pdfminer.psparser.PSLiteral, but its
+            # __repr__ returns a string, which is confusing. Apparently the
+            # actual value has no leading /, so accessing it via .name yields
+            # the bare name (e.g. 'ICCBased').
+            cspace_mode = resolved[0].name
+
+            if cspace_mode == 'ICCBased':
+                assert len(resolved) == 2, "ICCBased color space profiles should have only one entry. What?"
+                cs_data = resolved[1].resolve()
+                icc_profile = cs_data.get_data()
+
+            elif cspace_mode == "Indexed":
+
+                resd = []
+                for tmp in resolved[1:]:
+
+                    if isinstance(tmp, PDFObjRef):
+                        resd.append(tmp.resolve())
+                    elif isinstance(tmp, PSLiteral):
+                        resd.append(tmp.name)
+                    else:
+                        resd.append(tmp)
+
+                # Base color map,
+                # hval (maximum index in the colorspace),
+                #  lookup table in colorspace `base` for each value in 0 -> hval
+                base, hval, lookup = resd
+                if hval == 0:
+                    # Return nothing, since there are no colors anyways.
+                    return None
+                else:
+
+                    raise pdfminer.pdftypes.PDFNotImplementedError(
+                        "Interpreting non-empty indexed colorspaces not implemented yet!")
 
-        Requires `pillow` to be installed.
+        image = PIL.Image.open(io.BytesIO(image_data))
 
-        """
+        # If we have an ICC profile, decode it and apply it to the image.
+        # For simplicity, the output is always converted to sRGB.
+        if icc_profile:
+            in_profile = io.BytesIO(icc_profile)
+            prof = PIL.ImageCms.ImageCmsProfile(in_profile)
+            srgb = PIL.ImageCms.createProfile('sRGB')
+
+            image = PIL.ImageCms.profileToProfile(image, prof, srgb)
+
+        return image
+
+    def _decode_ppm(self, lti, colorspace, image_data):
+        '''
+        Given an embedded bitmap image, decode it as well as we can.
+        '''
         import PIL.Image
-        try:
-            image_data = self.obj.get_data()
-        except pdfminer.pdftypes.PDFNotImplementedError:
-            filters = self.obj.get_filters()
-            if len(filters) == 1 and filters[0] in JPEG_FILTERS:
-                # FIXME: ColorSpace in JPEG2000 should be overridden by the
-                # ColorSpace in the Image dictionary
-                image_data = io.BytesIO(self.obj.rawdata)
-                return PIL.Image.open(image_data)
-            raise  # We either can't handle the predictor or the filter
+        import PIL.ImageCms
+
+        icc_profile = None
 
-        lti = pdfminer.layout.LTImage("", self.obj, self.get_bbox())
-        # The PDF spec allows non-JPEG images to have 1, 2, 4, 8 or 16 bits
-        if isinstance(lti.colorspace, list):
-            colorspace = str(lti.colorspace[0])[1:]  # strip leading /
-        else:
-            colorspace = str(lti.colorspace)[1:]  # strip leading /
         if colorspace in ('DeviceRGB', 'CalRGB', 'RGB'):
             mode = "RGB"
             samples = 3
@@ -336,6 +388,7 @@ def as_pil(self):
             else:
                 raise pdfminer.pdftypes.PDFNotImplementedError(
                     "RGB images with %d-bit samples are not supported" % lti.bits)
+
         elif colorspace in ('CalGray', 'DeviceGray'):
             mode = 'L'
             samples = 1
@@ -349,10 +402,19 @@ def as_pil(self):
                 rawmode = "L"
             elif lti.bits == 16:
                 rawmode = "L;16"
+
+            else:
+
+                raise pdfminer.pdftypes.PDFNotImplementedError(
+                    "Non ICC colorspace embedded images not implemented. Colorspace type: %s"
+                        % (colorspace, ))
+
+
         elif colorspace in ('DeviceCMYK', 'CMYK'):
             if lti.bits != 8:
                 raise pdfminer.pdftypes.PDFNotImplementedError(
                     "PIL only supports 8-bit CMYK")
+
             # TODO: Upcast the 1/2/4 bit image to 8 bits.
             # Can PIL handle 16-bit CMYK?
             mode = "CMYK"
@@ -364,14 +426,92 @@ def as_pil(self):
         # The PDF spec requires each row of data to be 0-padded to be at a
         # byte boundary. stride is the distance in bytes between consecutive
         # rows of image data.
-        stride = (lti.srcsize[0] * lti.bits * samples + 7) // 8
-        image = PIL.Image.open(io.BytesIO(image_data))
+        # stride = (lti.srcsize[0] * lti.bits * samples + 7) // 8
+
+
+        if lti.filter == 'FlateDecode':
+            pass
+        else:
+            raise pdfminer.pdftypes.PDFNotImplementedError(
+                "Colorspace %r is not supported" % colorspace)
+
+        # if im_typ == 'data':
+        #     import pdb
+        #     pdb.set_trace()
+
+        # image = PIL.Image.frombuffer(mode=mode, size=lti.size, data=image_data, decoder_name="raw")
+        image = PIL.Image.frombuffer(mode, lti.size, image_data, 'raw', mode, 0, -1)
+        image = image.transpose(PIL.Image.FLIP_TOP_BOTTOM)
+
+        # If we have an ICC profile, decode it and apply it to the image.
+        # For simplicity, the output is always converted to sRGB.
+        if icc_profile:
+            in_profile = io.BytesIO(icc_profile)
+            prof = PIL.ImageCms.ImageCmsProfile(in_profile)
+            srgb = PIL.ImageCms.createProfile('sRGB')
+
+            image = PIL.ImageCms.profileToProfile(image, prof, srgb)
+
 
         return image
         # TODO: implement Decode array
         # TODO: implement image mask
 
 
+    def as_pil(self):
+        """
+        Return the image data in a `PIL.Image` object.
+
+        Requires `pillow` to be installed.
+
+        """
+        import PIL.Image
+        import PIL.ImageCms
+        try:
+            image_data = self.obj.get_data()
+        except pdfminer.pdftypes.PDFNotImplementedError:
+            filters = self.obj.get_filters()
+            if len(filters) == 1 and filters[0] in JPEG_FILTERS:
+                # FIXME: ColorSpace in JPEG2000 should be overridden by the
+                # ColorSpace in the Image dictionary
+                image_data = io.BytesIO(self.obj.rawdata)
+                return PIL.Image.open(image_data)
+            raise  # We either can't handle the predictor or the filter
+
+
+        lti = pdfminer.layout.LTImage("", self.obj, self.get_bbox())
+        # The PDF spec allows non-JPEG images to have 1, 2, 4, 8 or 16 bits
+        if isinstance(lti.colorspace, list):
+            assert len(lti.colorspace) == 1
+            cs_inst = lti.colorspace[0]
+        else:
+            cs_inst = lti.colorspace
+
+
+        # DCTDecode is a plain old jpeg.
+        if lti.filter == 'DCTDecode' or lti.filter == 'JPXDecode':
+            return self._decode_dct(lti, cs_inst, image_data)
+
+        # FlateDecode seems to be the filter of choice on bitmaps.
+        elif lti.filter == 'FlateDecode' or lti.filter == 'LZWDecode' or lti.filter == 'RunLengthDecode':
+
+            if isinstance(cs_inst, PDFObjRef):
+                raise pdfminer.pdftypes.PDFNotImplementedError(
+                    "Bitmap images with embedded colorspaces not supported at the moment.")
+
+
+            assert isinstance(cs_inst, PSLiteral)
+            colorspace = cs_inst.name
+
+            return self._decode_ppm(lti, colorspace, image_data)
+            return self._decode_dct(lti, cs_inst, image_data)
+
+        else:
+            raise pdfminer.pdftypes.PDFNotImplementedError(
+                "Embedded image compressed with %s filter not supported!." % lti.filter)
+
+
+
 class Lettering(six.text_type, GraphicsObject):
 
     """

diff --git a/minecart/miner.py b/minecart/miner.py
@@ -8,6 +8,8 @@
 import pdfminer.pdfdevice
 import pdfminer.pdfinterp
 import pdfminer.pdfparser
+import pdfminer.pdfpage
+import pdfminer.pdfdocument
 import pdfminer.pdftypes
 import pdfminer.utils
 import pdfminer.pdfcolor
@@ -255,20 +257,20 @@ def render_string_horizontal(self, *args):
     def render_string_vertical(self, *args):
         return self.render_string_hv('vertical', *args)
 
-    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
+    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
         # Essentials copied from
         # pdfminer.converter.PDFLayoutAnalyzer.render_char
         text = font.to_unichr(cid)
         textwidth = font.char_width(cid)
         textdisp = font.char_disp(cid)
         item = pdfminer.layout.LTChar(matrix, font, fontsize, scaling, rise,
-                                      text, textwidth, textdisp)
+                                      text, textwidth, textdisp, ncs, graphicstate)
         self.str_container.add(item)
         return item.adv
 
     def render_string_hv(self, hv, seq, matrix, vec, font, fontsize,
                          scaling, charspace, wordspace, rise,
-                         dxscale):
+                         dxscale, ncs, graphicstate):
         """
         Calculate the bounding box in user coordinates for a string.
 
@@ -305,7 +307,7 @@ def render_string_hv(self, hv, seq, matrix, vec, font, fontsize,
                         vec[hv] += charspace
                     vec[hv] += self.render_char(
                         pdfminer.utils.translate_matrix(matrix, vec),
-                        font, fontsize, scaling, rise, cid)
+                        font, fontsize, scaling, rise, cid, ncs, graphicstate)
                     if cid == 32 and wordspace:
                         vec[hv] += wordspace
                     needcharspace = True
@@ -327,13 +329,12 @@ def __init__(self, pdffile):
         self.device = DeviceLoader(res_mgr)
         self.interpreter = ColoredInterpreter(res_mgr, self.device)
         self.parser = pdfminer.pdfparser.PDFParser(pdffile)
-        self.doc = pdfminer.pdfparser.PDFDocument(caching=True)
+        self.doc = pdfminer.pdfdocument.PDFDocument(parser=self.parser, caching=True)
         self.parser.set_document(self.doc)
-        self.doc.set_parser(self.parser)
 
     def iter_pages(self):
         "Iterate through all the pages in a document."
-        for page in self.doc.get_pages():
+        for page in pdfminer.pdfpage.PDFPage.create_pages(self.doc):
             self.interpreter.process_page(page)
             yield self.device.page
 
@@ -345,7 +346,7 @@ def get_page(self, num):
         display order, not the numbering system used in the document.
 
         """
-        for i, page in enumerate(self.doc.get_pages()):
+        for i, page in enumerate(pdfminer.pdfpage.PDFPage.create_pages(self.doc)):
             if i == num:
                 self.interpreter.process_page(page)
                 return self.device.page
diff --git a/setup.py b/setup.py
@@ -24,7 +24,7 @@
         'License :: OSI Approved :: MIT License',
     ],
     keywords='pdf pdfminer extract mining images',
-    install_requires=['pdfminer3k', 'six'],
+    install_requires=['pdfminer.six', 'six'],
     extras_require={
         'PIL': ['Pillow'],
     },