Skip to content

Commit

Permalink
Merge f905a8a into f39a393
Browse files Browse the repository at this point in the history
  • Loading branch information
fake-name committed Dec 22, 2018
2 parents f39a393 + f905a8a commit e166d8c
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 32 deletions.
186 changes: 163 additions & 23 deletions minecart/content.py
Expand Up @@ -26,7 +26,11 @@

import six

from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral
from pdfminer.psparser import LIT
from . import color

JPEG_FILTERS = (LIT('DCTDecode'), LIT('DCT'), LIT('JPXDecode'))


Expand Down Expand Up @@ -252,31 +256,79 @@ def __init__(self, ctm, obj):
def get_bbox(self):
    """Return the bounding box stored in ``self.bbox``."""
    return self.bbox

def as_pil(self):
"""
Return the image data in a `PIL.Image` object.
def _decode_dct(self, lti, cspace, image_data):
    '''
    Decode an embedded JPEG-style image into a PIL image.

    For an image that's an embedded JPEG (e.g. packed with the `DCTDecode`
    filter), extract the associated colorspace (if any), open the image
    stream as a buffer, apply any ICC profile that was found, and return
    the resulting PIL image.

    Parameters:
        lti: the layout image wrapper built by the caller. It is accepted
            for signature symmetry with `_decode_ppm` and is not read here.
        cspace: the image's colorspace entry. Only a `PDFObjRef` is
            inspected; any other value means the JPEG's own embedded
            colorspace is used as-is.
        image_data: the raw bytes of the image stream.

    Returns:
        A PIL image, or `None` when the colorspace is `Indexed` with a
        maximum index of 0 (i.e. there are no colors to look up).

    Raises:
        pdfminer.pdftypes.PDFNotImplementedError: for a non-empty
            `Indexed` colorspace.
        AssertionError: if an `ICCBased` colorspace array does not have
            exactly two entries.

    Requires `pillow` to be installed.
    '''
    import PIL.Image
    import PIL.ImageCms

    icc_profile = None

    # Embedded color profile.
    # JPEG has its own embedded colorspace otherwise.
    if isinstance(cspace, PDFObjRef):
        resolved = cspace.resolve()

        # Note: the colorspace flag is a pdfminer.psparser.PSLiteral whose
        # __repr__ returns a string with a leading "/". Accessing it via
        # .name yields the bare value without that slash.
        cspace_mode = resolved[0].name

        if cspace_mode == 'ICCBased':
            assert len(resolved) == 2, "ICCBased color space profiles should have only one entry. What?"
            cs_data = resolved[1].resolve()
            icc_profile = cs_data.get_data()

        elif cspace_mode == "Indexed":
            # Normalise the remaining array entries: follow indirect
            # references and unwrap literals to their plain names.
            resd = []
            for entry in resolved[1:]:
                if isinstance(entry, PDFObjRef):
                    resd.append(entry.resolve())
                elif isinstance(entry, PSLiteral):
                    resd.append(entry.name)
                else:
                    resd.append(entry)

            # Base color map,
            # hval (maximum index in the colorspace),
            # lookup table in colorspace `base` for each value in 0 -> hval
            base, hval, lookup = resd
            if hval == 0:
                # Return nothing, since there are no colors anyways.
                return None

            raise pdfminer.pdftypes.PDFNotImplementedError(
                "Interpreting non-empty indexed colorspaces not implemented yet!")

    image = PIL.Image.open(io.BytesIO(image_data))

    # If we have an ICC profile, decode it and apply it to the image.
    # The output profile is always sRGB, for simplicity.
    if icc_profile:
        in_profile = io.BytesIO(icc_profile)
        prof = PIL.ImageCms.ImageCmsProfile(in_profile)
        srgb = PIL.ImageCms.createProfile('sRGB')

        image = PIL.ImageCms.profileToProfile(image, prof, srgb)

    return image

def _decode_ppm(self, lti, colorspace, image_data):
'''
Given a embedded bitmap image, decode it as well as we can.
'''
import PIL.Image
try:
image_data = self.obj.get_data()
except pdfminer.pdftypes.PDFNotImplementedError:
filters = self.obj.get_filters()
if len(filters) == 1 and filters[0] in JPEG_FILTERS:
# FIXME: ColorSpace in JPEG2000 should be overridden by the
# ColorSpace in the Image dictionary
image_data = io.BytesIO(self.obj.rawdata)
return PIL.Image.open(image_data)
raise # We either can't handle the predictor or the filter
import PIL.ImageCms

icc_profile = None

lti = pdfminer.layout.LTImage("", self.obj, self.get_bbox())
# The PDF spec allows non-JPEG images to have 1, 2, 4, 8 or 16 bits
if isinstance(lti.colorspace, list):
colorspace = str(lti.colorspace[0])[1:] # strip leading /
else:
colorspace = str(lti.colorspace)[1:] # strip leading /
if colorspace in ('DeviceRGB', 'CalRGB', 'RGB'):
mode = "RGB"
samples = 3
Expand Down Expand Up @@ -336,6 +388,7 @@ def as_pil(self):
else:
raise pdfminer.pdftypes.PDFNotImplementedError(
"RGB images with %d-bit samples are not supported" % lti.bits)

elif colorspace in ('CalGray', 'DeviceGray'):
mode = 'L'
samples = 1
Expand All @@ -349,10 +402,19 @@ def as_pil(self):
rawmode = "L"
elif lti.bits == 16:
rawmode = "L;16"

else:

raise pdfminer.pdftypes.PDFNotImplementedError(
"Non ICC colorspace embedded images not implemented. Colorspace type: %s"
% (colorspace, ))


elif colorspace in ('DeviceCMYK', 'CMYK'):
if lti.bits != 8:
raise pdfminer.pdftypes.PDFNotImplementedError(
"PIL only supports 8-bit CMYK")

# TODO: Upcast the 1/2/4 bit image to 8 bits.
# Can PIL handle 16-bit CMYK?
mode = "CMYK"
Expand All @@ -364,14 +426,92 @@ def as_pil(self):
# The PDF spec requires each row of data to be 0-padded to be at a
# byte boundary. stride is the distance in bytes between consecutive
# rows of image data.
stride = (lti.srcsize[0] * lti.bits * samples + 7) // 8
image = PIL.Image.open(io.BytesIO(image_data))
# stride = (lti.srcsize[0] * lti.bits * samples + 7) // 8


if lti.filter == 'FlateDecode':
pass
else:
raise pdfminer.pdftypes.PDFNotImplementedError(
"Colorspace %r is not supported" % colorspace)

# if im_typ == 'data':
# import pdb
# pdb.set_trace()

# image = PIL.Image.frombuffer(mode=mode, size=lti.size, data=image_data, decoder_name="raw")
image = PIL.Image.frombuffer(mode, lti.size, image_data, 'raw', mode, 0, -1)
image = image.transpose(PIL.Image.FLIP_TOP_BOTTOM)

# If we have a ICC profile decode it and apply it to the image.
# Return type is always sRGB because lazy.
if icc_profile:
in_profile = io.BytesIO(icc_profile)
prof = PIL.ImageCms.ImageCmsProfile(in_profile)
srgb = PIL.ImageCms.createProfile('sRGB')

image = PIL.ImageCms.profileToProfile(image, prof, srgb)


return image
# TODO: implement Decode array
# TODO: implement image mask


def as_pil(self):
    """
    Return the image data in a `PIL.Image` object.

    Requires `pillow` to be installed.

    The stream is first decoded via `self.obj.get_data()`. If that raises
    `PDFNotImplementedError` and the stream has exactly one filter which is
    listed in `JPEG_FILTERS`, the raw stream bytes are handed straight to
    PIL. Otherwise decoding is dispatched on `lti.filter`:

    * `DCTDecode` / `JPXDecode` -> `_decode_dct`
    * `FlateDecode` / `LZWDecode` / `RunLengthDecode` -> `_decode_ppm`

    Raises:
        pdfminer.pdftypes.PDFNotImplementedError: if the stream cannot be
            decoded and is not a single-filter JPEG stream, if a bitmap
            image's colorspace is an indirect reference, or if the filter
            is none of the ones listed above.
        AssertionError: if the colorspace is a list with more than one
            entry, or a bitmap image's colorspace is not a `PSLiteral`.
    """
    import PIL.Image
    import PIL.ImageCms

    try:
        image_data = self.obj.get_data()
    except pdfminer.pdftypes.PDFNotImplementedError:
        filters = self.obj.get_filters()
        if len(filters) == 1 and filters[0] in JPEG_FILTERS:
            # FIXME: ColorSpace in JPEG2000 should be overridden by the
            # ColorSpace in the Image dictionary
            image_data = io.BytesIO(self.obj.rawdata)
            return PIL.Image.open(image_data)
        raise  # We either can't handle the predictor or the filter

    lti = pdfminer.layout.LTImage("", self.obj, self.get_bbox())
    # The PDF spec allows non-JPEG images to have 1, 2, 4, 8 or 16 bits
    if isinstance(lti.colorspace, list):
        assert len(lti.colorspace) == 1
        cs_inst = lti.colorspace[0]
    else:
        cs_inst = lti.colorspace

    # DCTDecode is a plain old jpeg.
    if lti.filter in ('DCTDecode', 'JPXDecode'):
        return self._decode_dct(lti, cs_inst, image_data)

    # FlateDecode seems to be the filter of choice on bitmaps.
    if lti.filter in ('FlateDecode', 'LZWDecode', 'RunLengthDecode'):
        if isinstance(cs_inst, PDFObjRef):
            raise pdfminer.pdftypes.PDFNotImplementedError(
                "Bitmap images with embedded colorspaces not supported at the moment.")

        assert isinstance(cs_inst, PSLiteral)
        colorspace = cs_inst.name

        return self._decode_ppm(lti, colorspace, image_data)

    raise pdfminer.pdftypes.PDFNotImplementedError(
        "Embedded image compressed with %s filter not supported!." % lti.filter)



class Lettering(six.text_type, GraphicsObject):

"""
Expand Down
17 changes: 9 additions & 8 deletions minecart/miner.py
Expand Up @@ -8,6 +8,8 @@
import pdfminer.pdfdevice
import pdfminer.pdfinterp
import pdfminer.pdfparser
import pdfminer.pdfpage
import pdfminer.pdfdocument
import pdfminer.pdftypes
import pdfminer.utils
import pdfminer.pdfcolor
Expand Down Expand Up @@ -255,20 +257,20 @@ def render_string_horizontal(self, *args):
def render_string_vertical(self, *args):
return self.render_string_hv('vertical', *args)

def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
    """
    Build an `LTChar` for one character and add it to the current string.

    Essentials copied from
    pdfminer.converter.PDFLayoutAnalyzer.render_char

    `ncs` and `graphicstate` are forwarded unchanged to
    `pdfminer.layout.LTChar`; the remaining arguments are the text-state
    values for the glyph identified by `cid` in `font`.

    Returns the advance (`item.adv`) of the created character.
    """
    text = font.to_unichr(cid)
    textwidth = font.char_width(cid)
    textdisp = font.char_disp(cid)
    item = pdfminer.layout.LTChar(matrix, font, fontsize, scaling, rise,
                                  text, textwidth, textdisp, ncs, graphicstate)
    self.str_container.add(item)
    return item.adv

def render_string_hv(self, hv, seq, matrix, vec, font, fontsize,
scaling, charspace, wordspace, rise,
dxscale):
dxscale, ncs, graphicstate):
"""
Calculate the bounding box in user coordinates for a string.
Expand Down Expand Up @@ -305,7 +307,7 @@ def render_string_hv(self, hv, seq, matrix, vec, font, fontsize,
vec[hv] += charspace
vec[hv] += self.render_char(
pdfminer.utils.translate_matrix(matrix, vec),
font, fontsize, scaling, rise, cid)
font, fontsize, scaling, rise, cid, ncs, graphicstate)
if cid == 32 and wordspace:
vec[hv] += wordspace
needcharspace = True
Expand All @@ -327,13 +329,12 @@ def __init__(self, pdffile):
self.device = DeviceLoader(res_mgr)
self.interpreter = ColoredInterpreter(res_mgr, self.device)
self.parser = pdfminer.pdfparser.PDFParser(pdffile)
self.doc = pdfminer.pdfparser.PDFDocument(caching=True)
self.doc = pdfminer.pdfdocument.PDFDocument(parser=self.parser, caching=True)
self.parser.set_document(self.doc)
self.doc.set_parser(self.parser)

def iter_pages(self):
    """
    Iterate through all the pages in a document.

    Each page produced by `pdfminer.pdfpage.PDFPage.create_pages` is run
    through `self.interpreter`, after which `self.device.page` is yielded.
    """
    for page in pdfminer.pdfpage.PDFPage.create_pages(self.doc):
        self.interpreter.process_page(page)
        yield self.device.page

Expand All @@ -345,7 +346,7 @@ def get_page(self, num):
display order, not the numbering system used in the document.
"""
for i, page in enumerate(self.doc.get_pages()):
for i, page in enumerate(pdfminer.pdfpage.PDFPage.create_pages(self.doc)):
if i == num:
self.interpreter.process_page(page)
return self.device.page
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -24,7 +24,7 @@
'License :: OSI Approved :: MIT License',
],
keywords='pdf pdfminer extract mining images',
install_requires=['pdfminer3k', 'six'],
install_requires=['pdfminer.six', 'six'],
extras_require={
'PIL': ['Pillow'],
},
Expand Down

0 comments on commit e166d8c

Please sign in to comment.