Permalink
Browse files

Split pdfparser.py and pdfdocument.py.

  • Loading branch information...
1 parent 1467fc6 commit 2221163b944d2fd28b9a46a6b73d049a533d009e @euske committed Oct 10, 2013
Showing with 667 additions and 649 deletions.
  1. +657 −0 pdfminer/pdfdocument.py
  2. +3 −2 pdfminer/pdfinterp.py
  3. +3 −645 pdfminer/pdfparser.py
  4. +2 −1 tools/dumppdf.py
  5. +2 −1 tools/pdf2txt.py
View
657 pdfminer/pdfdocument.py
@@ -0,0 +1,657 @@
+#!/usr/bin/env python2
+import sys
+import re
+import struct
+try:
+ import hashlib as md5
+except ImportError:
+ import md5
+from psparser import PSEOF
+from psparser import literal_name
+from psparser import LIT, KWD, STRICT
+from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
+from pdftypes import PDFObjectNotFound, PDFStream
+from pdftypes import resolve1, decipher_all
+from pdftypes import int_value, float_value, num_value
+from pdftypes import str_value, list_value, dict_value, stream_value
+from pdfparser import PDFSyntaxError
+from pdfparser import PDFStreamParser
+from arcfour import Arcfour
+from utils import choplist, nunpack
+from utils import decode_text, ObjIdRange
+
+
+## Exceptions
+##
+class PDFNoValidXRef(PDFSyntaxError): pass
+class PDFNoOutlines(PDFException): pass
+class PDFDestinationNotFound(PDFException): pass
+class PDFEncryptionError(PDFException): pass
+class PDFPasswordIncorrect(PDFEncryptionError): pass
+
+# some predefined literals and keywords.
+LITERAL_OBJSTM = LIT('ObjStm')
+LITERAL_XREF = LIT('XRef')
+LITERAL_PAGE = LIT('Page')
+LITERAL_PAGES = LIT('Pages')
+LITERAL_CATALOG = LIT('Catalog')
+
+
+## XRefs
+##
+class PDFBaseXRef(object):
+
+ def get_trailer(self):
+ raise NotImplementedError
+
+ def get_objids(self):
+ return []
+
+ def get_pos(self, objid):
+ raise KeyError(objid)
+
+
+## PDFXRef
+##
+class PDFXRef(PDFBaseXRef):
+
+ def __init__(self):
+ self.offsets = {}
+ self.trailer = {}
+ return
+
+ def __repr__(self):
+ return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
+
+ def load(self, parser, debug=0):
+ while 1:
+ try:
+ (pos, line) = parser.nextline()
+ if not line.strip(): continue
+ except PSEOF:
+ raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
+ if not line:
+ raise PDFNoValidXRef('Premature eof: %r' % parser)
+ if line.startswith('trailer'):
+ parser.seek(pos)
+ break
+ f = line.strip().split(' ')
+ if len(f) != 2:
+ raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
+ try:
+ (start, nobjs) = map(long, f)
+ except ValueError:
+ raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
+ for objid in xrange(start, start+nobjs):
+ try:
+ (_, line) = parser.nextline()
+ except PSEOF:
+ raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
+ f = line.strip().split(' ')
+ if len(f) != 3:
+ raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
+ (pos, genno, use) = f
+ if use != 'n': continue
+ self.offsets[objid] = (int(genno), long(pos))
+ if 1 <= debug:
+ print >>sys.stderr, 'xref objects:', self.offsets
+ self.load_trailer(parser)
+ return
+
+ KEYWORD_TRAILER = KWD('trailer')
+ def load_trailer(self, parser):
+ try:
+ (_,kwd) = parser.nexttoken()
+ assert kwd is self.KEYWORD_TRAILER
+ (_,dic) = parser.nextobject()
+ except PSEOF:
+ x = parser.pop(1)
+ if not x:
+ raise PDFNoValidXRef('Unexpected EOF - file corrupted')
+ (_,dic) = x[0]
+ self.trailer.update(dict_value(dic))
+ return
+
+ PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
+ def load_fallback(self, parser, debug=0):
+ parser.seek(0)
+ while 1:
+ try:
+ (pos, line) = parser.nextline()
+ except PSEOF:
+ break
+ if line.startswith('trailer'):
+ parser.seek(pos)
+ self.load_trailer(parser)
+ if 1 <= debug:
+ print >>sys.stderr, 'trailer: %r' % self.get_trailer()
+ break
+ m = self.PDFOBJ_CUE.match(line)
+ if not m: continue
+ (objid, genno) = m.groups()
+ self.offsets[int(objid)] = (0, pos)
+ return
+
+ def get_trailer(self):
+ return self.trailer
+
+ def get_objids(self):
+ return self.offsets.iterkeys()
+
+ def get_pos(self, objid):
+ try:
+ (genno, pos) = self.offsets[objid]
+ except KeyError:
+ raise
+ return (None, pos)
+
+
+## PDFXRefStream
+##
+class PDFXRefStream(PDFBaseXRef):
+
+ def __init__(self):
+ self.data = None
+ self.entlen = None
+ self.fl1 = self.fl2 = self.fl3 = None
+ self.objid_ranges = []
+ return
+
+ def __repr__(self):
+ return '<PDFXRefStream: fields=%d,%d,%d>' % (self.fl1, self.fl2, self.fl3)
+
+ def load(self, parser, debug=0):
+ (_,objid) = parser.nexttoken() # ignored
+ (_,genno) = parser.nexttoken() # ignored
+ (_,kwd) = parser.nexttoken()
+ (_,stream) = parser.nextobject()
+ if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
+ raise PDFNoValidXRef('Invalid PDF stream spec.')
+ size = stream['Size']
+ index_array = stream.get('Index', (0,size))
+ if len(index_array) % 2 != 0:
+ raise PDFSyntaxError('Invalid index number')
+ self.objid_ranges.extend( ObjIdRange(start, nobjs)
+ for (start,nobjs) in choplist(2, index_array) )
+ (self.fl1, self.fl2, self.fl3) = stream['W']
+ self.data = stream.get_data()
+ self.entlen = self.fl1+self.fl2+self.fl3
+ self.trailer = stream.attrs
+ if 1 <= debug:
+ print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
+ (', '.join(map(repr, self.objid_ranges)),
+ self.fl1, self.fl2, self.fl3))
+ return
+
+ def get_trailer(self):
+ return self.trailer
+
+ def get_objids(self):
+ for objid_range in self.objid_ranges:
+ for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1):
+ yield x
+ return
+
+ def get_pos(self, objid):
+ offset = 0
+ found = False
+ for objid_range in self.objid_ranges:
+ if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id():
+ offset += objid - objid_range.get_start_id()
+ found = True
+ break
+ else:
+ offset += objid_range.get_nobjs()
+ if not found: raise KeyError(objid)
+ i = self.entlen * offset
+ ent = self.data[i:i+self.entlen]
+ f1 = nunpack(ent[:self.fl1], 1)
+ if f1 == 1:
+ pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
+ genno = nunpack(ent[self.fl1+self.fl2:])
+ return (None, pos)
+ elif f1 == 2:
+ objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
+ index = nunpack(ent[self.fl1+self.fl2:])
+ return (objid, index)
+ # this is a free object
+ raise KeyError(objid)
+
+
+## PDFPage
+##
+class PDFPage(object):
+
+ """An object that holds the information about a page.
+
+ A PDFPage object is merely a convenience class that has a set
+ of keys and values, which describe the properties of a page
+ and point to its contents.
+
+ Attributes:
+ doc: a PDFDocument object.
+ pageid: any Python object that can uniquely identify the page.
+ attrs: a dictionary of page attributes.
+ contents: a list of PDFStream objects that represents the page content.
+ lastmod: the last modified time of the page.
+ resources: a list of resources used by the page.
+ mediabox: the physical size of the page.
+ cropbox: the crop rectangle of the page.
+ rotate: the page rotation (in degree).
+ annots: the page annotations.
+ beads: a chain that represents natural reading order.
+ """
+
+ def __init__(self, doc, pageid, attrs):
+ """Initialize a page object.
+
+ doc: a PDFDocument object.
+ pageid: any Python object that can uniquely identify the page.
+ attrs: a dictionary of page attributes.
+ """
+ self.doc = doc
+ self.pageid = pageid
+ self.attrs = dict_value(attrs)
+ self.lastmod = resolve1(self.attrs.get('LastModified'))
+ self.resources = resolve1(self.attrs['Resources'])
+ self.mediabox = resolve1(self.attrs['MediaBox'])
+ if 'CropBox' in self.attrs:
+ self.cropbox = resolve1(self.attrs['CropBox'])
+ else:
+ self.cropbox = self.mediabox
+ self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
+ self.annots = self.attrs.get('Annots')
+ self.beads = self.attrs.get('B')
+ if 'Contents' in self.attrs:
+ contents = resolve1(self.attrs['Contents'])
+ else:
+ contents = []
+ if not isinstance(contents, list):
+ contents = [ contents ]
+ self.contents = contents
+ return
+
+ def __repr__(self):
+ return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
+
+
+## PDFDocument
+##
+class PDFDocument(object):
+
+ """PDFDocument object represents a PDF document.
+
+ Since a PDF file can be very big, normally it is not loaded at
+ once. So PDF document has to cooperate with a PDF parser in order to
+ dynamically import the data as processing goes.
+
+ Typical usage:
+ doc = PDFDocument()
+ doc.set_parser(parser)
+ doc.initialize(password)
+ obj = doc.getobj(objid)
+
+ """
+
+ debug = 0
+
+ def __init__(self, caching=True):
+ self.caching = caching
+ self.xrefs = []
+ self.info = []
+ self.catalog = None
+ self.encryption = None
+ self.decipher = None
+ self._parser = None
+ self._cached_objs = {}
+ self._parsed_objs = {}
+ return
+
+ def set_parser(self, parser, fallback=True):
+ "Set the document to use a given PDFParser object."
+ if self._parser: return
+ self._parser = parser
+ # Retrieve the information of each header that was appended
+ # (maybe multiple times) at the end of the document.
+ try:
+ self.xrefs = self.read_xref(parser)
+ except PDFNoValidXRef:
+ fallback = True
+ if fallback:
+ self.xrefs.extend(self.read_xref(parser, fallback=True))
+ for xref in self.xrefs:
+ trailer = xref.get_trailer()
+ if not trailer: continue
+ # If there's an encryption info, remember it.
+ if 'Encrypt' in trailer:
+ #assert not self.encryption
+ self.encryption = (list_value(trailer['ID']),
+ dict_value(trailer['Encrypt']))
+ if 'Info' in trailer:
+ self.info.append(dict_value(trailer['Info']))
+ if 'Root' in trailer:
+ # Every PDF file must have exactly one /Root dictionary.
+ self.catalog = dict_value(trailer['Root'])
+ break
+ else:
+ raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
+ if self.catalog.get('Type') is not LITERAL_CATALOG:
+ if STRICT:
+ raise PDFSyntaxError('Catalog not found!')
+ return
+
+ # initialize(password='')
+ # Perform the initialization with a given password.
+ # This step is mandatory even if there's no password associated
+ # with the document.
+ PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
+ def initialize(self, password=''):
+ if not self.encryption:
+ self.is_printable = self.is_modifiable = self.is_extractable = True
+ return
+ (docid, param) = self.encryption
+ if literal_name(param.get('Filter')) != 'Standard':
+ raise PDFEncryptionError('Unknown filter: param=%r' % param)
+ V = int_value(param.get('V', 0))
+ if not (V == 1 or V == 2):
+ raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
+ length = int_value(param.get('Length', 40)) # Key length (bits)
+ O = str_value(param['O'])
+ R = int_value(param['R']) # Revision
+ if 5 <= R:
+ raise PDFEncryptionError('Unknown revision: %r' % R)
+ U = str_value(param['U'])
+ P = int_value(param['P'])
+ self.is_printable = bool(P & 4)
+ self.is_modifiable = bool(P & 8)
+ self.is_extractable = bool(P & 16)
+ # Algorithm 3.2
+ password = (password+self.PASSWORD_PADDING)[:32] # 1
+ hash = md5.md5(password) # 2
+ hash.update(O) # 3
+ hash.update(struct.pack('<l', P)) # 4
+ hash.update(docid[0]) # 5
+ if 4 <= R:
+ # 6
+ raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
+ if 3 <= R:
+ # 8
+ for _ in xrange(50):
+ hash = md5.md5(hash.digest()[:length/8])
+ key = hash.digest()[:length/8]
+ if R == 2:
+ # Algorithm 3.4
+ u1 = Arcfour(key).process(self.PASSWORD_PADDING)
+ elif R == 3:
+ # Algorithm 3.5
+ hash = md5.md5(self.PASSWORD_PADDING) # 2
+ hash.update(docid[0]) # 3
+ x = Arcfour(key).process(hash.digest()[:16]) # 4
+ for i in xrange(1,19+1):
+ k = ''.join( chr(ord(c) ^ i) for c in key )
+ x = Arcfour(k).process(x)
+ u1 = x+x # 32bytes total
+ if R == 2:
+ is_authenticated = (u1 == U)
+ else:
+ is_authenticated = (u1[:16] == U[:16])
+ if not is_authenticated:
+ raise PDFPasswordIncorrect
+ self.decrypt_key = key
+ self.decipher = self.decrypt_rc4 # XXX may be AES
+ return
+
+ def decrypt_rc4(self, objid, genno, data):
+ key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
+ hash = md5.md5(key)
+ key = hash.digest()[:min(len(key),16)]
+ return Arcfour(key).process(data)
+
+ KEYWORD_OBJ = KWD('obj')
+ # can raise PDFObjectNotFound
+ def getobj(self, objid):
+ if not self.xrefs:
+ raise PDFException('PDFDocument is not initialized')
+ if 2 <= self.debug:
+ print >>sys.stderr, 'getobj: objid=%r' % (objid)
+ if objid in self._cached_objs:
+ genno = 0
+ obj = self._cached_objs[objid]
+ else:
+ for xref in self.xrefs:
+ try:
+ (strmid, index) = xref.get_pos(objid)
+ break
+ except KeyError:
+ pass
+ else:
+ raise PDFObjectNotFound(objid)
+ if strmid:
+ stream = stream_value(self.getobj(strmid))
+ if stream.get('Type') is not LITERAL_OBJSTM:
+ if STRICT:
+ raise PDFSyntaxError('Not a stream object: %r' % stream)
+ try:
+ n = stream['N']
+ except KeyError:
+ if STRICT:
+ raise PDFSyntaxError('N is not defined: %r' % stream)
+ n = 0
+ if strmid in self._parsed_objs:
+ objs = self._parsed_objs[strmid]
+ else:
+ parser = PDFStreamParser(stream.get_data())
+ parser.set_document(self)
+ objs = []
+ try:
+ while 1:
+ (_,obj) = parser.nextobject()
+ objs.append(obj)
+ except PSEOF:
+ pass
+ if self.caching:
+ self._parsed_objs[strmid] = objs
+ genno = 0
+ i = n*2+index
+ try:
+ obj = objs[i]
+ except IndexError:
+ raise PDFObjectNotFound(objid)
+ if isinstance(obj, PDFStream):
+ obj.set_objid(objid, 0)
+ else:
+ self._parser.seek(index)
+ try:
+ (_,objid1) = self._parser.nexttoken() # objid
+ (_,genno) = self._parser.nexttoken() # genno
+ (_,kwd) = self._parser.nexttoken()
+ # #### hack around malformed pdf files
+ #assert objid1 == objid, (objid, objid1)
+ if objid1 != objid:
+ x = []
+ while kwd is not self.KEYWORD_OBJ:
+ (_,kwd) = self._parser.nexttoken()
+ x.append(kwd)
+ if x:
+ objid1 = x[-2]
+ genno = x[-1]
+ # #### end hack around malformed pdf files
+ if kwd is not self.KEYWORD_OBJ:
+ raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
+ (_,obj) = self._parser.nextobject()
+ if isinstance(obj, PDFStream):
+ obj.set_objid(objid, genno)
+ except PSEOF:
+ raise PDFObjectNotFound(objid)
+ if 2 <= self.debug:
+ print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
+ if self.caching:
+ self._cached_objs[objid] = obj
+ if self.decipher:
+ obj = decipher_all(self.decipher, objid, genno, obj)
+ return obj
+
+ INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
+ def get_pages(self):
+ if not self.xrefs:
+ raise PDFException('PDFDocument is not initialized')
+ def search(obj, parent):
+ if isinstance(obj, int):
+ objid = obj
+ tree = dict_value(self.getobj(objid)).copy()
+ else:
+ objid = obj.objid
+ tree = dict_value(obj).copy()
+ for (k,v) in parent.iteritems():
+ if k in self.INHERITABLE_ATTRS and k not in tree:
+ tree[k] = v
+ if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
+ if 1 <= self.debug:
+ print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
+ for c in list_value(tree['Kids']):
+ for x in search(c, tree):
+ yield x
+ elif tree.get('Type') is LITERAL_PAGE:
+ if 1 <= self.debug:
+ print >>sys.stderr, 'Page: %r' % tree
+ yield (objid, tree)
+ pages = False
+ if 'Pages' in self.catalog:
+ for (objid,tree) in search(self.catalog['Pages'], self.catalog):
+ yield PDFPage(self, objid, tree)
+ pages = True
+ if not pages:
+ # fallback when /Pages is missing.
+ for xref in self.xrefs:
+ for objid in xref.get_objids():
+ try:
+ obj = self.getobj(objid)
+ if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
+ yield PDFPage(self, objid, obj)
+ except PDFObjectNotFound:
+ pass
+ return
+
+ def get_outlines(self):
+ if 'Outlines' not in self.catalog:
+ raise PDFNoOutlines
+ def search(entry, level):
+ entry = dict_value(entry)
+ if 'Title' in entry:
+ if 'A' in entry or 'Dest' in entry:
+ title = decode_text(str_value(entry['Title']))
+ dest = entry.get('Dest')
+ action = entry.get('A')
+ se = entry.get('SE')
+ yield (level, title, dest, action, se)
+ if 'First' in entry and 'Last' in entry:
+ for x in search(entry['First'], level+1):
+ yield x
+ if 'Next' in entry:
+ for x in search(entry['Next'], level):
+ yield x
+ return
+ return search(self.catalog['Outlines'], 0)
+
+ def lookup_name(self, cat, key):
+ try:
+ names = dict_value(self.catalog['Names'])
+ except (PDFTypeError, KeyError):
+ raise KeyError((cat,key))
+ # may raise KeyError
+ d0 = dict_value(names[cat])
+ def lookup(d):
+ if 'Limits' in d:
+ (k1,k2) = list_value(d['Limits'])
+ if key < k1 or k2 < key: return None
+ if 'Names' in d:
+ objs = list_value(d['Names'])
+ names = dict(choplist(2, objs))
+ return names[key]
+ if 'Kids' in d:
+ for c in list_value(d['Kids']):
+ v = lookup(dict_value(c))
+ if v: return v
+ raise KeyError((cat,key))
+ return lookup(d0)
+
+ def get_dest(self, name):
+ try:
+ # PDF-1.2 or later
+ obj = self.lookup_name('Dests', name)
+ except KeyError:
+ # PDF-1.1 or prior
+ if 'Dests' not in self.catalog:
+ raise PDFDestinationNotFound(name)
+ d0 = dict_value(self.catalog['Dests'])
+ if name not in d0:
+ raise PDFDestinationNotFound(name)
+ obj = d0[name]
+ return obj
+
+ # find_xref
+ def find_xref(self, parser):
+ """Internal function used to locate the first XRef."""
+ # search the last xref table by scanning the file backwards.
+ prev = None
+ for line in parser.revreadlines():
+ line = line.strip()
+ if 2 <= self.debug:
+ print >>sys.stderr, 'find_xref: %r' % line
+ if line == 'startxref': break
+ if line:
+ prev = line
+ else:
+ raise PDFNoValidXRef('Unexpected EOF')
+ if 1 <= self.debug:
+ print >>sys.stderr, 'xref found: pos=%r' % prev
+ return long(prev)
+
+ # read xref table
+ def read_xref_from(self, parser, start, xrefs):
+ """Reads XRefs from the given location."""
+ parser.seek(start)
+ parser.reset()
+ try:
+ (pos, token) = parser.nexttoken()
+ except PSEOF:
+ raise PDFNoValidXRef('Unexpected EOF')
+ if 2 <= self.debug:
+ print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
+ if isinstance(token, int):
+ # XRefStream: PDF-1.5
+ parser.seek(pos)
+ parser.reset()
+ xref = PDFXRefStream()
+ xref.load(parser, debug=self.debug)
+ else:
+ if token is parser.KEYWORD_XREF:
+ parser.nextline()
+ xref = PDFXRef()
+ xref.load(parser, debug=self.debug)
+ xrefs.append(xref)
+ trailer = xref.get_trailer()
+ if 1 <= self.debug:
+ print >>sys.stderr, 'trailer: %r' % trailer
+ if 'XRefStm' in trailer:
+ pos = int_value(trailer['XRefStm'])
+ self.read_xref_from(parser, pos, xrefs)
+ if 'Prev' in trailer:
+ # find previous xref
+ pos = int_value(trailer['Prev'])
+ self.read_xref_from(parser, pos, xrefs)
+ return
+
+ # read xref tables and trailers
+ def read_xref(self, parser, fallback=False):
+ """Reads all the XRefs in the PDF file and returns them."""
+ xrefs = []
+ parser.fallback = fallback
+ if parser.fallback:
+ xref = PDFXRef()
+ xref.load_fallback(parser)
+ xrefs.append(xref)
+ else:
+ pos = self.find_xref(parser)
+ self.read_xref_from(parser, pos, xrefs)
+ return xrefs
View
5 pdfminer/pdfinterp.py
@@ -17,8 +17,9 @@
from pdffont import PDFFontError
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
from pdffont import PDFCIDFont
-from pdfparser import PDFDocument, PDFParser
-from pdfparser import PDFPasswordIncorrect
+from pdfparser import PDFParser
+from pdfdocument import PDFDocument
+from pdfdocument import PDFPasswordIncorrect
from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
View
648 pdfminer/pdfparser.py
@@ -1,597 +1,21 @@
#!/usr/bin/env python2
import sys
-import re
-import struct
-try:
- import hashlib as md5
-except ImportError:
- import md5
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSStackParser
from psparser import PSSyntaxError, PSEOF
-from psparser import literal_name
-from psparser import LIT, KWD, STRICT
-from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
-from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef
-from pdftypes import resolve1, decipher_all
+from psparser import KWD, STRICT
+from pdftypes import PDFException
+from pdftypes import PDFStream, PDFObjRef
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
-from arcfour import Arcfour
-from utils import choplist, nunpack
-from utils import decode_text, ObjIdRange
## Exceptions
##
class PDFSyntaxError(PDFException): pass
-class PDFNoValidXRef(PDFSyntaxError): pass
-class PDFNoOutlines(PDFException): pass
-class PDFDestinationNotFound(PDFException): pass
-class PDFEncryptionError(PDFException): pass
-class PDFPasswordIncorrect(PDFEncryptionError): pass
-
-# some predefined literals and keywords.
-LITERAL_OBJSTM = LIT('ObjStm')
-LITERAL_XREF = LIT('XRef')
-LITERAL_PAGE = LIT('Page')
-LITERAL_PAGES = LIT('Pages')
-LITERAL_CATALOG = LIT('Catalog')
-
-
-## XRefs
-##
-class PDFBaseXRef(object):
-
- def get_trailer(self):
- raise NotImplementedError
-
- def get_objids(self):
- return []
-
- def get_pos(self, objid):
- raise KeyError(objid)
-
-
-## PDFXRef
-##
-class PDFXRef(PDFBaseXRef):
-
- def __init__(self):
- self.offsets = {}
- self.trailer = {}
- return
-
- def __repr__(self):
- return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
-
- def load(self, parser, debug=0):
- while 1:
- try:
- (pos, line) = parser.nextline()
- if not line.strip(): continue
- except PSEOF:
- raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
- if not line:
- raise PDFNoValidXRef('Premature eof: %r' % parser)
- if line.startswith('trailer'):
- parser.seek(pos)
- break
- f = line.strip().split(' ')
- if len(f) != 2:
- raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
- try:
- (start, nobjs) = map(long, f)
- except ValueError:
- raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
- for objid in xrange(start, start+nobjs):
- try:
- (_, line) = parser.nextline()
- except PSEOF:
- raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
- f = line.strip().split(' ')
- if len(f) != 3:
- raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
- (pos, genno, use) = f
- if use != 'n': continue
- self.offsets[objid] = (int(genno), long(pos))
- if 1 <= debug:
- print >>sys.stderr, 'xref objects:', self.offsets
- self.load_trailer(parser)
- return
-
- KEYWORD_TRAILER = KWD('trailer')
- def load_trailer(self, parser):
- try:
- (_,kwd) = parser.nexttoken()
- assert kwd is self.KEYWORD_TRAILER
- (_,dic) = parser.nextobject()
- except PSEOF:
- x = parser.pop(1)
- if not x:
- raise PDFNoValidXRef('Unexpected EOF - file corrupted')
- (_,dic) = x[0]
- self.trailer.update(dict_value(dic))
- return
-
- PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
- def load_fallback(self, parser, debug=0):
- parser.seek(0)
- while 1:
- try:
- (pos, line) = parser.nextline()
- except PSEOF:
- break
- if line.startswith('trailer'):
- parser.seek(pos)
- self.load_trailer(parser)
- if 1 <= debug:
- print >>sys.stderr, 'trailer: %r' % self.get_trailer()
- break
- m = self.PDFOBJ_CUE.match(line)
- if not m: continue
- (objid, genno) = m.groups()
- self.offsets[int(objid)] = (0, pos)
- return
-
- def get_trailer(self):
- return self.trailer
-
- def get_objids(self):
- return self.offsets.iterkeys()
-
- def get_pos(self, objid):
- try:
- (genno, pos) = self.offsets[objid]
- except KeyError:
- raise
- return (None, pos)
-
-
-## PDFXRefStream
-##
-class PDFXRefStream(PDFBaseXRef):
-
- def __init__(self):
- self.data = None
- self.entlen = None
- self.fl1 = self.fl2 = self.fl3 = None
- self.objid_ranges = []
- return
-
- def __repr__(self):
- return '<PDFXRefStream: fields=%d,%d,%d>' % (self.fl1, self.fl2, self.fl3)
-
- def load(self, parser, debug=0):
- (_,objid) = parser.nexttoken() # ignored
- (_,genno) = parser.nexttoken() # ignored
- (_,kwd) = parser.nexttoken()
- (_,stream) = parser.nextobject()
- if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
- raise PDFNoValidXRef('Invalid PDF stream spec.')
- size = stream['Size']
- index_array = stream.get('Index', (0,size))
- if len(index_array) % 2 != 0:
- raise PDFSyntaxError('Invalid index number')
- self.objid_ranges.extend( ObjIdRange(start, nobjs)
- for (start,nobjs) in choplist(2, index_array) )
- (self.fl1, self.fl2, self.fl3) = stream['W']
- self.data = stream.get_data()
- self.entlen = self.fl1+self.fl2+self.fl3
- self.trailer = stream.attrs
- if 1 <= debug:
- print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
- (', '.join(map(repr, self.objid_ranges)),
- self.fl1, self.fl2, self.fl3))
- return
-
- def get_trailer(self):
- return self.trailer
-
- def get_objids(self):
- for objid_range in self.objid_ranges:
- for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1):
- yield x
- return
-
- def get_pos(self, objid):
- offset = 0
- found = False
- for objid_range in self.objid_ranges:
- if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id():
- offset += objid - objid_range.get_start_id()
- found = True
- break
- else:
- offset += objid_range.get_nobjs()
- if not found: raise KeyError(objid)
- i = self.entlen * offset
- ent = self.data[i:i+self.entlen]
- f1 = nunpack(ent[:self.fl1], 1)
- if f1 == 1:
- pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
- genno = nunpack(ent[self.fl1+self.fl2:])
- return (None, pos)
- elif f1 == 2:
- objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
- index = nunpack(ent[self.fl1+self.fl2:])
- return (objid, index)
- # this is a free object
- raise KeyError(objid)
-
-
-## PDFPage
-##
-class PDFPage(object):
-
- """An object that holds the information about a page.
-
- A PDFPage object is merely a convenience class that has a set
- of keys and values, which describe the properties of a page
- and point to its contents.
-
- Attributes:
- doc: a PDFDocument object.
- pageid: any Python object that can uniquely identify the page.
- attrs: a dictionary of page attributes.
- contents: a list of PDFStream objects that represents the page content.
- lastmod: the last modified time of the page.
- resources: a list of resources used by the page.
- mediabox: the physical size of the page.
- cropbox: the crop rectangle of the page.
- rotate: the page rotation (in degree).
- annots: the page annotations.
- beads: a chain that represents natural reading order.
- """
-
- def __init__(self, doc, pageid, attrs):
- """Initialize a page object.
-
- doc: a PDFDocument object.
- pageid: any Python object that can uniquely identify the page.
- attrs: a dictionary of page attributes.
- """
- self.doc = doc
- self.pageid = pageid
- self.attrs = dict_value(attrs)
- self.lastmod = resolve1(self.attrs.get('LastModified'))
- self.resources = resolve1(self.attrs['Resources'])
- self.mediabox = resolve1(self.attrs['MediaBox'])
- if 'CropBox' in self.attrs:
- self.cropbox = resolve1(self.attrs['CropBox'])
- else:
- self.cropbox = self.mediabox
- self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
- self.annots = self.attrs.get('Annots')
- self.beads = self.attrs.get('B')
- if 'Contents' in self.attrs:
- contents = resolve1(self.attrs['Contents'])
- else:
- contents = []
- if not isinstance(contents, list):
- contents = [ contents ]
- self.contents = contents
- return
-
- def __repr__(self):
- return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
-
-
-## PDFDocument
-##
-class PDFDocument(object):
-
- """PDFDocument object represents a PDF document.
-
- Since a PDF file can be very big, normally it is not loaded at
- once. So PDF document has to cooperate with a PDF parser in order to
- dynamically import the data as processing goes.
-
- Typical usage:
- doc = PDFDocument()
- doc.set_parser(parser)
- doc.initialize(password)
- obj = doc.getobj(objid)
-
- """
-
- debug = 0
-
- def __init__(self, caching=True):
- self.caching = caching
- self.xrefs = []
- self.info = []
- self.catalog = None
- self.encryption = None
- self.decipher = None
- self._parser = None
- self._cached_objs = {}
- self._parsed_objs = {}
- return
-
- def set_parser(self, parser, fallback=True):
- "Set the document to use a given PDFParser object."
- if self._parser: return
- self._parser = parser
- # Retrieve the information of each header that was appended
- # (maybe multiple times) at the end of the document.
- try:
- self.xrefs = parser.read_xref()
- except PDFNoValidXRef:
- fallback = True
- if fallback:
- self.xrefs.extend(parser.read_xref(fallback=True))
- for xref in self.xrefs:
- trailer = xref.get_trailer()
- if not trailer: continue
- # If there's an encryption info, remember it.
- if 'Encrypt' in trailer:
- #assert not self.encryption
- self.encryption = (list_value(trailer['ID']),
- dict_value(trailer['Encrypt']))
- if 'Info' in trailer:
- self.info.append(dict_value(trailer['Info']))
- if 'Root' in trailer:
- # Every PDF file must have exactly one /Root dictionary.
- self.catalog = dict_value(trailer['Root'])
- break
- else:
- raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
- if self.catalog.get('Type') is not LITERAL_CATALOG:
- if STRICT:
- raise PDFSyntaxError('Catalog not found!')
- return
-
- # initialize(password='')
- # Perform the initialization with a given password.
- # This step is mandatory even if there's no password associated
- # with the document.
- PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
- def initialize(self, password=''):
- if not self.encryption:
- self.is_printable = self.is_modifiable = self.is_extractable = True
- return
- (docid, param) = self.encryption
- if literal_name(param.get('Filter')) != 'Standard':
- raise PDFEncryptionError('Unknown filter: param=%r' % param)
- V = int_value(param.get('V', 0))
- if not (V == 1 or V == 2):
- raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
- length = int_value(param.get('Length', 40)) # Key length (bits)
- O = str_value(param['O'])
- R = int_value(param['R']) # Revision
- if 5 <= R:
- raise PDFEncryptionError('Unknown revision: %r' % R)
- U = str_value(param['U'])
- P = int_value(param['P'])
- self.is_printable = bool(P & 4)
- self.is_modifiable = bool(P & 8)
- self.is_extractable = bool(P & 16)
- # Algorithm 3.2
- password = (password+self.PASSWORD_PADDING)[:32] # 1
- hash = md5.md5(password) # 2
- hash.update(O) # 3
- hash.update(struct.pack('<l', P)) # 4
- hash.update(docid[0]) # 5
- if 4 <= R:
- # 6
- raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
- if 3 <= R:
- # 8
- for _ in xrange(50):
- hash = md5.md5(hash.digest()[:length/8])
- key = hash.digest()[:length/8]
- if R == 2:
- # Algorithm 3.4
- u1 = Arcfour(key).process(self.PASSWORD_PADDING)
- elif R == 3:
- # Algorithm 3.5
- hash = md5.md5(self.PASSWORD_PADDING) # 2
- hash.update(docid[0]) # 3
- x = Arcfour(key).process(hash.digest()[:16]) # 4
- for i in xrange(1,19+1):
- k = ''.join( chr(ord(c) ^ i) for c in key )
- x = Arcfour(k).process(x)
- u1 = x+x # 32bytes total
- if R == 2:
- is_authenticated = (u1 == U)
- else:
- is_authenticated = (u1[:16] == U[:16])
- if not is_authenticated:
- raise PDFPasswordIncorrect
- self.decrypt_key = key
- self.decipher = self.decrypt_rc4 # XXX may be AES
- return
-
- def decrypt_rc4(self, objid, genno, data):
- key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
- hash = md5.md5(key)
- key = hash.digest()[:min(len(key),16)]
- return Arcfour(key).process(data)
-
- KEYWORD_OBJ = KWD('obj')
- # can raise PDFObjectNotFound
- def getobj(self, objid):
- if not self.xrefs:
- raise PDFException('PDFDocument is not initialized')
- if 2 <= self.debug:
- print >>sys.stderr, 'getobj: objid=%r' % (objid)
- if objid in self._cached_objs:
- genno = 0
- obj = self._cached_objs[objid]
- else:
- for xref in self.xrefs:
- try:
- (strmid, index) = xref.get_pos(objid)
- break
- except KeyError:
- pass
- else:
- raise PDFObjectNotFound(objid)
- if strmid:
- stream = stream_value(self.getobj(strmid))
- if stream.get('Type') is not LITERAL_OBJSTM:
- if STRICT:
- raise PDFSyntaxError('Not a stream object: %r' % stream)
- try:
- n = stream['N']
- except KeyError:
- if STRICT:
- raise PDFSyntaxError('N is not defined: %r' % stream)
- n = 0
- if strmid in self._parsed_objs:
- objs = self._parsed_objs[strmid]
- else:
- parser = PDFStreamParser(stream.get_data())
- parser.set_document(self)
- objs = []
- try:
- while 1:
- (_,obj) = parser.nextobject()
- objs.append(obj)
- except PSEOF:
- pass
- if self.caching:
- self._parsed_objs[strmid] = objs
- genno = 0
- i = n*2+index
- try:
- obj = objs[i]
- except IndexError:
- raise PDFObjectNotFound(objid)
- if isinstance(obj, PDFStream):
- obj.set_objid(objid, 0)
- else:
- self._parser.seek(index)
- try:
- (_,objid1) = self._parser.nexttoken() # objid
- (_,genno) = self._parser.nexttoken() # genno
- (_,kwd) = self._parser.nexttoken()
- # #### hack around malformed pdf files
- #assert objid1 == objid, (objid, objid1)
- if objid1 != objid:
- x = []
- while kwd is not self.KEYWORD_OBJ:
- (_,kwd) = self._parser.nexttoken()
- x.append(kwd)
- if x:
- objid1 = x[-2]
- genno = x[-1]
- # #### end hack around malformed pdf files
- if kwd is not self.KEYWORD_OBJ:
- raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
- (_,obj) = self._parser.nextobject()
- if isinstance(obj, PDFStream):
- obj.set_objid(objid, genno)
- except PSEOF:
- raise PDFObjectNotFound(objid)
- if 2 <= self.debug:
- print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
- if self.caching:
- self._cached_objs[objid] = obj
- if self.decipher:
- obj = decipher_all(self.decipher, objid, genno, obj)
- return obj
-
- INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
- def get_pages(self):
- if not self.xrefs:
- raise PDFException('PDFDocument is not initialized')
- def search(obj, parent):
- if isinstance(obj, int):
- objid = obj
- tree = dict_value(self.getobj(objid)).copy()
- else:
- objid = obj.objid
- tree = dict_value(obj).copy()
- for (k,v) in parent.iteritems():
- if k in self.INHERITABLE_ATTRS and k not in tree:
- tree[k] = v
- if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
- if 1 <= self.debug:
- print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
- for c in list_value(tree['Kids']):
- for x in search(c, tree):
- yield x
- elif tree.get('Type') is LITERAL_PAGE:
- if 1 <= self.debug:
- print >>sys.stderr, 'Page: %r' % tree
- yield (objid, tree)
- pages = False
- if 'Pages' in self.catalog:
- for (objid,tree) in search(self.catalog['Pages'], self.catalog):
- yield PDFPage(self, objid, tree)
- pages = True
- if not pages:
- # fallback when /Pages is missing.
- for xref in self.xrefs:
- for objid in xref.get_objids():
- try:
- obj = self.getobj(objid)
- if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
- yield PDFPage(self, objid, obj)
- except PDFObjectNotFound:
- pass
- return
-
- def get_outlines(self):
- if 'Outlines' not in self.catalog:
- raise PDFNoOutlines
- def search(entry, level):
- entry = dict_value(entry)
- if 'Title' in entry:
- if 'A' in entry or 'Dest' in entry:
- title = decode_text(str_value(entry['Title']))
- dest = entry.get('Dest')
- action = entry.get('A')
- se = entry.get('SE')
- yield (level, title, dest, action, se)
- if 'First' in entry and 'Last' in entry:
- for x in search(entry['First'], level+1):
- yield x
- if 'Next' in entry:
- for x in search(entry['Next'], level):
- yield x
- return
- return search(self.catalog['Outlines'], 0)
-
- def lookup_name(self, cat, key):
- try:
- names = dict_value(self.catalog['Names'])
- except (PDFTypeError, KeyError):
- raise KeyError((cat,key))
- # may raise KeyError
- d0 = dict_value(names[cat])
- def lookup(d):
- if 'Limits' in d:
- (k1,k2) = list_value(d['Limits'])
- if key < k1 or k2 < key: return None
- if 'Names' in d:
- objs = list_value(d['Names'])
- names = dict(choplist(2, objs))
- return names[key]
- if 'Kids' in d:
- for c in list_value(d['Kids']):
- v = lookup(dict_value(c))
- if v: return v
- raise KeyError((cat,key))
- return lookup(d0)
-
- def get_dest(self, name):
- try:
- # PDF-1.2 or later
- obj = self.lookup_name('Dests', name)
- except KeyError:
- # PDF-1.1 or prior
- if 'Dests' not in self.catalog:
- raise PDFDestinationNotFound(name)
- d0 = dict_value(self.catalog['Dests'])
- if name not in d0:
- raise PDFDestinationNotFound(name)
- obj = d0[name]
- return obj
## PDFParser
@@ -704,72 +128,6 @@ def do_keyword(self, pos, token):
return
- def find_xref(self):
- """Internal function used to locate the first XRef."""
- # search the last xref table by scanning the file backwards.
- prev = None
- for line in self.revreadlines():
- line = line.strip()
- if 2 <= self.debug:
- print >>sys.stderr, 'find_xref: %r' % line
- if line == 'startxref': break
- if line:
- prev = line
- else:
- raise PDFNoValidXRef('Unexpected EOF')
- if 1 <= self.debug:
- print >>sys.stderr, 'xref found: pos=%r' % prev
- return long(prev)
-
- # read xref table
- def read_xref_from(self, start, xrefs):
- """Reads XRefs from the given location."""
- self.seek(start)
- self.reset()
- try:
- (pos, token) = self.nexttoken()
- except PSEOF:
- raise PDFNoValidXRef('Unexpected EOF')
- if 2 <= self.debug:
- print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
- if isinstance(token, int):
- # XRefStream: PDF-1.5
- self.seek(pos)
- self.reset()
- xref = PDFXRefStream()
- xref.load(self, debug=self.debug)
- else:
- if token is self.KEYWORD_XREF:
- self.nextline()
- xref = PDFXRef()
- xref.load(self, debug=self.debug)
- xrefs.append(xref)
- trailer = xref.get_trailer()
- if 1 <= self.debug:
- print >>sys.stderr, 'trailer: %r' % trailer
- if 'XRefStm' in trailer:
- pos = int_value(trailer['XRefStm'])
- self.read_xref_from(pos, xrefs)
- if 'Prev' in trailer:
- # find previous xref
- pos = int_value(trailer['Prev'])
- self.read_xref_from(pos, xrefs)
- return
-
- # read xref tables and trailers
- def read_xref(self, fallback=False):
- """Reads all the XRefs in the PDF file and returns them."""
- xrefs = []
- self.fallback = fallback
- if self.fallback:
- xref = PDFXRef()
- xref.load_fallback(self)
- xrefs.append(xref)
- else:
- pos = self.find_xref()
- self.read_xref_from(pos, xrefs)
- return xrefs
-
## PDFStreamParser
##
View
3 tools/dumppdf.py
@@ -8,7 +8,8 @@
#
import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral
-from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
View
3 tools/pdf2txt.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python2
import sys
-from pdfminer.pdfparser import PDFDocument, PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter

0 comments on commit 2221163

Please sign in to comment.