Skip to content
Browse files

API change: process_pdf -> PDFPage.get_pages

  • Loading branch information...
1 parent 8a70a9f commit d3730a29ecf238f6ca2c8f72c53755db69b75654 @euske committed Oct 22, 2013
Showing 3 changed files with 33 additions and 31 deletions.
  1. +0 −27 pdfminer/pdfinterp.py
  2. +26 −1 pdfminer/pdfpage.py
  3. +7 −3 tools/pdf2txt.py
View
27 pdfminer/pdfinterp.py
@@ -24,7 +24,6 @@
from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfcolor import LITERAL_DEVICE_CMYK
-from pdfpage import PDFPage
from utils import choplist
from utils import mult_matrix, MATRIX_IDENTITY
@@ -804,29 +803,3 @@ def execute(self, streams):
else:
self.push(obj)
return
-
-
-## process_pdf
-##
-class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
-
-def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
- caching=True, check_extractable=True):
- # Create a PDF parser object associated with the file object.
- parser = PDFParser(fp)
- # Create a PDF document object that stores the document structure.
- doc = PDFDocument(parser, caching=caching)
- # Supply the document password for initialization.
- # (If no password is set, give an empty string.)
- doc.initialize(password)
- # Check if the document allows text extraction. If not, abort.
- if check_extractable and not doc.is_extractable:
- raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
- # Create a PDF interpreter object.
- interpreter = PDFPageInterpreter(rsrcmgr, device)
- # Process each page contained in the document.
- for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
- if pagenos and (pageno not in pagenos): continue
- interpreter.process_page(page)
- if maxpages and maxpages <= pageno+1: break
- return
View
27 pdfminer/pdfpage.py
@@ -5,7 +5,9 @@
from pdftypes import resolve1
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
-
+from pdfparser import PDFParser
+from pdfdocument import PDFDocument
+from pdfdocument import PDFEncryptionError
# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
@@ -107,3 +109,26 @@ def search(obj, parent):
except PDFObjectNotFound:
pass
return
+
+ class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
+
+ @classmethod
+ def get_pages(klass, fp,
+ pagenos=None, maxpages=0, password='',
+ caching=True, check_extractable=True):
+ # Create a PDF parser object associated with the file object.
+ parser = PDFParser(fp)
+ # Create a PDF document object that stores the document structure.
+ doc = PDFDocument(parser, caching=caching)
+ # Supply the document password for initialization.
+ # (If no password is set, give an empty string.)
+ doc.initialize(password)
+ # Check if the document allows text extraction. If not, abort.
+ if check_extractable and not doc.is_extractable:
+ raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
+ # Process each page contained in the document.
+ for (pageno,page) in enumerate(klass.create_pages(doc)):
+ if pagenos and (pageno not in pagenos): continue
+ yield page
+ if maxpages and maxpages <= pageno+1: break
+ return
View
10 tools/pdf2txt.py
@@ -2,8 +2,9 @@
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
+from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
@@ -96,8 +97,11 @@ def usage():
return usage()
for fname in args:
fp = file(fname, 'rb')
- process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
- caching=caching, check_extractable=True)
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
+ for page in PDFPage.get_pages(fp, pagenos,
+ maxpages=maxpages, password=password,
+ caching=caching, check_extractable=True):
+ interpreter.process_page(page)
fp.close()
device.close()
outfp.close()

0 comments on commit d3730a2

Please sign in to comment.
Something went wrong with that request. Please try again.