You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
{{ message }}
This repository has been archived by the owner on Apr 15, 2024. It is now read-only.
Hi.
I get an error when processing a page in some PDF files.
Code:
# Extract the layout of page number `page_num` from the PDF at `filename`
# and leave the result in `layout_dict` (None when the page is not found or
# processing fails).
# NOTE(review): this is a fragment of a larger function; `filename`,
# `page_num`, `overlap_pct`, `include_textline`, `include_char` and
# `correct_images` must be provided by the enclosing scope.
# NOTE(review): nesting was reconstructed from a paste that lost its
# indentation -- confirm the placement of `pagebox` and `break`.

# The with-block closes the file on every exit path, including the
# PDFTextExtractionNotAllowed raise below.
with open(filename, 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for layout analysis.
    laparams = LAParams()
    laparams.all_texts = True
    # Fixed typo: the attribute was spelled `dectect_vertical`, which only
    # created a new, unused attribute on the LAParams instance.
    laparams.detect_vertical = True
    laparams.word_margin = 0.06
    # Modified for PERAN.
    laparams.line_margin = 0.5
    # Create a PDF page aggregator object (used instead of a plain PDFDevice).
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    ipage = 0
    layout_dict = None
    try:
        # Walk the pages until the requested (1-based) page number is reached.
        for page in PDFPage.create_pages(document):
            ipage += 1
            if page_num == ipage:
                offset = (0, 0)
                # Compute the cropbox offset (the coordinates we store are
                # relative to the cropbox).
                if page.mediabox != page.cropbox:
                    offset = (page.mediabox[0] - page.cropbox[0],
                              page.cropbox[3] - page.mediabox[3])
                # Page box expressed with the cropbox origin moved to (0, 0).
                pagebox = [0, 0,
                           page.cropbox[2] - page.cropbox[0],
                           page.cropbox[3] - page.cropbox[1]]
                interpreter.process_page(page)
                layout = device.get_result()
                layout_dict = extract_layout_dict(layout, overlap_pct,
                                                  include_textline,
                                                  include_char, offset,
                                                  pagebox)
                # analyze_overlap(layout_dict, overlap_pct=70)
                if correct_images:
                    correct_blocimages(layout_dict)
                # Only one page is requested, so stop iterating.
                break
    except Exception as e:
        # Best-effort: report the failure and fall through with whatever
        # `layout_dict` holds. The parenthesized form prints the same text
        # under both Python 2 and Python 3.
        print("Error PDFMiner: %s" % format(e))
[CAP191211012.PDF](https://github.com/euske/pdfminer/files/3961030/CAP191211012.PDF)
Output:
File "/var/www/worker_ocr/layout_analyzer.py", line 1265, in analyzer
interpreter.process_page(page)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 833, in process_page
self.device.end_page(page)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/converter.py", line 35, in end_page
self.cur_item.analyze(self.laparams)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 646, in analyze
obj.analyze(laparams)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 686, in analyze
LTLayoutContainer.analyze(self, laparams)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 653, in analyze
textboxes = list(self.group_textlines(laparams, textlines))
File "/usr/local/lib/python2.7/dist-packages/pdfminer/layout.py", line 551, in group_textlines
plane.extend(lines)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 283, in extend
self.add(obj)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 288, in add
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 275, in _getrange
for y in drange(y0, y1, self.gridsize):
File "/usr/local/lib/python2.7/dist-packages/pdfminer/utils.py", line 121, in drange
assert v0 < v1
AssertionError
The text was updated successfully, but these errors were encountered:
Sign up for free to subscribe to this conversation on GitHub.
Already have an account?
Sign in.
Hi.
I get an error when processing a page in some PDF files.
Code:
Output:
The text was updated successfully, but these errors were encountered: