-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfmaze.py
192 lines (161 loc) · 6.64 KB
/
pdfmaze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import io
import itertools
from numbers import Real
from os import PathLike
from typing import Iterable, List, Optional, Union, Iterable
import pdfminer
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTText, LTAnno, LTChar, LTTextLine, LTTextBoxHorizontal, LTFigure
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from wordmaze.wordmaze import Origin
from wordmaze.wordmaze import Page as WMPage
from wordmaze.wordmaze import Shape, TextBox, WordMaze
class PDFMaze:
    """Extract positioned text from a PDF and collect it into a ``WordMaze``.

    Layout analysis is delegated to pdfminer; every piece of text found is
    wrapped in a ``TextBox`` carrying its bounding box, its text and an
    optional caller-supplied confidence value.
    """

    def _parse_objs(
        self,
        lt_objs,
        confidence: Optional[Real] = None,
        split_words: bool = False,
        key_split_chars: Iterable[str] = (' ',)
    ) -> Iterable[TextBox]:
        """Lazily parse several layout objects into one stream of text boxes.

        Args:
            lt_objs: Iterable of pdfminer layout objects.
            confidence: Value stored on every produced ``TextBox``.
            split_words: Forwarded to ``_parse_obj`` (one box per word
                instead of one box per text line).
            key_split_chars: Characters that terminate a word when
                ``split_words`` is true.

        Returns:
            An iterable chaining the boxes produced for each object, in order.
        """
        # Materialize once, up front: a one-shot iterable would otherwise be
        # exhausted after the first object has been parsed.
        split_chars = tuple(key_split_chars)
        return itertools.chain.from_iterable(
            self._parse_obj(
                obj,
                confidence=confidence,
                split_words=split_words,
                key_split_chars=split_chars
            )
            for obj in lt_objs
        )

    def _parse_obj(
        self,
        obj,
        confidence: Optional[Real] = None,
        split_words: bool = False,
        key_split_chars: Iterable[str] = (' ',)
    ) -> List[TextBox]:
        """Parse a single layout object into a list of text boxes.

        Without ``split_words``, a text line becomes one box (text stripped
        of surrounding whitespace) and horizontal text boxes and figures are
        searched recursively. With ``split_words``, the characters of the
        object are grouped into one box per word.

        Args:
            obj: A pdfminer layout object.
            confidence: Value stored on every produced ``TextBox``.
            split_words: Whether to emit one box per word.
            key_split_chars: Characters that terminate a word when
                ``split_words`` is true.

        Returns:
            The text boxes found in ``obj`` (possibly empty).
        """
        if split_words:
            return self._split_into_words(
                obj, confidence, tuple(key_split_chars)
            )

        textboxes = []
        if isinstance(obj, LTTextLine):
            x1, y1, x2, y2 = obj.bbox
            textboxes.append(
                TextBox(
                    x1=x1,
                    x2=x2,
                    y1=y1,
                    y2=y2,
                    text=obj.get_text().strip(),
                    confidence=confidence
                )
            )
        # If it's a textbox or a container, also recurse into its children.
        if isinstance(obj, (LTTextBoxHorizontal, LTFigure)):
            textboxes.extend(
                self._parse_objs(obj._objs, confidence=confidence)
            )
        return textboxes

    def _split_into_words(self, obj, confidence, split_chars) -> List[TextBox]:
        """Group the characters of ``obj`` into one text box per word.

        A word ends at an ``LTAnno`` (layout-inserted break) or at a
        character whose text is in ``split_chars``; a word still pending when
        the characters run out is emitted as well.
        """
        if isinstance(obj, LTTextLine):
            # A bare line (not wrapped in a text box) is its own single line.
            lines = (obj,)
        elif isinstance(obj, LTText) and not isinstance(obj, (LTChar, LTAnno)):
            # Text container: iterating it yields its lines.
            lines = obj
        else:
            # Nothing to iterate over (non-text object or a lone character).
            return []

        words = []
        first = None  # first LTChar of the pending word; None when no word is open
        last = None   # most recent LTChar appended to a word
        pieces = []   # texts of the characters of the pending word
        for line in lines:
            for item in line:
                is_break = isinstance(item, LTAnno)
                if is_break or item.get_text() in split_chars:
                    if first is not None:
                        # LTAnno carries no bbox, so a layout break ends the
                        # word at the previous character; an explicit split
                        # character ends it at that character's own far edge.
                        end = last if is_break else item
                        words.append(
                            self._word_box(
                                first, end, ''.join(pieces), confidence
                            )
                        )
                        first, pieces = None, []
                elif isinstance(item, LTChar):
                    if first is None:
                        first = item
                    pieces.append(item.get_text())
                    last = item
        # The text did not end on a break or split character: flush the
        # pending word.
        if first is not None:
            words.append(
                self._word_box(first, last, ''.join(pieces), confidence)
            )
        return words

    @staticmethod
    def _word_box(first, end, text, confidence) -> TextBox:
        """Build a ``TextBox`` from ``first``'s lower-left and ``end``'s upper-right corner."""
        return TextBox(
            x1=first.bbox[0],
            x2=end.bbox[2],
            y1=first.bbox[1],
            y2=end.bbox[3],
            text=text,
            confidence=confidence
        )

    def parse_pdf(
        self,
        source: Union[str, PathLike, bytes],
        origin: Origin = Origin.TOP_LEFT,
        confidence: Optional[Real] = None,
        split_words: bool = False,
        key_split_chars: Iterable[str] = (' ',)
    ) -> WordMaze:
        """Parse a PDF into a ``WordMaze`` holding one page per PDF page.

        Args:
            source: Path of the PDF file, or the PDF content itself as
                ``bytes`` (``bytearray`` and ``memoryview`` are accepted too).
            origin: Coordinate origin every resulting page is rebased to.
            confidence: Value stored on every produced ``TextBox``.
            split_words: Emit one box per word instead of one per text line.
            key_split_chars: Characters that terminate a word when
                ``split_words`` is true.

        Returns:
            The populated ``WordMaze``.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids text
                extraction.
        """
        # Materialize once so a one-shot iterable still applies to every page.
        split_chars = tuple(key_split_chars)

        if isinstance(source, (bytes, bytearray, memoryview)):
            # Use the PDF content held in memory.
            stream = io.BytesIO(source)
        else:
            # Open a PDF file.
            stream = open(source, 'rb')

        # The context manager closes the stream on every exit path,
        # including the extraction-not-allowed error below.
        with stream as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            # Create a PDF document object that stores the document structure.
            document = PDFDocument(parser)
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()
            # Create a page aggregator performing layout analysis.
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            wm = WordMaze()
            for page in PDFPage.get_pages(fp):
                # Read the page into a layout object.
                interpreter.process_page(page)
                layout = device.get_result()
                # Extract the text boxes now, while the stream is still open.
                textboxes = list(
                    self._parse_objs(
                        layout._objs,
                        confidence=confidence,
                        split_words=split_words,
                        key_split_chars=split_chars
                    )
                )
                wm_page = WMPage(
                    shape=self.page_shape(page),
                    origin=Origin.BOTTOM_LEFT,  # default origin from pdfminer
                    entries=textboxes
                ).rebase(origin=origin)
                wm.append(wm_page)
        return wm

    @staticmethod
    def page_shape(page: PDFPage) -> Shape:
        """Return the width and height of ``page`` taken from its media box.

        The media box is ``(x0, y0, x1, y1)``; the size is the extent between
        the two corners, so a box whose lower-left corner is not at the
        origin still yields the true page size.
        """
        # NOTE(review): page rotation is not taken into account here —
        # confirm whether rotated pages need width and height swapped.
        x0, y0, x1, y1 = page.mediabox
        return Shape(width=abs(x1 - x0), height=abs(y1 - y0))