In [1]:
# pip install pdfminer3k

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.psparser import PSLiteral
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdftypes import PDFObjRef
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.converter import PDFPageAggregator

from collections import defaultdict, namedtuple

In [2]:
# Module-level accumulators filled in by the page-processing cell.
# `fields` collects one dict per annotation; `text` maps a page number to the
# list of text blocks extracted from that page.
fields = list()
text = dict()

# One extracted text box: its x/y position plus the stripped text.
TextBlock = namedtuple("TextBlock", "x y text")

In [3]:
# Create a PDF parser object associated with the file object.
# The file handle is closed as soon as the `with` block exits, while `parser`
# keeps being used by later cells.
# NOTE(review): this relies on the parser holding the file content itself
# (it exposes it as `parser.data`) - confirm for the installed pdfminer version.
with open('1-scala-getting-started-slides-annotated.pdf','rb') as pdf_obj:
    parser = PDFParser(pdf_obj)
print(type(parser))
# Preview only the head of the raw content; echoing all of `parser.data`
# writes the entire PDF into the cell output.
parser.data[:200]

<class 'pdfminer.pdfparser.PDFParser'>


'%PDF-1.6\r%âãÏÓ\r\n486 0 obj\r<</Linearized 1/L 124830/O 488/E 52615/N 9/T 124307/H [ 503 407]>>\rendobj\r              \r\n505 0 obj\r<</DecodeParms<</Columns 4/Predictor 12>>/Filter/FlateDecode/ID[<875691CE9C880F4C88DD99F35915E9DF><2705B426EDB6974E9A252D2349863BD9>]/Index[486 36]/Info 485 0 R/Length 93/Prev 124308/Root 487 0 R/Size 522/Type/XRef/W[1 2 1]>>stream\r\nhÞbbd\x10``b`®\x03\x12\x8c\x8c@\x82©\x11D|\x01\x12l, \x89\x0c\x10W\x16Dì\x02)Q\x06\x12\\Q@\x82\x9b\x1bH\x08\x1a\x03\t\x06S Ár\x1eH¤\'\x83XU uOA\x84\x1c\x90\x08îc`bd,\x03©c`¤\x9cøÏÀû\t À\x00P|\x0bB\r\nendstream\rendobj\rstartxref\r\n0\r\n%%EOF\r\n         \r\n521 0 obj\r<</C 337/Filter/FlateDecode/I 359/Length 306/O 298/S 207/V 314>>stream\r\nhÞb```f``úÁÀÂÀÀ"Á À\x80\x00\x02@16 æxÄÀ\xa0ÀpäAp\tK#óM¦$\x8exµf\x06\x06F£Î\x97½w\x0f@\x94rD\x08»*¶+(&\x04\x06Jf$qF\x961\x89¸Ý\x91(âäð\x10\x91I\nãbu\rpw\x8bm\x12SÑ\x10\x15ud\x93-ã\x88\x98° $\x94E¢ô\x86b\x92Sc`HåÇé\t\xad\r\x8c\x16\x1d\x1d\x0c\x1c\x1d\x1d\r\x0c\x0c\x12\x1d@J\x10H°\x84\x0

In [4]:
# Create a PDF document object that stores the document structure.
# It is created empty here; the following cells link it to `parser` and
# initialize it.
doc = PDFDocument()

In [5]:
# Connect the parser and document objects.
# The link is set explicitly in both directions: the parser is told about the
# document, and the document is told about the parser.
parser.set_document(doc)
doc.set_parser(parser)



In [6]:
# Supply the password for initialization.
# (If no password is set, give an empty string.)
# An empty string is passed here, i.e. the PDF is opened without a password.
doc.initialize('')

In [7]:
# Check if the document allows text extraction. If not, abort.
# Raising here stops the notebook run before any page is processed.
if not doc.is_extractable:
    raise PDFTextExtractionNotAllowed

In [8]:
# Create a PDF resource manager object that stores shared resources.
# The same instance is handed to both the page aggregator and the interpreter
# created in the following cells.
rsrcmgr = PDFResourceManager()

In [9]:
# Set parameters for analysis.
# LAParams is constructed without arguments, so the library's default values
# are used; it is passed to the page aggregator in the next cell.
laparams = LAParams()

In [10]:
# Create a PDF page aggregator object.
# It uses the shared resource manager and the analysis parameters from above.
# After a page has been processed, `_get_text` reads that page's layout from
# it via `device.get_result()`.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

In [11]:
# Create a PDF interpreter object.
# It is bound to the shared resource manager and to `device`; the processing
# loop calls `interpreter.process_page(page)` once per page.
interpreter = PDFPageInterpreter(rsrcmgr, device)

In [12]:
def _build_annotations(page):
    for annot in page.annots.resolve():
        print(annot)
        if isinstance( annot, PDFObjRef ):
            annot= annot.resolve()
            print(annot)
            assert annot['Type'].name == "Annot", repr(annot)
            if annot['Subtype'].name == "Widget":
                if annot['FT'].name == "Btn":
                    assert annot['T'] not in fields
                    fields[ annot['T'] ] = annot['V'].name
                elif annot['FT'].name == "Tx":
                    assert annot['T'] not in fields
                    fields[ annot['T'] ] = annot['V']
                elif annot['FT'].name == "Ch":
                    assert annot['T'] not in fields
                    fields[ annot['T'] ] = annot['V']
                    # Alternative choices in annot['Opt'] )
                else:
                    raise Exception( "Unknown Widget" )
        else:
            raise Exception( "Unknown Annotation" )

In [13]:
def _build_annotations(pagenum, page):
    for annot in page.annots.resolve():
        if isinstance( annot, PDFObjRef ):
            annot= annot.resolve()
            #for idx, a in enumerate(annot):
                #print('pagenum: {}, idx: {}, annotation: {}-{}'.format(pagenum, idx, a, annot[a]))
                #if annot['Subtype'] == '/Popup':
                    #print(annot['Subtype'])
                #else:
            fields.append({'pagenum':pagenum, 'subtype': str(annot['Subtype']).replace('/','')})
        else:
            raise Exception( "Unknown Annotation" )

In [14]:
def _get_text(device):
    text= []
    layout = device.get_result()
    for obj in layout:
        if isinstance( obj, LTTextBoxHorizontal ):
            if obj.get_text().strip():
                text.append( TextBlock(obj.x0, obj.y1, obj.get_text().strip()) )
    text.sort(key=lambda row: (-row.y, row.x) )
    return text

In [15]:
def is_recognized(revision="Rev 2011.01.17"):
    """Check that pages 1 and 2 carry the expected revision string.

    Only the revision text is compared; no copyright text is checked.
    The lookup uses the last three text blocks of each page as stored in the
    module-level ``text`` dict: on page 1 the third of them, on page 2 the
    first of them.

    Args:
        revision: exact text the revision block must equal. Defaults to the
            revision this notebook was written for.

    Returns:
        True when both pages carry ``revision`` at the expected position,
        False otherwise - including when a page is missing from ``text`` or
        has too few text blocks to hold the expected entry.
    """
    try:
        bottom_page_1 = text[1][-3:]
        bottom_page_2 = text[2][-3:]
        pg1_rev = revision == bottom_page_1[2].text
        pg2_rev = revision == bottom_page_2[0].text
    except (KeyError, IndexError):
        # A document without these pages/blocks is simply not the recognized
        # form; report that instead of failing.
        return False
    return pg1_rev and pg2_rev

In [16]:
# Process each page contained in the document.
fields.clear()
for pgnum, page in enumerate( doc.get_pages() ):
    interpreter.process_page(page)
    if page.annots:
        _build_annotations(pgnum, page )
    txt= _get_text( device )
    text[pgnum+1]= txt

In [22]:
# List every recorded annotation except pop-ups, one line each.
# `fields` stores 0-based page indices, so 1 is added for display.
for i in fields:
    if i['subtype'] == 'Popup':
        continue
    pagenum = i['pagenum'] + 1
    subtype = i['subtype']
    print('pagenum: {}, annotation type: {}'.format(pagenum,subtype))

pagenum: 1, annotation type: Text
pagenum: 2, annotation type: Highlight
pagenum: 3, annotation type: FreeText
pagenum: 3, annotation type: Circle
pagenum: 4, annotation type: StrikeOut
pagenum: 4, annotation type: Polygon
pagenum: 6, annotation type: Ink
pagenum: 9, annotation type: Line
pagenum: 9, annotation type: Underline
