In [1]:
import logging
import os.path
import re
import sys
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, Union, cast
from argparse import ArgumentParser

import pdfminer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.utils import isnumber

In [2]:
# Configure the root logger with the logging module's defaults and create
# this module's logger (named after the module via __name__).
logging.basicConfig()
logger = logging.getLogger(__name__)

# Characters replaced by numeric character references: control characters
# (\000-\037), & < > ( ) " ' backslash, and everything from \177 to \377.
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def escape(s: Union[str, bytes]) -> str:
    """Return ``s`` with every character matched by ``ESC_PAT`` written as ``&#N;``.

    Args:
        s: Text to escape. ``bytes`` input is decoded as Latin-1 first, so each
            byte maps to the code point of the same value.

    Returns:
        The escaped text, where ``N`` in ``&#N;`` is the decimal code point of
        the replaced character.
    """
    text = s.decode("latin-1") if isinstance(s, bytes) else s

    def _as_char_ref(match):
        # The pattern is a single character class, so group(0) is one character.
        return "&#%d;" % ord(match.group(0))

    return ESC_PAT.sub(_as_char_ref, text)

In [3]:
def dumpallobjs(
    out: TextIO,
    doc: PDFDocument,
    show_fallback_xref: bool = False,
) -> None:
    """Write every object reachable through ``doc.xrefs`` between ``<pdf>`` tags.

    Each object id is processed at most once, even if several xref tables
    list it. Ids that resolve to ``None`` are skipped; the rest are passed to
    ``dumpobj``.

    Args:
        out: Text stream receiving the output.
        doc: Document whose ``xrefs`` are walked and whose ``getobj`` resolves ids.
        show_fallback_xref: Accepted but not used by this function.
    """
    seen_ids = set()
    out.write("<pdf>\n")
    for table in doc.xrefs:
        for objid in table.get_objids():
            if objid not in seen_ids:
                seen_ids.add(objid)
                try:
                    resolved = doc.getobj(objid)
                    if resolved is not None:
                        dumpobj(out, resolved, objid=objid)
                except PDFObjectNotFound as err:
                    # A missing object is reported on stdout and the walk goes on.
                    print("not found: %r" % err)
    out.write("</pdf>\n")

In [4]:
def dumppdf(
    outfp: TextIO,
    fname: str) -> None:
    """Parse the PDF at ``fname`` and write its object dump to ``outfp``.

    Args:
        outfp: Text stream receiving the dump; a trailing newline is written
            after the dump. The stream is not closed here.
        fname: Path of the PDF file, opened in binary mode.
    """
    # The context manager closes the input file even when parsing or dumping
    # raises, which the manual open()/close() pair did not guarantee.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        dumpallobjs(outfp, doc)
    outfp.write("\n")

In [5]:
import unicodedata
def bintostr(b: Optional[Union[bytes, str]]) -> str:
    """Convert a PDF string value to plain ASCII text.

    Args:
        b: Raw string bytes, an already-decoded ``str``, or ``None``.

    Returns:
        ``""`` for ``None``; otherwise the text with compatibility
        decomposition (NFKD) applied and every non-ASCII character dropped,
        so accented letters keep their base letter (``é`` -> ``e``).
    """
    if b is None:
        return ""
    if isinstance(b, str):
        text = b
    elif b.startswith(b"\xfe\xff"):
        # A leading FE FF byte-order mark denotes a UTF-16BE text string;
        # decoding it as Latin-1 would leave NULs and BOM residue in the output.
        text = b[2:].decode("utf-16-be", "ignore")
    else:
        text = b.decode("latin-1")
    return unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode("ASCII")

def _fieldtext(value: object) -> str:
    """Convert a form-field value to text without raising on non-bytes values."""
    # Dictionary values may still be indirect references; resolve them first.
    value = resolve1(value)
    if isinstance(value, PSLiteral):
        # Name objects (for instance a check box state) carry their text in .name.
        value = value.name
    if value is None or isinstance(value, bytes):
        return bintostr(value)
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        return ", ".join(_fieldtext(item) for item in value)
    return str(value)


def _selectedoption(out: TextIO, options: object, selected: str) -> str:
    """Return the display text of the option matching ``selected``, or ``""``.

    Options are either ``[export value, display text]`` pairs or plain strings.
    An option that cannot be decoded is reported on ``out`` and skipped, so it
    can never be compared using leftovers from a previous iteration.
    """
    chosen = ""
    options = resolve1(options)
    if not isinstance(options, list):
        return chosen
    for option in options:
        try:
            option = resolve1(option)
        except Exception:
            continue
        if isinstance(option, list) and len(option) == 2:
            try:
                key = _fieldtext(option[0])
            except Exception:
                out.write('!! Error decoding nestedlist key %s \n' % (option[0],))
                continue
            try:
                text = _fieldtext(option[1])
            except Exception:
                out.write('!! Error decoding nestedlist value %s \n' % (option[1],))
                continue
            if key == selected:
                chosen = text
        elif isinstance(option, (bytes, str)):
            # A plain-string option is both the export value and the display text.
            text = _fieldtext(option)
            if text == selected:
                chosen = text
    return chosen


def dumpobj(out: TextIO, obj: object, objid: Optional[int] = None) -> None:
    """Write a one-line summary of ``obj`` to ``out`` when it is a typed dictionary.

    * ``Annot`` dictionaries that have both ``T`` and ``V`` produce a
      ``<field name=... value=.../>`` line. When ``Opt`` is present the value
      is the display text of the option matching ``V`` (empty when none matches).
    * ``Pages``, ``Page``, ``Font``, ``FontDescriptor``, ``Encoding``,
      ``Outlines`` and ``Catalog`` are silently ignored.
    * Any other type produces an ``<unknown type=.../>`` line.

    Objects that are not dictionaries, or have no ``Type`` key, produce no output.

    Args:
        out: Text stream receiving the output.
        obj: Resolved PDF object.
        objid: Object id; only needed by the disabled fallback dump below.
    """
    # Only consumed by the disabled fallback dump at the end of the function.
    understood = False
    if isinstance(obj, dict) and 'Type' in obj:
        objtype = obj["Type"]
        if isinstance(objtype, PSLiteral):
            objtype = objtype.name

        if objtype == "Annot":
            if 'T' in obj and 'V' in obj:
                fieldname = _fieldtext(obj["T"])
                fieldvalue = _fieldtext(obj["V"])
                if 'Opt' in obj:
                    # Choice field: report the display text of the selected option.
                    value = _selectedoption(out, obj["Opt"], fieldvalue)
                else:
                    # Plain text field: report the stored value itself.
                    value = fieldvalue
                understood = True
                out.write("\t<field name=\"%s\" value=\"%s\"/>\n" % (fieldname, value))
        elif objtype in ["Pages", "Page", "Font", "FontDescriptor", "Encoding", "Outlines", "Catalog"]:
            understood = True
            #out.write("<ignored type=\"%s\"/>\n" % type)
        else:
            # Unknown type.
            understood = True
            out.write("<unknown type=\"%s\"/>\n" % objtype)

    # Disabled fallback: full XML dump of objects that were not understood.
    #if not understood:
    #    out.write('<object id="%d">\n' % objid)
    #    dumpxml(out, obj, objid=objid)
    #    out.write("\n</object>\n\n")

def dumpxml(out: TextIO, obj: object, objid: str = None) -> None:
    """Recursively serialise ``obj`` to ``out`` as XML-like markup.

    Handles ``None``, dicts, lists, ``str``/``bytes``, ``PDFStream``,
    ``PDFObjRef``, ``PSKeyword``, ``PSLiteral`` and numbers, checked in that
    order. Dict and list elements are serialised by recursive calls.

    Args:
        out: Text stream receiving the markup.
        obj: Object to serialise.
        objid: Accepted but not used by this function.

    Raises:
        TypeError: If ``obj`` is none of the supported kinds.
    """
    emit = out.write

    if obj is None:
        emit("<null />")
    elif isinstance(obj, dict):
        emit('<dict size="%d">\n' % len(obj))
        for key, value in obj.items():
            emit("<entry key=\"%s\">" % key)
            dumpxml(out, value)
            emit("</entry>\n")
        emit("</dict>")
    elif isinstance(obj, list):
        emit('<list size="%d">\n' % len(obj))
        for element in obj:
            dumpxml(out, element)
            emit("\n")
        emit("</list>")
    elif isinstance(obj, (str, bytes)):
        emit('<string size="%d">%s</string>' % (len(obj), escape(obj)))
    elif isinstance(obj, PDFStream):
        # Stream attributes first, then the data returned by get_data(), escaped.
        emit("<stream>\n<props>\n")
        dumpxml(out, obj.attrs)
        emit("\n</props>\n")
        payload = obj.get_data()
        emit('<data size="%d">%s</data>\n' % (len(payload), escape(payload)))
        emit("</stream>")
    elif isinstance(obj, PDFObjRef):
        emit('<ref id="%d" />' % obj.objid)
    elif isinstance(obj, PSKeyword):
        # NOTE(review): obj.name is likely bytes here, which would be written
        # as its repr rather than as text - confirm.
        emit("<keyword>%s</keyword>" % obj.name)  # type: ignore [str-bytes-safe]
    elif isinstance(obj, PSLiteral):
        # NOTE(review): obj.name may be bytes rather than str - confirm.
        emit("<literal>%s</literal>" % obj.name)  # type: ignore [str-bytes-safe]
    elif isnumber(obj):
        emit("<number>%s</number>" % obj)
    else:
        raise TypeError(obj)

def main(pdfname: str) -> None:
    """Dump the form fields of ``pdfname`` into ``<pdfname>.txt``.

    Args:
        pdfname: Path of the PDF to read. The output path is this name with
            ``.txt`` appended; the file is overwritten if it exists.
    """
    outfilename = pdfname + ".txt"
    # The context manager flushes and closes the output even if dumppdf raises.
    with open(outfilename, "w") as outfp:
        dumppdf(
            outfp,
            pdfname,
        )
    print("Generated file: " + outfilename)
    
# Entry point: dump the hard-coded sample file when run as the main module.
if __name__ == "__main__":
    main("Test.pdf")


Generated file: Test.pdf.txt
