In [1]:
import logging
import os
import re
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, Union, cast
import unicodedata

import pdfminer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.utils import isnumber

In [2]:
# Set up logging with the library's default configuration and create the
# logger named after this module.
logging.basicConfig()
logger = logging.getLogger(__name__)

# Characters rewritten by escape(): control characters, a set of ASCII
# punctuation (& < > ( ) " ' \) and every code point from 0x7F through 0xFF.
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def escape(s: Union[str, bytes]) -> str:
    """Replace every character matched by ESC_PAT with ``&#<code point>;``.

    Args:
        s: Text to escape. ``bytes`` input is first decoded as Latin-1 so
            that each byte maps to exactly one character.

    Returns:
        The input text with each matching character replaced by its decimal
        numeric character reference; all other characters are left as-is.
    """
    text = str(s, "latin-1") if isinstance(s, bytes) else s

    def _to_char_ref(match):
        # Decimal numeric character reference for the single matched char.
        return "&#%d;" % ord(match.group(0))

    return ESC_PAT.sub(_to_char_ref, text)

def bintostr(b: Optional[Union[bytes, str]]) -> str:
    """Convert a raw PDF string value to plain ASCII text.

    The text is NFKD-normalized and every character that still has no ASCII
    representation is dropped, so accented letters are reduced to their base
    letter (e.g. "ñ" becomes "n").

    Args:
        b: The value to convert. ``None`` yields an empty string. ``bytes``
            starting with the UTF-16BE byte order mark (``\\xfe\\xff``) are
            decoded as UTF-16BE; any other ``bytes`` are decoded as Latin-1.
            A ``str`` is accepted as already-decoded text.

    Returns:
        The ASCII-only text.

    Raises:
        AttributeError: If ``b`` is neither ``None``, ``bytes``-like nor ``str``.
    """
    if b is None:
        return ""
    if isinstance(b, str):
        text = b
    elif b.startswith(b"\xfe\xff"):
        # PDF text strings that begin with this BOM are UTF-16BE. Decoding
        # them as Latin-1 would leave a stray "y" (from the BOM) and a NUL
        # character before every letter in the result.
        text = b[2:].decode("utf-16-be", "ignore")
    else:
        text = b.decode("latin-1")
    return unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")

In [3]:
# Maps each PDF form field name to the code that is prefixed to it in the
# output file. Output lines are sorted by "<code>_<field name>", so the code
# determines the position of the field in the file.
_FIELD_SORT_CODES = {
    # PAGE 1
    "D_NUM_EXPEDIENTE": "P1_01.01",
    "D_EXPEDIENTE_DM": "P1_01.02",
    "D_CIF": "P1_01.03",
    "D_COD_AAFF": "P1_01.04",
    "D_COD_GRUPO": "P1_01.05",
    "D_ACCION_FORMATIVA": "P1_01.06",
    "ID_MODALIDAD": "P1_01.07",
    "N_EDAD": "P1_02.01",
    "ID_SEXO": "P1_02.02",
    "ID_TITULACION": "P1_02.03",
    "D_OTRA_TITULACION": "P1_03.10.01",
    "Nivel de idiomas": "P1_03.10.02",
    "EspecificaotraTitulacion": "P1_03.10.03",
    "D_LUGAR_DE_TRABAJO": "P1_03.04",
    "ID_CATEGORIA_PROFESIONAL": "P1_03.05",
    "D_OTRA_CAT_PROF": "P1_05.06",
    "ID_HORARIO_CURSO": "P1_06.00",
    "ID_JORNADA": "P1_06.01",
    "ID_TAMANIO_PYME": "P1_07",
    # PAGE 2
    "ID_BUENA_ORGANIZACION": "P2_01.01",
    "ID_NUM_ALUMNOS_ADECUADO": "P2_01.02",
    "ID_CONTENIDOS_SEGUN_NECESIDADES": "P2_02.01",
    "ID_CONTENIDOS_SEGUN_TEORIA_PRACTICA": "P2_02.02",
    "ID_DURACION_SUFICIENTE": "P2_03.01",
    "ID_HORARIO_FAVORABLE": "P2_03.02",
    "ID_FORMADOR_FACILITADOR": "P2_04.01.01",
    "ID_FORMADOR_CONOCE_TEMA": "P2_04.01.02",
    # NOTE(review): "P2_2.04.01.01" does not follow the pattern of the other
    # codes and sorts after every other page-2 entry; the next code repeats
    # the one used for ID_FORMADOR_CONOCE_TEMA. Both look like typos, but
    # they define the output format, so they are kept as-is -- TODO confirm.
    "ID_TUTOR_FACILITADOR": "P2_2.04.01.01",
    "ID_TUTOR_CONOCE_TEMA": "P2_04.01.02",
    "ID_MEDIOS_COMPRENSIBLES": "P2_05.01",
    "ID_MEDIOS_ACTUALIZADOS": "P2_05.02",
    "ID_INSTALACIONES_ADECUADAS": "P2_06.01",
    "ID_MEDIOS_TECNICOS_ADECUADOS": "P2_06.02",
    # PAGE 3
    "ID_TELEFORMACION_GUIA": "P3_07.01",
    "ID_TELEFORMACION_MEDIOS": "P3_07.02",
    "ID_PRUEBAS_EVALUACION": "P3_08.01",
    "ID_ACREDITACION": "P3_08.02",
    "ID_INCORPORACION_MERCADO": "P3_09.01",
    "ID_CONOCIMIENTOS_PUESTO_TRABAJO": "P3_09.02",
    "Habilidades_trabajar_en_formado": "P3_09.03",
    "ID_PROGRESAR_CARRERA": "P3_09.04",
    "ID_DESARROLLO_PERSONAL": "P3_09.05",
    "ID_GRADO_SATISFACCION": "P3_10",
    "D_OBSERVACION": "P3_11",
    "F_CUMPLIMENTACION": "P3_12",
}


def saveTxt(
    retrievedData: dict,
    outfilename: str
) -> None:
    """Write the retrieved form fields to a text file, one ``key=value`` per line.

    Each output key is ``<sort code>_<field name>`` and the lines are written
    in ascending key order. Values are stripped of surrounding whitespace and
    every ``"\\r\\n"`` inside a value is replaced by ``"."`` so that a value
    containing that sequence stays on one line.

    Args:
        retrievedData: Mapping of form field name to its string value.
        outfilename: Path of the text file to create (overwritten if present).

    Raises:
        KeyError: If ``retrievedData`` contains a field name that has no sort
            code. The output file is not created or modified in that case.
    """
    # Resolve all keys before touching the file system, so an unmapped field
    # does not leave an empty or truncated output file behind.
    keyedValues = {
        _FIELD_SORT_CODES[fieldName] + "_" + fieldName: fieldValue
        for fieldName, fieldValue in retrievedData.items()
    }
    # The context manager closes the file even if writing a value fails.
    with open(outfilename, "w") as outfp:
        for key, value in sorted(keyedValues.items()):
            outfp.write("%s=%s\n" % (key, value.strip().replace("\r\n", ".")))

def processPdfAndSaveTxt(pdfname: str) -> None:
    """Extract the form data of one PDF and save it next to the PDF.

    Args:
        pdfname: Path of the PDF file to read. The result is written to the
            same path with ``".txt"`` appended.
    """
    # The data is extracted with processPdf and written with saveTxt.
    saveTxt(processPdf(pdfname), pdfname + ".txt")

In [4]:
def processPdf(
    fname: str
) -> dict:
    """Open a PDF file and collect the form field values found in its objects.

    Args:
        fname: Path of the PDF file to read.

    Returns:
        The dictionary filled in by ``searchInAllObjects`` for this document.
    """
    retrievedData: dict = {}
    # The context manager guarantees the file handle is released even when
    # parsing or the object scan raises.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # The scan must run while the file is still open.
        searchInAllObjects(retrievedData, doc)
    return retrievedData

In [5]:
def searchInAllObjects(
    retrievedData: dict,
    doc: PDFDocument
) -> None:
    """Pass every object of the document to ``searchInObject`` once.

    Object ids are taken from each entry of ``doc.xrefs`` in order; an id that
    appears more than once is processed only the first time.
    Objects that resolve to ``None`` are skipped, and a ``PDFObjectNotFound``
    raised for an id is printed and does not stop the scan.

    Args:
        retrievedData: Dictionary handed to ``searchInObject`` for each object.
        doc: The parsed PDF document to scan.
    """
    seenIds = set()
    allIds = (
        candidate
        for table in doc.xrefs
        for candidate in table.get_objids()
    )
    for currentId in allIds:
        if currentId in seenIds:
            continue
        seenIds.add(currentId)
        try:
            found = doc.getobj(currentId)
            if found is not None:
                searchInObject(retrievedData, found, objid=currentId)
        except PDFObjectNotFound as err:
            print("not found: %r" % err)

In [6]:
# Object types that are recognised but carry no form data.
_IGNORED_TYPES = ("Pages", "Page", "Font", "FontDescriptor", "Encoding", "Outlines", "Catalog")


def _lookupOption(options: object, selectedKey: str) -> str:
    """Return the text paired with ``selectedKey`` in a combo box option list.

    Only entries that are two-element lists are considered; the first element
    is compared with ``selectedKey`` and the second is the text returned. If
    several entries match, the last one wins. Returns ``""`` when ``options``
    is not a list or nothing matches.
    """
    selectedText = ""
    if not isinstance(options, list):
        return selectedText
    for pair in options:
        if not (isinstance(pair, list) and len(pair) == 2):
            continue
        try:
            pairKey = bintostr(pair[0])
            pairText = bintostr(pair[1])
        except Exception:
            # Skip an entry that cannot be decoded instead of comparing
            # against values left over from a previous entry.
            logger.warning("Error decoding option entry %r", pair)
            continue
        if pairKey == selectedKey:
            selectedText = pairText
    return selectedText


def searchInObject(
    retrievedData: dict,
    obj: object,
    objid: Any = None
) -> None:
    """Store the value of a form field object in ``retrievedData``.

    Only dictionaries with a ``Type`` entry are examined:

    * ``Annot`` objects that have both ``T`` and ``V`` are stored under the
      decoded ``T`` value. When the object also has ``Opt`` (combo box), the
      stored value is the option text paired with ``V`` (``""`` if there is no
      match); otherwise (text field) it is the decoded ``V`` itself.
    * The types listed in ``_IGNORED_TYPES`` are skipped silently.
    * Any other type is reported on standard output.

    Args:
        retrievedData: Dictionary updated in place with ``field name -> value``.
        obj: The PDF object to inspect.
        objid: Identifier of the object; not used by this function.
    """
    if not (isinstance(obj, dict) and "Type" in obj):
        return
    objType = obj["Type"]
    if isinstance(objType, PSLiteral):
        objType = objType.name

    if objType == "Annot":
        if "T" in obj and "V" in obj:
            valueT = bintostr(obj["T"])
            valueV = bintostr(obj["V"])
            if "Opt" in obj:  # Combo box
                retrievedData[valueT] = _lookupOption(obj["Opt"], valueV)
            else:  # Text field
                retrievedData[valueT] = valueV
    elif objType not in _IGNORED_TYPES:  # Unknown type
        print("<unknown type=\"%s\"/>\n" % objType)
    
if __name__ == "__main__":
    # Convert every file in the current directory whose name ends in ".pdf";
    # the directory is listed once, before any file is processed.
    pdfPaths = [
        os.path.join(".", entry)
        for entry in os.listdir(".")
        if entry.endswith(".pdf")
    ]
    for pdfPath in pdfPaths:
        print("Processing %s" % pdfPath)
        processPdfAndSaveTxt(pdfPath)
    print("Done")


Processing .\Test.pdf
Done
