IMPORTS AND INSTALL

In [1]:
# imports
from PyPDF2 import PdfReader
from PyPDF2 import PageObject
import pandas as pd

# local do arquivo
file_path = 'RHPP05LA.PDF'

FUNÇÕES

In [2]:
def get_normalized_cobol_texts():
    textos_normalizados_cobol = {
        "1/12 FERIAS"           : "ferias", 
        "1/12 ferias"           : "ferias", 
        "1/12 DE 1 /3 FERIAS"   : "13_ferias", 
        "1 /3 ferias"           : "13_ferias", 
        "1 /3 FERIAS"           : "13_ferias", 
        "1/12 DE 13_ferias"     : "13_ferias", 
        "TOTAL ferias"          : "total_ferias", 
        "TOTAL FERIAS"          : "total_ferias", 
        "1/12 13 SAL ARIO"      : "13_salario", 
        "13 SAL ARIO"           : "13_salario", 
        "T O T A L"             : "total", 
        "PROVISIONAMENTO"       : "apropriacao", 
        "INSS  20,00%"          : "inss", 
        "SP-PREVCOM"            : "spprevcom", 
        "AC TRAB  "             : "ac_trab-", 
        "FGTS 8%"               : "fgts", 
        "TOTAL DE ENCARGOS"     : "total_encargos", 
        "."                     : "", 
        ","                     : ".", 
        "SPPREV"                : "spprev", 
        "FERIAS"                : "ferias"
    }
    return textos_normalizados_cobol

def split_tables(
        tb: list, 
        split_word:str="", 
        pad_top_tb1:int=0,
        pad_bottom_tb1:int=0,
        pad_top_tb2:int=0,
        pad_bottom_tb2:int=0
        ):
    for i in range(len(tb)):
        if split_word in tb[i]:
            tb1 = tb[pad_top_tb1:i+pad_bottom_tb1]
            tb2 = tb[i+pad_top_tb2:] if pad_bottom_tb2==0 else tb[i+pad_top_tb2:pad_bottom_tb2] 
    return tb1, tb2

def normalize_text_cobol(tb: list) -> list:
    for i in range(len(tb)):
        for search, correct in get_normalized_cobol_texts().items():
            tb[i] = tb[i].replace(search, correct.lower())
    return tb

def correct_data(datas:list) -> list:
    for data in datas:
        i = 0
        while i < len(data) - 1:
            if "." not in data[i]:
                data[i] += data[i+1]
                del data[i+1]
            else:
                i += 1
    return datas

def normalize_data_items(items:list) ->list:
    return ["ac_trab" if "ac_trab-" in item else item for item in items]

def extract_tables(tb:list):
    tb = normalize_text_cobol(tb)
    headers = tb[0].split()
    datas = tb[1:]
    for i in range(len(datas)):
        datas[i] = datas[i].split()
    items = [data[0] for data in datas]
    items = normalize_data_items(items)
    datas = [data[1:] for data in datas]
    datas = correct_data(datas)
    return headers, items, datas

def cast_table_to_df(tb:list) -> pd.DataFrame:
    headers, items, datas = extract_tables(tb)
    dictionary = {' ': items}
    for header, column in zip(headers, zip(*datas)):
        dictionary[header] = float(column) if isinstance(column, float) else column
    return pd.DataFrame(dictionary)

def extract_text_from_page(page:PageObject):
    text = page.extract_text()
    lines = text.split("\n")
    lines = [line for line in lines if not line.isspace()]
    return lines

def get_op(lines:list[str]):
    op_code = int(lines[3].strip().split(" - ")[0].replace(" ", ""))
    op_name = lines[3].strip().split(" - ")[1]
    return op_code, op_name

def get_subtables(lines:list[str]):
    tb_clt, tb_aut = split_tables(lines, "A U T A R Q U I C O", 7, -1, 2, -1)
    tb_clt_apropriado, tb_clt_realizado = split_tables(tb_clt, "R E V E R S A O", 1, 0, 1)
    tb_aut_apropriado, tb_aut_realizado = split_tables(tb_aut, "R E V E R S A O", 1, 0, 1)
    tb_clt_apropriado   = cast_table_to_df(tb_clt_apropriado)
    tb_clt_realizado    = cast_table_to_df(tb_clt_realizado)
    tb_aut_apropriado   = cast_table_to_df(tb_aut_apropriado)
    tb_aut_realizado    = cast_table_to_df(tb_aut_realizado)
    return tb_clt_apropriado, tb_clt_realizado, tb_aut_apropriado, tb_aut_realizado

def add_result(results:dict, op_code:int, op_name:str, 
               tb_clt_apropriado:pd.DataFrame, tb_clt_realizado:pd.DataFrame, 
               tb_aut_apropriado:pd.DataFrame, tb_aut_realizado:pd.DataFrame):
    results[op_code] = {
                    "codigo"    : op_code,
                    "unidade"   : op_name,
                    "clt" : {
                        "apropriado"    : tb_clt_apropriado,
                        "realizado"     : tb_clt_realizado
                    }, 
                    "autarquico" : {
                        "apropriado"    : tb_aut_apropriado,
                        "realizado"     : tb_aut_realizado
                    }
                }
    return results

def get_cobol_data_from(file_path:str):
    results = {}
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        pages = pdf_reader.pages
        for number_page, page in enumerate(pages, start=1):
            lines = extract_text_from_page(page)
            op_code, op_name = get_op(lines)
            if(number_page %2 == 1):
                tb_clt_apropriado, tb_clt_realizado, tb_aut_apropriado, tb_aut_realizado = get_subtables(lines)
                results = add_result(results, op_code, op_name, tb_clt_apropriado, tb_clt_realizado, tb_aut_apropriado, tb_aut_realizado)
    return results

EXECUÇÃO PRINCIPAL

In [3]:
results = get_cobol_data_from("RHPP05LA.PDF")

code = 99998

display(results[code]['clt']['apropriado'])
display(results[code]['clt']['realizado'])
display(results[code]['autarquico']['apropriado'])
display(results[code]['autarquico']['realizado'])

TypeError: float() argument must be a string or a real number, not 'list'