IMPORTS AND INSTALL

In [19]:
# imports
from PyPDF2 import PdfReader
import pandas as pd

# location of the input PDF file
file_path = 'RHPP05LA.PDF'

FUNÇÕES

In [20]:
def split_tables(
        tb: list,
        split_word:str="",
        pad_top_tb1:int=0,
        pad_bottom_tb1:int=0,
        pad_top_tb2:int=0,
        pad_bottom_tb2:int=0
        ):
    """Split a list of text lines into two tables around a marker line.

    The marker is the LAST line of ``tb`` that contains ``split_word``; when
    several lines match, later matches override earlier ones.

    Args:
        tb: Lines of text to split.
        split_word: Substring that identifies the marker line. The default
            ``""`` is contained in every string, so the last line of ``tb``
            becomes the marker.
        pad_top_tb1: Absolute start index of the first table.
        pad_bottom_tb1: Offset added to the marker index to obtain the
            (exclusive) end of the first table.
        pad_top_tb2: Offset added to the marker index to obtain the start of
            the second table.
        pad_bottom_tb2: Absolute (exclusive) end index of the second table.
            ``0`` means "up to the end of ``tb``". Note that, unlike
            ``pad_bottom_tb1``, this value is NOT relative to the marker.

    Returns:
        A tuple ``(tb1, tb2)`` holding the two slices of ``tb``.

    Raises:
        ValueError: If no line contains ``split_word`` (this includes an
            empty ``tb``).
    """
    marker = None
    for index, line in enumerate(tb):
        if split_word in line:
            # keep scanning: the last matching line wins
            marker = index

    if marker is None:
        raise ValueError(
            f"split word {split_word!r} was not found in any of the {len(tb)} lines"
        )

    tb1 = tb[pad_top_tb1:marker + pad_bottom_tb1]
    # 0 is the sentinel for an open-ended slice
    tb2_end = None if pad_bottom_tb2 == 0 else pad_bottom_tb2
    tb2 = tb[marker + pad_top_tb2:tb2_end]
    return tb1, tb2

def normalize_text_cobol(tb: list) -> list:
    """Rewrite report labels and number formatting in every line of ``tb``.

    Each line goes through an ordered series of plain substring replacements.
    The order matters: longer labels are handled before the shorter labels
    they contain, and every "." is removed before "," is turned into ".", so
    a value such as "1.234,56" ends up as "1234.56".

    The list is modified in place and the same list object is returned.

    Args:
        tb: Lines of text (strings).

    Returns:
        ``tb`` itself, with every line rewritten.
    """
    replacements = (
        ("1/12 FERIAS", "ferias"),
        ("1/12 ferias", "ferias"),
        ("1/12 DE 1 /3 FERIAS", "13_ferias"),
        ("1 /3 ferias", "13_ferias"),
        ("1 /3 FERIAS", "13_ferias"),
        ("1/12 DE 13_ferias", "13_ferias"),
        ("TOTAL ferias", "total_ferias"),
        ("TOTAL FERIAS", "total_ferias"),
        ("1/12 13 SAL ARIO", "13_salario"),
        ("13 SAL ARIO", "13_salario"),
        ("T O T A L", "total"),
        ("PROVISIONAMENTO", "apropriacao"),
        ("INSS  20,00%", "inss"),
        ("SP-PREVCOM", "spprevcom"),
        ("AC TRAB  ", "ac_trab-"),
        ("FGTS 8%", "fgts"),
        ("TOTAL DE ENCARGOS", "total_encargos"),
        (".", ""),
        (",", "."),
        ("SPPREV", "spprev"),
        ("FERIAS", "ferias"),
    )

    for position, line in enumerate(tb):
        for old, new in replacements:
            line = line.replace(old, new.lower())
        # write back into the caller's list (in-place contract)
        tb[position] = line

    return tb

def correct_data(datas:list) -> list:
    """Glue together number tokens that were split into several pieces.

    Within each row, a token that contains no "." is concatenated with the
    token that follows it, repeatedly, until the merged token contains a "."
    or no following token is left. The last token of a row is never checked.

    Rows are modified in place and the same outer list is returned.

    Args:
        datas: List of rows, each row being a list of string tokens.

    Returns:
        ``datas`` itself, with each row merged.
    """
    for row in datas:
        pos = 0
        while pos + 1 < len(row):
            if "." in row[pos]:
                # token is complete; move on to the next one
                pos += 1
                continue
            # fuse the fragment with its right neighbour and re-check it
            row[pos:pos + 2] = [row[pos] + row[pos + 1]]
    return datas

def extract_tables(tb:list):
    """Break a raw text table into headers, row labels and row values.

    ``tb`` is first passed through ``normalize_text_cobol`` (which rewrites
    the list in place). The first line supplies the column headers; every
    following line is split on whitespace, its first token becoming the row
    label and the remaining tokens the row values, which are then repaired
    by ``correct_data``.

    Rows that contain no tokens at all (empty or whitespace-only lines) are
    skipped instead of raising ``IndexError``.

    Args:
        tb: Lines of the table, header line first.

    Returns:
        A tuple ``(headers, items, datas)``: list of header names, list of
        row labels, and list of per-row value lists (strings).

    Raises:
        IndexError: If ``tb`` is empty.
    """
    tb = normalize_text_cobol(tb)

    headers = tb[0].split()

    # str.split() with no separator yields [] for blank lines; drop those rows
    rows = [tokens for tokens in (line.split() for line in tb[1:]) if tokens]

    items = [tokens[0] for tokens in rows]
    datas = correct_data([tokens[1:] for tokens in rows])

    return headers, items, datas

def _column_as_floats(column):
    """Return ``column`` as a list of floats, or unchanged if any cell is not numeric."""
    try:
        return [float(value) for value in column]
    except (TypeError, ValueError):
        return column


def cast_table_to_df(tb:list) -> pd.DataFrame:
    """Build a DataFrame from a raw text table.

    The table is parsed with ``extract_tables``. Row labels go into a column
    named ``' '``; each header becomes a column holding the matching values
    of every row. Columns whose cells all parse as numbers are stored as
    floats; a column with any non-numeric cell keeps its original strings.

    Args:
        tb: Lines of the table, header line first.

    Returns:
        A DataFrame with one row per table row.
    """
    headers, items, datas = extract_tables(tb)

    frame = {' ': items}

    # zip(*datas) transposes the row-wise values into columns
    for header, column in zip(headers, zip(*datas)):
        frame[header] = _column_as_floats(column)

    return pd.DataFrame(frame)

def cast_table_to_tuple(tb:list) -> dict:
    """Build a nested dict ``{row_label: {header: value}}`` from a raw text table.

    Despite its name, this function returns a dict, not a tuple. The table is
    parsed with ``extract_tables`` and every value is converted with
    ``float``. If a row label appears more than once, the last row wins.

    Args:
        tb: Lines of the table, header line first.

    Returns:
        Dict mapping each row label to a dict of ``header -> float``.

    Raises:
        ValueError: If a value cannot be converted to ``float``.
    """
    headers, items, datas = extract_tables(tb)

    return {
        item: {header: float(value) for header, value in zip(headers, row)}
        for item, row in zip(items, datas)
    }

def get_cobol_data_from(file_path:str):
    """Read the PDF report at ``file_path`` and collect its tables per unit.

    Only odd-numbered pages (counting from 1) are processed; even pages are
    skipped without reading their text. On each processed page the line at
    index 3 is expected to look like ``"<code> - <name>"``. The page is then
    split at the ``"A U T A R Q U I C O"`` marker into a "clt" and an
    "autarquico" section, and each section is split at the
    ``"R E V E R S A O"`` marker into an "apropriado" and a "realizado"
    table, each converted with ``cast_table_to_df``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Dict keyed by the integer unit code. Each value is a dict with the
        keys ``"codigo"``, ``"unidade"``, ``"clt"`` and ``"autarquico"``; the
        last two hold ``"apropriado"`` and ``"realizado"`` DataFrames. If a
        code appears on more than one processed page, the last page wins.
    """
    results = {}

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)

        for number_page, page in enumerate(pdf_reader.pages, start=1):
            # even pages contribute nothing, so do not parse them at all
            if number_page % 2 == 0:
                continue

            text = page.extract_text()
            # NOTE: "".isspace() is False, so empty strings are kept here;
            # the fixed indices/offsets below depend on this exact indexing
            lines = [line for line in text.split("\n") if not line.isspace()]

            # header line: "<code> - <name>" (code may contain spaces)
            header_parts = lines[3].strip().split(" - ")
            op_code = int(header_parts[0].replace(" ", ""))
            op_name = header_parts[1]

            # clt = lines[7 : marker-1], autarquico = lines[marker+2 : -1]
            # (offsets presumably tuned to the report layout - verify against the PDF)
            tb_clt, tb_aut = split_tables(lines, "A U T A R Q U I C O", 7, -1, 2, -1)

            # within each section: drop its first line and the marker line itself
            tb_clt_apropriado, tb_clt_realizado = split_tables(tb_clt, "R E V E R S A O", 1, 0, 1)
            tb_aut_apropriado, tb_aut_realizado = split_tables(tb_aut, "R E V E R S A O", 1, 0, 1)

            results[op_code] = {
                "codigo"    : op_code,
                "unidade"   : op_name,
                "clt" : {
                    "apropriado"    : cast_table_to_df(tb_clt_apropriado),
                    "realizado"     : cast_table_to_df(tb_clt_realizado)
                },
                "autarquico" : {
                    "apropriado"    : cast_table_to_df(tb_aut_apropriado),
                    "realizado"     : cast_table_to_df(tb_aut_realizado)
                }
            }
    return results

EXECUÇÃO PRINCIPAL

In [21]:
# parse the report configured in `file_path` (defined in the imports cell)
results = get_cobol_data_from(file_path)

# show one table as a sanity check: unit 40434, CLT, "realizado"
display(results[40434]['clt']['realizado'])

Unnamed: 0,Unnamed: 1,ferias,13_ferias,total_ferias,13_salario,total
0,apropriacao,395869.72,131954.37,527824.09,13408.63,541232.72
1,inss,73436.62,24477.13,97913.75,2630.23,100543.98
2,ac_trab-0.5000%,1841.74,611.38,2453.12,65.66,2518.78
3,fgts,29549.08,9847.95,39397.03,1052.06,40449.09
4,spprevcom,0.0,0.0,0.0,0.0,0.0
5,total_encargos,104827.44,34936.46,139763.9,3747.95,143511.85
6,total,500697.16,166890.83,667587.99,17156.58,684744.57
