IMPORTS AND INSTALL

In [1]:
# imports
from PyPDF2 import PdfReader
import pandas as pd
import re

# path of the input PDF file
file_path = 'RHPP05LA.PDF'

FUNÇÕES

In [2]:
def split_tables(
        tb: list,
        split_word: str = "",
        pad_top_tb1: int = 0,
        pad_bottom_tb1: int = 0,
        pad_top_tb2: int = 0,
        pad_bottom_tb2: int = 0
        ):
    """Split a list of lines into two sub-lists around a marker line.

    The marker is the LAST element of ``tb`` that contains ``split_word``
    (with the default ``""`` every string matches, so the marker is the
    final element).

    Args:
        tb: Lines to split.
        split_word: Substring identifying the marker line.
        pad_top_tb1: Start index of the first sub-list.
        pad_bottom_tb1: Offset added to the marker index to get the
            (exclusive) end of the first sub-list.
        pad_top_tb2: Offset added to the marker index to get the start of
            the second sub-list.
        pad_bottom_tb2: Absolute (exclusive) end index of the second
            sub-list; ``0`` means "until the end of ``tb``".

    Returns:
        A ``(tb1, tb2)`` tuple with
        ``tb1 = tb[pad_top_tb1:marker + pad_bottom_tb1]`` and
        ``tb2 = tb[marker + pad_top_tb2:pad_bottom_tb2]`` (open-ended when
        ``pad_bottom_tb2`` is ``0``).

    Raises:
        ValueError: If no element of ``tb`` contains ``split_word``.
    """
    marker = None
    for index, line in enumerate(tb):
        if split_word in line:
            # Keep scanning: a later match overrides an earlier one.
            marker = index

    if marker is None:
        raise ValueError(
            f"split_tables: no line contains the marker {split_word!r}"
        )

    tb1 = tb[pad_top_tb1:marker + pad_bottom_tb1]
    if pad_bottom_tb2 == 0:
        tb2 = tb[marker + pad_top_tb2:]
    else:
        tb2 = tb[marker + pad_top_tb2:pad_bottom_tb2]
    return tb1, tb2

def normalize_text_cobol(tb: list) -> list:
    """Rewrite every line of ``tb`` using a fixed, ordered replacement table.

    Each ``(needle, replacement)`` pair is applied with ``str.replace`` in
    the order listed, so earlier substitutions feed later ones (e.g. ``"."``
    is removed before ``","`` is turned into ``"."`` -- presumably to convert
    numbers written with a decimal comma; confirm against the input).

    The list is modified in place and the same list object is returned.
    """
    # Order matters: longer / more specific labels come before the generic
    # ones that would otherwise match part of them.
    replacements = (
        ("1/12 FERIAS", "ferias"),
        ("1/12 ferias", "ferias"),
        ("1/12 DE 1 /3 FERIAS", "13_ferias"),
        ("1 /3 ferias", "13_ferias"),
        ("1 /3 FERIAS", "13_ferias"),
        ("1/12 DE 13_ferias", "13_ferias"),
        ("TOTAL ferias", "total_ferias"),
        ("TOTAL FERIAS", "total_ferias"),
        ("1/12 13 SAL ARIO", "13_salario"),
        ("13 SAL ARIO", "13_salario"),
        ("T O T A L", "total"),
        ("PROVISIONAMENTO", "apropriacao"),
        ("INSS  20,00%", "inss"),
        ("SP-PREVCOM", "spprevcom"),
        ("AC TRAB  ", "ac_trab-"),
        ("FGTS 8%", "fgts"),
        ("TOTAL DE ENCARGOS", "total_encargos"),
        (".", ""),
        (",", "."),
        ("SPPREV", "spprev"),
        ("FERIAS", "ferias"),
    )

    for position, line in enumerate(tb):
        for needle, replacement in replacements:
            line = line.replace(needle, replacement)
        tb[position] = line
    return tb

def correct_data(datas:list) -> list:
    """Glue together tokens that were split in the middle of a value.

    Within each row, a token that contains no ``"."`` is concatenated with
    the token that follows it; this repeats until the token at that position
    contains a ``"."`` or no following token is left. The last token of a
    row is never inspected on its own.

    Rows are modified in place and ``datas`` itself is returned.
    """
    for row in datas:
        pos = 0
        while pos + 1 < len(row):
            if "." in row[pos]:
                pos += 1
                continue
            # Absorb the neighbour and stay on the same position so the
            # merged token is re-checked.
            follower = row[pos + 1]
            del row[pos + 1]
            row[pos] = row[pos] + follower
    return datas

def normalize_data_items(items:list) ->list:
    """Return a new list where any label containing ``"ac_trab-"`` is
    replaced by the plain label ``"ac_trab"``; other labels are kept as-is.
    """
    cleaned = []
    for label in items:
        if "ac_trab-" in label:
            cleaned.append("ac_trab")
        else:
            cleaned.append(label)
    return cleaned

def extract_tables(tb:list):
    """Break a block of table lines into headers, row labels and row values.

    The lines are first passed through ``normalize_text_cobol`` (which
    rewrites ``tb`` in place). The first line is whitespace-split into the
    column headers; every remaining line is whitespace-split into a label
    (its first token, cleaned by ``normalize_data_items``) and its values
    (the remaining tokens, repaired by ``correct_data``).

    Returns:
        A ``(headers, items, datas)`` tuple: list of header strings, list of
        row labels, and list of per-row value-token lists.
    """
    normalized = normalize_text_cobol(tb)

    headers = normalized[0].split()

    # Tokenize every line after the header line.
    tokenized = [line.split() for line in normalized[1:]]

    items = normalize_data_items([tokens[0] for tokens in tokenized])
    datas = correct_data([tokens[1:] for tokens in tokenized])

    return headers, items, datas

def cast_table_to_df(tb:list) -> pd.DataFrame:
    """Build a DataFrame from a block of table lines.

    The lines are parsed with ``extract_tables``. The row labels go into a
    column named ``' '``; each header then gets one column made of the
    values found at that position in every row.

    Values are converted to ``float`` column by column. A column holding a
    token that ``float`` cannot parse is kept as the original strings, so a
    single odd cell does not abort the whole table.

    Args:
        tb: Header line followed by the data lines of one table.

    Returns:
        The table as a ``pandas.DataFrame``.
    """
    headers, items, datas = extract_tables(tb)
    dictionary = {' ': items}

    # zip(*datas) transposes rows into columns (truncated to the shortest row).
    for header, column in zip(headers, zip(*datas)):
        try:
            dictionary[header] = [float(value) for value in column]
        except ValueError:
            # Best effort: leave an unparsable column untouched.
            dictionary[header] = column

    return pd.DataFrame(dictionary)

def cast_table_to_tuple(tb:list) -> dict:
    """Build a nested dict from a block of table lines.

    Despite the function's name, the result is a dict, not a tuple:
    ``{row_label: {header: float_value, ...}, ...}``. The lines are parsed
    with ``extract_tables`` and headers are paired with each row's values
    positionally (extra headers or values are dropped by ``zip``).

    Args:
        tb: Header line followed by the data lines of one table.

    Returns:
        Mapping of row label to a mapping of header to ``float`` value. If a
        label occurs more than once, the last row with that label wins.

    Raises:
        ValueError: If a value token cannot be parsed by ``float``.
    """
    headers, items, datas = extract_tables(tb)

    return {
        item: {header: float(value) for header, value in zip(headers, row)}
        for item, row in zip(items, datas)
    }

def get_cobol_data_from(file_path:str):
    """Read the PDF at ``file_path`` and parse its odd-numbered pages.

    For every odd-numbered page (1-based) the extracted text is split into
    lines, the line at index 3 is parsed as ``"<code> - <name>"``, and the
    page is cut into four tables (CLT / autarquico, each split into
    "apropriado" and "realizado") that are converted with
    ``cast_table_to_df``. Even-numbered pages are skipped entirely.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A dict keyed by the integer code of each parsed page. Each value is
        a dict with keys ``"codigo"``, ``"unidade"``, ``"clt"`` and
        ``"autarquico"``; the last two map ``"apropriado"`` and
        ``"realizado"`` to DataFrames. A later page with the same code
        replaces the earlier entry.
    """
    results = {}

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)

        for number_page, page in enumerate(pdf_reader.pages, start=1):
            # Only odd pages contribute to the result, so skip the others
            # before paying for text extraction and header parsing.
            if number_page % 2 == 0:
                continue

            text = page.extract_text()
            # NOTE(review): "".isspace() is False, so empty strings are kept
            # here; the fixed offsets below may have been tuned with that
            # behaviour -- verify before tightening this filter.
            lines = [line for line in text.split("\n") if not line.isspace()]

            # Line index 3 carries "<code> - <name>"; the code may contain
            # spaces, which are removed before the int conversion.
            header_fields = lines[3].strip().split(" - ")
            op_code = int(header_fields[0].replace(" ", ""))
            op_name = header_fields[1]

            # First table: lines[7 : marker - 1]; second: lines[marker + 2 : -1].
            tb_clt, tb_aut = split_tables(lines, "A U T A R Q U I C O", 7, -1, 2, -1)

            entry = {
                "codigo"    : op_code,
                "unidade"   : op_name,
            }
            for regime, table in (("clt", tb_clt), ("autarquico", tb_aut)):
                # Drop the first line of the block and the marker line itself.
                tb_apropriado, tb_realizado = split_tables(table, "R E V E R S A O", 1, 0, 1)
                entry[regime] = {
                    "apropriado"    : cast_table_to_df(tb_apropriado),
                    "realizado"     : cast_table_to_df(tb_realizado),
                }

            results[op_code] = entry
    return results

EXECUÇÃO PRINCIPAL

In [3]:
# Parse the PDF once, reusing the path defined in the first cell instead of
# repeating the literal.
results = get_cobol_data_from(file_path)

# Code whose tables are displayed below.
code = 99998

# Show the four tables in a fixed order: CLT first, then autarquico, each
# with "apropriado" before "realizado".
for regime in ('clt', 'autarquico'):
    for kind in ('apropriado', 'realizado'):
        display(results[code][regime][kind])

Unnamed: 0,Unnamed: 1,ferias,13_ferias,total_ferias,13_salario,total
0,apropriacao,189086.18,63028.79,252114.97,189086.18,441201.15
1,inss,28138.69,9379.5,37518.19,28138.69,65656.88
2,ac_trab,702.74,234.26,937.0,702.74,1639.74
3,fgts,11255.03,3751.66,15006.69,11255.03,26261.72
4,spprevcom,0.0,0.0,0.0,2598.55,2598.55
5,total_encargos,40096.46,13365.42,53461.88,42695.01,96156.89
6,total,229182.64,76394.21,305576.85,231781.19,537358.04


Unnamed: 0,Unnamed: 1,ferias,13_ferias,total_ferias,13_salario,total
0,apropriacao,355099.47,118365.12,473464.59,1398.34,474862.93
1,inss,64466.55,21487.46,85954.01,279.66,86233.67
2,ac_trab,1611.85,536.32,2148.17,6.98,2155.15
3,fgts,25835.3,8610.41,34445.71,111.86,34557.57
4,spprevcom,0.0,0.0,0.0,0.0,0.0
5,total_encargos,91913.7,30634.19,122547.89,398.5,122946.39
6,total,447013.17,148999.31,596012.48,1796.84,597809.32


Unnamed: 0,Unnamed: 1,ferias,13_ferias,total_ferias,13_salario,total
0,apropriacao,424601.22,141533.72,566134.94,424601.22,990736.16
1,spprev,79748.71,0.0,79748.71,79748.71,159497.42
2,spprevcom,0.0,0.0,0.0,800.42,800.42
3,total_encargos,79748.71,0.0,79748.71,80549.13,160297.84
4,total,504349.93,141533.72,645883.65,505150.35,1151034.0


Unnamed: 0,Unnamed: 1,ferias,13_ferias,total_ferias,13_salario,total
0,apropriacao,0.0,0.0,0.0,0.0,0.0
1,spprev,0.0,0.0,0.0,0.0,0.0
2,spprevcom,0.0,0.0,0.0,0.0,0.0
3,total_encargos,0.0,0.0,0.0,0.0,0.0
4,total,0.0,0.0,0.0,0.0,0.0
