# Extrair Tabelas dos Documentos PDF
---
Glenn Abastillas

Este caderno tem o código que extrae tabelas dos documentos PDF.

In [9]:
from pytesseract import image_to_string
from pdf2image import convert_from_path
from tabula import read_pdf
from glob import glob
from pathlib import Path

### Arquivos e Archivos

In [10]:
arquivo = '../../data/pdf'
archivos = glob(arquivo + '/*Boletim*.pdf')

### Funções

In [45]:
def pages(archivo):
    ''' Da o número de páginas que um archivo tem '''
    documento = convert_from_path(archivo)
    return len(documento)

def read(archivo, paginas=1):
    ''' 
    Carga e leia as tabelas num archivo PDF 
    
    Parameters
    ----------
        archivo (str) : Path to PDF to extract tables from
        paginas (int) : Number of pages in the PDF to read
    
    Returns
    -------
        List of tables and page numbers
    '''
    tabelas = []
    
    for pagina in range(paginas):
        tabela = read_pdf(archivo, pages=pagina)
        
        if isinstance(tabela, pandas.core.frame.DataFrame):
            tabelas.append(tabela)

    return tabelas

def save(texts, filename, location=Path('../../data/text')):
    ''' 
    Passe pelo texto (lista) e concadena os partes individuais então salva-las 
    
    Parameters
    ----------
        texts (list) : Array of converted PDF text to save
        filename (str) : Name of output file
        location (str) : Path to output folder
    '''
    partes = []
    
    for n, conteudo in enumerate(texts):
        parte = f'\n\nPÁGINA NO DOCUMENTO ORIGINAL : {n}\n\n{conteudo}'
        partes.append(parte)
    
    documento = '\n'.join(partes)
    
    with open(location / filename, 'w') as escritor:
        escritor.write(documento)
    

In [46]:
data = read(archivos[0], pages(archivos[0]))

'pages' argument isn't specified.Will extract only from page 1 by default.
Got stderr: Apr 19, 2020 9:32:03 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Times New Roman,Bold are not implemented in PDFBox and will be ignored
Apr 19, 2020 9:32:03 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Arial are not implemented in PDFBox and will be ignored
Apr 19, 2020 9:32:03 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: Your current java version is: 1.8.0_77
Apr 19, 2020 9:32:03 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: To get higher rendering speed on old java 1.8 or 9 versions,
Apr 19, 2020 9:32:03 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   update to the latest 1.8 or 9 version (>= 1.8.0_191 or >= 9.0.4),
Apr 19, 2020 9:32:03 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or
Apr 19, 2020 9:32:03 PM org.apache.pdfbox.rende

AttributeError: 'list' object has no attribute 'values'

In [49]:
type(data[1][0])

pandas.core.frame.DataFrame

### Carga Archivos e Salvo o Texto

In [7]:
%%time
for archivo in archivos:
    texto = read(archivo)
    save(texto, Path(archivo).stem + '.txt')

CPU times: user 4.52 s, sys: 6.48 s, total: 11 s
Wall time: 8min 39s


---
# FIM

&nbsp;