<br>

# Introdução


In [None]:
#!pip3 install PyPDF2 --upgrade
#!pip3 install screeninfo

In [None]:
import os
import re
import shutil
from zipfile import ZipFile
from pathlib import Path
from PyPDF2 import PdfMerger, PdfReader


<br>

## Paths


In [None]:
def set_directories(zipfile_file):
    """
    Prepare the working directories used to process a .zip file.

    The output directory is created next to the .zip file and is named after
    it (extension removed); an 'apartados' subdirectory is created inside it.
    Any previous content of the output directory is deleted first.

    :param zipfile_file: path to the .zip file (extension is case-insensitive).
    :return: tuple (input_path, output_path, output_apartados_path), or
        (0, 0, 0) when the file does not have a .zip extension.
    """
    # Validate the extension before touching the disk. splitext removes the
    # extension whatever its case, so 'X.ZIP' never maps onto the archive
    # itself and a bare '.zip' name never maps onto the input directory.
    stem, extension = os.path.splitext(os.path.basename(zipfile_file))
    if extension.lower() != '.zip':
        print('Erro: Selecionar um arquivo .zip')
        return 0, 0, 0

    # Paths
    input_path = os.path.dirname(os.path.abspath(zipfile_file))
    output_path = os.path.join(input_path, stem)
    output_apartados_path = os.path.join(output_path, 'apartados')

    # Clean Directories
    shutil.rmtree(output_path, ignore_errors=True)

    # Make Directories (creating the nested one also creates output_path)
    os.makedirs(output_apartados_path, exist_ok=True)
    return input_path, output_path, output_apartados_path


In [None]:
# Main input: relative path to the .zip file to be processed.
# set_directories() deletes and recreates the working directories next to it
# and returns the three paths used by the following cells.
zipfile_file = os.path.join(
    '..', 'data', '1010642-60.2020.8.26.0019 pequeno.zip'
)
input_path, output_path, output_apartados_path = set_directories(zipfile_file)


<br>

# Zip


In [None]:
def unzip_zipfile(zipfile_file, output_path):
    """
    Extract a single .zip file into an existing directory.

    :param zipfile_file: path to the .zip file to extract.
    :param output_path: existing directory that receives the content.
    :return: status message (also printed): the success message, the
        validation message for an invalid argument, or 'Erro: ...' when the
        extraction itself fails.
    """
    print('> Etapa 1: Descompacta arquivo')

    # Stop at the first invalid argument instead of printing a warning and
    # attempting the extraction anyway.
    if not os.path.isfile(zipfile_file):
        msg = 'É necessário selecionar um arquivo .zip'
        print(msg)
        return msg

    if not os.path.isdir(output_path):
        msg = 'É necessário selecionar uma pasta'
        print(msg)
        return msg

    try:
        with ZipFile(zipfile_file, 'r') as zip_obj:
            # Extract all the contents of zip file in different directory
            zip_obj.extractall(output_path)
        msg = '> Etapa 1: Concluída.'
    except Exception as e:
        # Best effort: report the failure (e.g. corrupted archive) as text
        msg = 'Erro: {}'.format(e)
    print(msg)
    return msg


In [None]:
# Extract the archive into the 'apartados' working directory.
# zipfile_file is assigned again here with the same path passed earlier to
# set_directories().
zipfile_file = os.path.join(
    '..', 'data', '1010642-60.2020.8.26.0019 pequeno.zip'
)
unzip_zipfile(zipfile_file, output_apartados_path)


<br>

## Função

Função que extrai todo o conteúdo que se encontra entre as palavras "pag " e ").pdf".
Necessária para obter o número da primeira página do documento.


In [None]:
def extract_text(filename):
    """
    Build the new file name by prefixing it with its first page number.

    The page number is the first integer that follows "pag " in the name, as
    in 'Documento 25 (pag 1249 - 1251).pdf' or 'Administrativa (pag 2641).pdf'.

    :param filename: file name (not a full path) containing "pag <number>".
    :return: '<first page> - <filename>'.
    :raises ValueError: when the name has no "pag <number>" marker.
    """
    # Require digits right after "pag " so that a name that merely contains
    # "pag " inside another word (e.g. 'Propag (pag 10).pdf') still resolves
    # to the real page marker.
    match = re.search(r'pag\s+(\d+)', filename)
    if match is None:
        raise ValueError(
            'Número da página inicial não encontrado em: {}'.format(filename)
        )

    page_1st = int(match.group(1))
    return '{} - {}'.format(page_1st, filename)


In [None]:
# Quick check with two sample names: a page range and a single page
sample_names = (
    'Documento 25 (pag 1249 - 1251).pdf',
    'Administrativa (pag 2641).pdf',
)
for sample_name in sample_names:
    print(extract_text(sample_name))


<br>

# Principal


In [None]:
def get_n_files(input_path):
    """
    Count the files found under input_path, subdirectories included.
    """
    return sum(len(names) for _root, _subdirs, names in os.walk(input_path))


In [None]:
def rename_files(input_files_path):
    """
    Rename every file found under input_files_path (subdirectories included).

    Each file receives the name produced by extract_text() and is placed
    directly inside input_files_path. A progress line is printed per file.
    """
    print('> Etapa 2: Renomear Arquivos')

    # Progress counters
    total = get_n_files(input_files_path)
    current = 0

    for folder, _subdirs, names in os.walk(input_files_path):
        for name in names:
            current += 1
            source = os.path.join(folder, name)
            target = os.path.join(input_files_path, extract_text(name))
            percent = int(current / total * 100)
            print(
                'Arquivo {} de {} renomeado - {}% concluído. Aguarde.'.format(
                    current, total, percent
                )
            )
            os.rename(source, target)

    print('> Etapa 2: Concluída.')


In [None]:
# Rename the extracted files so each name starts with its first page number
rename_files(output_apartados_path)


<br>

# Merge PDFs


In [None]:
def adjust_bookmark(filename):
    """
    Derive the bookmark title from a file name.

    Drops everything up to the first ' - ' separator (when present) and
    removes the '.pdf' text from what is left.
    """
    _prefix, separator, remainder = filename.partition(' - ')
    title = remainder if separator else filename
    return title.replace('.pdf', '')


In [None]:
# Quick check of adjust_bookmark() with a sample (already renamed) file name
adjust_bookmark('208 - Documento 4 (pag 208 - 209).pdf')


In [None]:
def get_int(name):
    """
    Sort key for file names: the integer found before the first ' - '.
    """
    prefix, _separator, _rest = name.partition(' - ')
    return int(prefix)


def sort_files_as_list(path):
    """
    Return the names of the files directly inside path, sorted with get_int
    as the sort key.
    """
    # Keep regular files only: a plain os.listdir(path) would also return
    # directories, which breaks the sort key.
    only_files = []
    for entry in os.listdir(path):
        if os.path.isfile(os.path.join(path, entry)):
            only_files.append(entry)

    return sorted(only_files, key=get_int)


In [None]:
# List the files in the 'apartados' directory, sorted by their numeric prefix
list_files = sort_files_as_list(output_apartados_path)


In [None]:
def merge_files(input_files_path, output_file_path, filename):
    """
    Merge the PDF files of a directory into a single PDF with bookmarks.

    Files are appended in the order given by sort_files_as_list() and each
    one gets a bookmark named by adjust_bookmark().

    :param input_files_path: directory containing the PDF files to merge.
    :param output_file_path: directory where the merged PDF is written.
    :param filename: name of the merged PDF file.
    :return: 0 when the merge finishes.
    """
    print('> Etapa 3: Unifica Arquivos')

    # List Files
    list_files = sort_files_as_list(input_files_path)

    # The progress total is the number of files actually merged (top-level
    # files only), not a recursive count of the directory tree.
    n_files = len(list_files)

    merged_object = PdfMerger()
    try:
        for n_file, file in enumerate(list_files, start=1):
            bookmark = adjust_bookmark(file)
            # PdfReader's second positional argument is `strict`, not a file
            # mode: passing 'rb' silently enabled strict parsing.
            merged_object.append(
                PdfReader(os.path.join(input_files_path, file)),
                bookmark,
            )
            print(
                f'Arquivo {n_file} de {n_files} juntado - {int(n_file/n_files*100)}% concluído. Aguarde.'
            )

        # Write all the appended files into the output PDF
        merged_object.write(os.path.join(output_file_path, filename))
    finally:
        # Release the file handles held by the merger, even on failure
        merged_object.close()

    print('> Etapa 3: Concluída.')
    return 0


In [None]:
def create_output_filename(zipfile_file):
    """
    Derive the merged PDF file name from the .zip file path.

    Only the final extension is replaced, regardless of its case; a name
    without a .zip extension is returned unchanged.

    :param zipfile_file: path (or name) of the .zip file.
    :return: base name with the .zip extension replaced by .pdf.
    """
    output_filename = os.path.basename(zipfile_file)
    stem, extension = os.path.splitext(output_filename)
    # Swap the extension only, so a '.zip' in the middle of the name survives
    if extension.lower() == '.zip':
        output_filename = stem + '.pdf'
    return output_filename


In [None]:
# Name of the merged PDF, derived from the .zip file name
output_filename = create_output_filename(zipfile_file)
output_filename


In [None]:
# Merge the renamed PDFs; the result is written to input_path (the directory
# that holds the .zip file)
merge_files(output_apartados_path, input_path, output_filename)

# Remove the working directory (extracted files included) once the merge is
# done; errors during removal are ignored
shutil.rmtree(output_path, ignore_errors=True)


In [None]:
# Entry-point guard: prints an identification message when run as the main
# module
if __name__ == '__main__':
    print('Módulo Jupyter')


<br>

# Export


In [None]:
import os
from traitlets.config import Config
from nbconvert import PythonExporter
from nbconvert.preprocessors import TagRemovePreprocessor


In [None]:
# Notebook to export, resolved against the current working directory
filename = 'esaj_functions.ipynb'
notebook = os.path.join(os.getcwd(), filename)


In [None]:
# Export configuration: keep code cells; exclude markdown, raw cells, input
# prompts and outputs; enable tag-based removal for the 'remove_cell' and
# 'remove_output' tags
c = Config()
c.TagRemovePreprocessor.enabled = True
c.ClearOutputPreprocessor.enabled = True
c.TemplateExporter.exclude_markdown = True
c.TemplateExporter.exclude_code_cell = False
c.TemplateExporter.exclude_input_prompt = True
c.TemplateExporter.exclude_output = True
c.TemplateExporter.exclude_raw = True
c.TagRemovePreprocessor.remove_cell_tags = ('remove_cell',)
c.TagRemovePreprocessor.remove_input_tags = ('remove_cell',)
c.TagRemovePreprocessor.remove_all_outputs_tags = ('remove_output',)
c.preprocessors = ['TagRemovePreprocessor']
c.PythonExporter.preprocessors = [
    'nbconvert.preprocessors.TagRemovePreprocessor'
]

# Build a Python exporter and register the tag-removal preprocessor on it.
# NOTE(review): py_exporter is not used afterwards; the export below creates
# a new PythonExporter(config=c). Confirm which instance is intended.
py_exporter = PythonExporter(config=c)
py_exporter.register_preprocessor(TagRemovePreprocessor(config=c), True)

# Run the export; the result is unpacked into `body` (the text written to the
# .py file below) and `metadata`
body, metadata = PythonExporter(config=c).from_filename(notebook)

# Write the exported Python source to ../src/esaj_functions.py
with open(
    os.path.join('..', 'src', 'esaj_functions.py'), 'w', encoding='utf-8'
) as f:
    f.write(body)
