## A Python script project for managing and organizing pages in PDF files

>Developed by [@edyatl](https://github.com/edyatl) February 2023 <edyatl@yandex.ru>

In [1]:
# Load the nb_black Jupyter extension, which auto-formats code cells with Black
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Install a pip package in the current Jupyter kernel
# import sys

# !{sys.executable} -m pip install -U PyPDF2

<IPython.core.display.Javascript object>

In [23]:
import os
import glob
from PyPDF2 import PdfReader, PdfWriter


<IPython.core.display.Javascript object>

In [4]:
# Sheet formats in millimetres, one row per format: [name, long side, short side].
# The names use the Cyrillic letter "х" as the multiplier sign, matching the
# keys of PaperSizes.
gost_size = [
    ["A0", 1189, 841],
    ["A1", 841, 594],
    ["A2", 594, 420],
    ["A3", 420, 297],
    ["A4", 297, 210],
    ["A4х3", 630, 297],
    ["A4х4", 841, 297],
    ["A4х5", 1051, 297],
    ["A4х6", 1261, 297],
    ["A4х7", 1471, 297],
    ["A4х8", 1682, 297],
    ["A4х9", 1892, 297],
    ["A3х3", 891, 420],
    ["A3х4", 1189, 420],
    ["A3х5", 1486, 420],
    ["A3х6", 1783, 420],
    ["A3х7", 2080, 420],
    ["A2х3", 1261, 594],
    ["A2х4", 1682, 594],
    ["A2х5", 2102, 594],
    ["A1х3", 1783, 841],
    # Was a second "A1х3" row; 2378 mm = 4 x 594.5 mm, i.e. the A1х4 format.
    ["A1х4", 2378, 841],
    ["A0х2", 1682, 1189],
    ["A0х3", 2523, 1189],
]
# Known page formats as name -> [short side, long side].
# The values are presumably PDF points (1/72 inch); they are compared directly
# with the page mediabox dimensions in find_fmt.
# find_fmt normalises every page to portrait before matching, so an entry whose
# first number is larger than the second can never be matched.
PaperSizes = {  # add new: ensure that first number is <= second number
    "A0": [2384, 3370],
    "A1": [1684, 2384],
    "A2": [1190, 1684],
    "A3": [842, 1190],
    "A4": [595, 842],
    "A5": [420, 595],
    "A6": [298, 420],
    "A7": [210, 298],
    "A8": [148, 210],
    "B0": [2835, 4008],
    "B1": [2004, 2835],
    "B2": [1417, 2004],
    "B3": [1001, 1417],
    "B4": [709, 1001],
    "B5": [499, 709],
    "B6": [354, 499],
    "B7": [249, 354],
    "B8": [176, 249],
    "B9": [125, 176],
    "B10": [88, 125],
    # C series: entries were stored long-side-first (C4-C6) or with values that
    # match no real format (C2, C3); corrected to ISO 269 sizes, short side first.
    "C2": [1298, 1837],  # 458 x 648 mm
    "C3": [919, 1298],  # 324 x 458 mm
    "C4": [649, 919],  # 229 x 324 mm
    "C5": [459, 649],  # 162 x 229 mm
    "C6": [323, 459],  # 114 x 162 mm
    "Invoice": [396, 612],
    "Executive": [522, 756],
    "Letter": [612, 792],
    "Legal": [612, 1008],
    "Ledger": [792, 1224],
    # Extended formats from gost_size, converted from millimetres.
    "A4х3": [842, 1785],
    "A4х4": [842, 2383],
    "A4х5": [842, 2978],
    "A4х6": [842, 3573],
    "A4х7": [842, 4168],
    "A4х8": [842, 4766],
    "A4х9": [842, 5361],
    "A3х3": [1190, 2526],
    "A3х4": [1190, 3371],
    "A3х5": [1190, 4213],
    "A3х6": [1190, 5055],
    "A3х7": [1190, 5897],
    "A2х3": [1684, 3573],
    "A2х4": [1684, 4766],
    "A2х5": [1684, 5956],
    "A1х3": [2384, 5055],
    # Was a duplicate "A1х3" key, which silently replaced the entry above.
    "A1х4": [2384, 6742],
    "A0х2": [3370, 4768],
    "A0х3": [3370, 7152],
}
# Add dimensions of customer's formats converted from mm
# for fmt in gost_size:
#     if len(fmt[0]) > 2 and fmt[0][1:3] == "0х":
#         print(
#             f"'{fmt[0]}': [{PaperSizes['A0'][1]}, {fmt[1]/gost_size[0][2]*PaperSizes['A0'][0]:.0f}],"
#         )

<IPython.core.display.Javascript object>

In [5]:
# Source PDF: "шаблон/Binder1.pdf" under the parent of the current working directory
pdf_src = os.path.join(os.path.abspath(os.pardir), "шаблон/Binder1.pdf")

<IPython.core.display.Javascript object>

### TODO

* ~~Collect information on the total number of pages of each format~~
* ~~Add dimensions of customer's formats converted from mm~~
* ~~Write PDF with pages only one size~~
* Recursively get all PDF filenames with full path from a given directory

In [8]:
def find_fmt(w, h, orient: bool = True):
    """Name the entry of ``PaperSizes`` closest to a page of size ``w`` x ``h``.

    The page is rounded to whole units and normalised to portrait (the two
    values are swapped when ``w > h``) before it is compared with each
    ``PaperSizes`` entry by the sum of absolute differences of both sides.

    :param w: page width, in the same units as ``PaperSizes``
    :param h: page height, in the same units as ``PaperSizes``
    :param orient: when true, append "-P" (``w <= h``) or "-L" (``w > h``)
        to the format name
    :return: the format name when the total difference is 0 or 1; otherwise
        a descriptive string with the rounded page size and the closest format
    """
    is_portrait = w <= h
    rounded = (int(round(w, 0)), int(round(h, 0)))
    norm_w, norm_h = rounded if is_portrait else rounded[::-1]

    def distance(dims):
        return abs(norm_w - dims[0]) + abs(norm_h - dims[1])

    best_dist = min(distance(dims) for dims in PaperSizes.values())
    # Several formats may share the minimum distance; the last one listed wins.
    best_name = [
        name for name, dims in PaperSizes.items() if distance(dims) == best_dist
    ][-1]

    suffix = "-P" if is_portrait else "-L"
    label = best_name + suffix if orient else best_name

    # Report the reference size in the same orientation as the page itself.
    first, second = (0, 1) if is_portrait else (1, 0)
    ref_size = (
        str(PaperSizes[best_name][first]) + " x " + str(PaperSizes[best_name][second])
    )

    if best_dist in (0, 1):  # exact fit
        return label
    # else show best fit
    return f"{norm_w} x {norm_h} (other), closest: {label} = {ref_size}"

<IPython.core.display.Javascript object>

In [40]:
# Collect information on the total number of pages of each format
def get_format_info(file_path):
    """Count the pages of each format in a PDF and print a summary table.

    Every page's mediabox is classified with ``find_fmt`` without the
    orientation suffix.

    :param file_path: path of the PDF file to read
    :return dict: format name -> number of pages with that format
    """
    format_info = {}
    for page in PdfReader(file_path).pages:
        box = page.mediabox
        name = find_fmt(box.width, box.height, False)
        format_info[name] = format_info.get(name, 0) + 1

    header = "{:^9} {:^9}\n--------  --------".format("Format", "Count")
    print(header)
    for fmt in sorted(format_info):
        cnt = format_info[fmt]
        print(f"{fmt:^9} {cnt:^9}")
    return format_info

<IPython.core.display.Javascript object>

In [41]:
# Tally the page formats of the source PDF; the call also prints the summary table
format_info = get_format_info(pdf_src)

 Format     Count  
--------  --------
   A0         3    
   A1         1    
   A2         1    
   A4         3    


<IPython.core.display.Javascript object>

In [22]:
def _save_pages(pages, out_path):
    """Write ``pages`` to a new PDF at ``out_path``, tagged with PDFSort metadata."""
    writer = PdfWriter()
    for pg in pages:
        writer.add_page(pg)
    # Add the metadata
    writer.add_metadata(
        {
            "/Creator": "PDFSort",
            "/Producer": "PDFSort",
        }
    )
    with open(out_path, "wb") as f:
        writer.write(f)
    writer.close()


def write_fmt_file(fmt, pages, limit: int = 0):
    """Save all pages of one format to PDF file(s) in the working directory.

    A page is selected when ``find_fmt`` (without orientation suffix) returns
    ``fmt`` for its mediabox.

    :param fmt: format name as returned by ``find_fmt(..., False)``
    :param pages: iterable of PDF pages (e.g. ``PdfReader(...).pages``)
    :param limit: maximum number of pages per output file. When it is positive
        and smaller than the number of selected pages, the pages are split
        into ``current_dir_{fmt}_pdf-{i}.pdf`` files (``i`` starts at 0);
        otherwise a single ``current_dir_{fmt}_pdf.pdf`` is written.
    """
    matching = [
        pg
        for pg in pages
        if find_fmt(pg.mediabox.width, pg.mediabox.height, False) == fmt
    ]
    total = len(matching)

    if 0 < limit < total:
        # Stepping by ``limit`` yields exactly ceil(total / limit) chunks, so no
        # empty trailing file is written when ``total`` is a multiple of ``limit``.
        for index, start in enumerate(range(0, total, limit)):
            # Save the new PDF to a file with index
            _save_pages(
                matching[start : start + limit],
                f"current_dir_{fmt}_pdf-{index}.pdf",
            )
    else:
        # Save the new PDF to a file
        _save_pages(matching, f"current_dir_{fmt}_pdf.pdf")

<IPython.core.display.Javascript object>

In [21]:
# Write one set of PDF files per detected format, at most 2 pages per file.
# The reader is opened here because get_format_info keeps its own reader local;
# without this line the cell fails with NameError on a fresh kernel.
reader = PdfReader(pdf_src)
for fmt in format_info:
    print(fmt)
    write_fmt_file(fmt, reader.pages, 2)

A4
A2
A0
A1


<IPython.core.display.Javascript object>

In [37]:
def list_files_recursive(dirpath):
    """
    Recursively get all PDF filenames with full path from a given directory

    The extension is matched case-insensitively (".pdf", ".PDF", ".Pdf", ...).
    The directory path is escaped before it is put into the glob pattern, so
    directories whose names contain ``[``, ``]``, ``*`` or ``?`` are searched
    literally instead of being interpreted as wildcards.

    :param dirpath: directory to search; may be relative
    :return list: absolute paths of the matching entries
    """
    root = glob.escape(os.path.abspath(dirpath))
    pattern = os.path.join(root, "**", "*.[pP][dD][fF]")
    return glob.glob(pattern, recursive=True)

<IPython.core.display.Javascript object>

In [43]:
# list_files_recursive("../")

<IPython.core.display.Javascript object>