## A Python script project for managing and organizing pages in PDF files

>Developed by [@edyatl](https://github.com/edyatl) February 2023 <edyatl@yandex.ru>

In [1]:
# Load the nb_black Jupyter extension, which auto-formats code cells with Black
%load_ext nb_black

<IPython.core.display.Javascript object>

### TODO

* ~~Collect information on the total number of pages of each format~~
* ~~Add dimensions of customer's formats converted from mm~~
* ~~Write PDF with pages only one size~~
* ~~Recursively get all PDF filenames with full path from a given directory~~
* ~~Collect into the Reader content of several pdf files~~

In [2]:
import os
import glob
from PyPDF2 import PdfReader, PdfWriter


<IPython.core.display.Javascript object>

In [3]:
# Customer drawing formats as [name, long side, short side] in millimetres
# (presumably the GOST 2.301 base and extended formats - confirm with the customer).
# Names must be unique: they are used to derive the keys of `PaperSizes`.
gost_size = [
    ["A0", 1189, 841],
    ["A1", 841, 594],
    ["A2", 594, 420],
    ["A3", 420, 297],
    ["A4", 297, 210],
    ["A4х3", 630, 297],
    ["A4х4", 841, 297],
    ["A4х5", 1051, 297],
    ["A4х6", 1261, 297],
    ["A4х7", 1471, 297],
    ["A4х8", 1682, 297],
    ["A4х9", 1892, 297],
    ["A3х3", 891, 420],
    ["A3х4", 1189, 420],
    ["A3х5", 1486, 420],
    ["A3х6", 1783, 420],
    ["A3х7", 2080, 420],
    ["A2х3", 1261, 594],
    ["A2х4", 1682, 594],
    ["A2х5", 2102, 594],
    ["A1х3", 1783, 841],
    ["A1х4", 2378, 841],  # was mislabelled as a second "A1х3"
    ["A0х2", 1682, 1189],
    ["A0х3", 2523, 1189],
]
# Known page formats as [short side, long side] in PDF points (1/72 inch), rounded.
# `find_fmt` normalises every page to (short, long) before comparing, so an entry
# whose first number is larger than its second can never be matched exactly.
PaperSizes = {  # add new: ensure that first number is <= second number
    "A0": [2384, 3370],
    "A1": [1684, 2384],
    "A2": [1190, 1684],
    "A3": [842, 1190],
    "A4": [595, 842],
    "A5": [420, 595],
    "A6": [298, 420],
    "A7": [210, 298],
    "A8": [148, 210],
    "B0": [2835, 4008],
    "B1": [2004, 2835],
    "B2": [1417, 2004],
    "B3": [1001, 1417],
    "B4": [709, 1001],
    "B5": [499, 709],
    "B6": [354, 499],
    "B7": [249, 354],
    "B8": [176, 249],
    "B9": [125, 176],
    "B10": [88, 125],
    # ISO C series (458x648, 324x458, 229x324, 162x229, 114x162 mm).
    # The previous values were shifted by one row and stored long side first.
    "C2": [1298, 1837],
    "C3": [918, 1298],
    "C4": [649, 918],
    "C5": [459, 649],
    "C6": [323, 459],
    "Invoice": [396, 612],
    "Executive": [522, 756],
    "Letter": [612, 792],
    "Legal": [612, 1008],
    "Ledger": [792, 1224],
    # Extended customer formats converted from the millimetre table `gost_size`
    "A4х3": [842, 1785],
    "A4х4": [842, 2383],
    "A4х5": [842, 2978],
    "A4х6": [842, 3573],
    "A4х7": [842, 4168],
    "A4х8": [842, 4766],
    "A4х9": [842, 5361],
    "A3х3": [1190, 2526],
    "A3х4": [1190, 3371],
    "A3х5": [1190, 4213],
    "A3х6": [1190, 5055],
    "A3х7": [1190, 5897],
    "A2х3": [1684, 3573],
    "A2х4": [1684, 4766],
    "A2х5": [1684, 5956],
    "A1х3": [2384, 5055],
    # was a duplicate "A1х3" key, which silently replaced the entry above
    "A1х4": [2384, 6742],
    "A0х2": [3370, 4768],
    "A0х3": [3370, 7152],
}
# Add dimensions of customer's formats converted from mm
# for fmt in gost_size:
#     if len(fmt[0]) > 2 and fmt[0][1:3] == "0х":
#         print(
#             f"'{fmt[0]}': [{PaperSizes['A0'][1]}, {fmt[1]/gost_size[0][2]*PaperSizes['A0'][0]:.0f}],"
#         )

<IPython.core.display.Javascript object>

In [4]:
def list_files_recursive(dirpath):
    """
    Recursively get all PDF filenames with full path from a given directory.

    :param dirpath: str
        The directory in which to look for PDF files, including all nested subdirectories.
    :return: list
        Returns a sorted list of absolute PDF file paths. The ``.pdf`` extension is
        matched in any letter case; directories whose names end in ``.pdf`` are skipped.
    """
    # Escape the directory part so that characters such as "[" or "*" in the path
    # are taken literally instead of being interpreted as glob wildcards.
    root = glob.escape(os.path.abspath(dirpath))
    pattern = os.path.join(root, "**", "*.[pP][dD][fF]")

    # glob yields entries in arbitrary filesystem order and also matches directories;
    # keep regular files only and sort for a reproducible page order downstream.
    return sorted(
        path for path in glob.glob(pattern, recursive=True) if os.path.isfile(path)
    )

<IPython.core.display.Javascript object>

In [5]:
def collect_pdf_content(file_paths: list) -> list:
    """
    Gather the pages of several PDF files into one flat list.

    :param file_paths: list
        PDF filenames with full paths, read in the given order.
    :return: list
        Returns the pages of every file in `file_paths`, file after file.
    """
    # One PdfReader per file; its pages are appended in document order
    return [page for pdf_path in file_paths for page in PdfReader(pdf_path).pages]

<IPython.core.display.Javascript object>

In [6]:
def find_fmt(
    iwidth: float,
    iheight: float,
    orient: bool = True,
    tolerance: int = 2,
    sizes: dict = None,
) -> str:
    """
    Determine the page format based on the given width and height.

    Both dimensions are rounded to whole units and normalised to (short side, long side).
    The closest reference format is the one with the smallest sum of absolute
    differences on both sides.

    :param iwidth: float
        Input width.
    :param iheight: float
        Input height.
    :param orient: bool, optional
        Append the orientation suffix ("-P" portrait, "-L" landscape) to the
        format name (default is True).
    :param tolerance: int, optional
        Largest distance that still counts as an exact match (default is 2).
    :param sizes: dict, optional
        Mapping of format name to [short side, long side]. Defaults to the
        module-level `PaperSizes` table.
    :return: str
        Returns the format name when the page is within `tolerance` of it, otherwise
        "<short>x<long> ~<name>(<reference size>)" for the approximately closest format.
    :raises ValueError:
        If the table of sizes is empty.
    """
    if sizes is None:
        sizes = PaperSizes
    if not sizes:
        raise ValueError("no paper sizes to compare against")

    width = int(round(iwidth, 0))
    height = int(round(iheight, 0))

    # Orientation is decided on the raw values; a square page counts as portrait.
    is_portrait = iwidth <= iheight
    short_side, long_side = (width, height) if is_portrait else (height, width)

    best_key = None
    best_distance = None
    for key, ref in sizes.items():
        distance = abs(short_side - ref[0]) + abs(long_side - ref[1])
        # `<=` keeps the historical tie-break: among equally close formats
        # the one defined later in the table wins.
        if best_distance is None or distance <= best_distance:
            best_key, best_distance = key, distance

    suffix = ("-P" if is_portrait else "-L") if orient else ""
    label = best_key + suffix

    if best_distance <= tolerance:
        return label

    # Show the reference size in the same orientation as the page itself
    ref = sizes[best_key]
    ref_dims = f"{ref[0]}x{ref[1]}" if is_portrait else f"{ref[1]}x{ref[0]}"
    return f"{short_side}x{long_side} ~{label}({ref_dims})"

<IPython.core.display.Javascript object>

In [7]:
def get_format_info(pages: list) -> dict:
    """
    Count how many pages there are of each format.

    :param pages: list
        A list of PDF pages.
    :return: dict
        Returns a dictionary with the page format as key and the number of
        pages of that format as value.
    """
    counts = {}
    for page in pages:
        box = page.mediabox
        # Orientation is ignored so portrait and landscape share one counter
        label = find_fmt(box.width, box.height, False)
        counts[label] = counts.get(label, 0) + 1
    return counts

<IPython.core.display.Javascript object>

In [8]:
def draw_format_info_tab(format_info: dict):
    """
    Print a two-column table of page formats and their page counts.

    :param format_info: dict
        A dictionary with the page format as key and the number of pages as value.
    """
    # Header line followed by a rule; both columns are right-aligned
    header_args = ("Format", "Count", "-" * 27)
    print("{:>27} {:>9}\n{}  --------".format(*header_args))

    # Rows are listed in ascending order of the format name
    for fmt in sorted(format_info):
        cnt = format_info[fmt]
        print(f"{fmt:>27} {cnt:>9}")

<IPython.core.display.Javascript object>

In [9]:
def subwrite_limit_fmt_file(
    fmt: str, pages: list, start: int, stop: int, i: int, meta: dict
):
    """
    Write one indexed part of a single-format PDF that is split into several
    files because of a page limit.
    Subfunction of write_fmt_file()

    :param fmt: str
        A page format as string value; becomes part of the output filename.
    :param pages: list
        A list of PDF pages.
    :param start: int
        Starting index of the slice of `pages` to write.
    :param stop: int
        Ending index (exclusive) of the slice of `pages` to write.
    :param i: int
        Index of the current part, appended to the filename.
    :param meta: dict
        Dictionary with metadata for the output PDF file.
    """
    # The output lands in the current working directory
    out_name = f"current_dir_{fmt}_pdf-{i}.pdf"

    part_writer = PdfWriter()
    for page in pages[start:stop]:
        part_writer.add_page(page)
    part_writer.add_metadata(meta)

    with open(out_name, "wb") as out_file:
        part_writer.write(out_file)
    part_writer.close()

<IPython.core.display.Javascript object>

In [10]:
def write_fmt_file(fmt: str, pages: list, limit: int = 0):
    """
    Write a PDF file containing only the pages of one size (format).
    If `limit` is given and the format has more pages than that, the output is
    split by subwrite_limit_fmt_file() into indexed files of at most `limit`
    pages each.

    :param fmt: str
        A page format as string value, as returned by find_fmt() without orientation.
    :param pages: list
        A list of PDF pages.
    :param limit: int, optional
        Maximum number of pages allowed per output file; 0 (default) means no limit.
    """
    writer = PdfWriter()
    for pg in pages:
        if find_fmt(pg.mediabox.width, pg.mediabox.height, False) == fmt:
            writer.add_page(pg)

    # Add the metadata
    metadata = {
        "/Creator": "PDFSort",
        "/Producer": "PDFSort",
    }
    writer.add_metadata(metadata)

    total: int = len(writer.pages)
    if 0 < limit < total:
        # Step through the pages in chunks of `limit`. Using range() instead of
        # `total // limit + 1` avoids writing an extra, empty file when the
        # page count is an exact multiple of `limit`.
        for i, start in enumerate(range(0, total, limit)):
            subwrite_limit_fmt_file(
                fmt, writer.pages, start, min(start + limit, total), i, metadata
            )
    else:
        # Save the new PDF to a single file
        with open(f"current_dir_{fmt}_pdf.pdf", "wb") as f:
            writer.write(f)
    writer.close()

<IPython.core.display.Javascript object>

In [11]:
# Build the absolute path of the sample PDF file (nothing is opened or read here)
pdf_src = os.path.join(os.path.abspath("../"), "шаблон/Binder1.pdf")

<IPython.core.display.Javascript object>

In [12]:
# Detect the page formats of the file above, count them and print the summary table
src_pages = collect_pdf_content([pdf_src])
format_info = get_format_info(src_pages)
draw_format_info_tab(format_info)

                     Format     Count
---------------------------  --------
                         A0         3
                         A1         1
                         A2         1
                         A4         3


<IPython.core.display.Javascript object>

In [13]:
# Write one output file (or a group of indexed files) for each page format
# in the `format_info` dictionary.
PAGES_PER_FILE = 2  # maximum number of pages per output file

# Read the source PDF once: the page list is the same for every format,
# so re-parsing the file inside the loop is wasted work.
pages_to_sort = collect_pdf_content([pdf_src])
for fmt in format_info:
    print(fmt)
    write_fmt_file(fmt, pages_to_sort, PAGES_PER_FILE)

A4
A2
A0
A1


<IPython.core.display.Javascript object>

In [14]:
# Same summary as above, but over every PDF file found recursively under "../../"
all_pdf_paths = list_files_recursive("../../")
draw_format_info_tab(get_format_info(collect_pdf_content(all_pdf_paths)))

                     Format     Count
---------------------------  --------
       128x128 ~B10(88x125)         3
         72x72 ~B10(88x125)        30
                         A0         6
                         A1         2
                         A2         2
                         A4         6
                     Letter        27


<IPython.core.display.Javascript object>

In [15]:
# Number of PDF files found by the recursive search under "../../"
len(list_files_recursive("../../"))

45

<IPython.core.display.Javascript object>

In [16]:
# Install a pip package in the current Jupyter kernel
# import sys

# !{sys.executable} -m pip install -U PyPDF2

<IPython.core.display.Javascript object>