<a href="https://colab.research.google.com/github/drfperez/utilities/blob/main/PDFcompressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Mount Google Drive at /content/drive so the MyDrive folders used by the
# later cells (pdf_input / pdf_output) are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Root of the mounted Drive; both working folders sit directly beneath it.
BASE_DIR = "/content/drive/MyDrive"

# INPUT_DIR is where PDFs are read from, OUTPUT_DIR is where results are written.
INPUT_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, "pdf_input"),
    os.path.join(BASE_DIR, "pdf_output"),
)

# exist_ok=True makes re-running this cell harmless when the folders already exist.
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the resolved paths so the user can see where to upload files.
print("Input folder :", INPUT_DIR)
print("Output folder:", OUTPUT_DIR)

Input folder : /content/drive/MyDrive/pdf_input
Output folder: /content/drive/MyDrive/pdf_output


In [None]:
# List the PDFs currently in the input folder (extension match is case-insensitive).
# Sorted because os.listdir returns names in arbitrary order.
pdfs = sorted(f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".pdf"))
# The "none yet" marker had been mis-encoded (UTF-8 bytes read as Mac Roman);
# the intended cross-mark emoji is restored here.
print("PDFs found:", pdfs if pdfs else "❌ none yet")

PDFs found: ❌ none yet


In [None]:
# Optional: re-list the input folder to check whether newly uploaded PDFs are
# visible yet. This cell is self-contained (it re-imports os and re-defines
# INPUT_DIR), so it can be re-run on its own.
# NOTE(review): listing alone does not force a Drive refresh; the following
# cell remounts with force_remount=True — confirm which one is intended.
import os
INPUT_DIR = "/content/drive/MyDrive/pdf_input"

# List files whose name ends in ".pdf" (case-insensitive)
pdfs = [f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".pdf")]
print("Detected PDFs:", pdfs)

Detected PDFs: ['openair_book_complete.pdf']


In [None]:
# Remount Drive at the same mount point with force_remount=True, i.e. even if
# it is already mounted — presumably to pick up files uploaded after the
# first mount (TODO confirm this is still needed).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:

# ------------------ INSTALL ------------------
# Install Ghostscript, which provides the `gs` binary that compress() invokes.
# Only the install step's stdout is redirected to /dev/null, so apt warnings
# written to stderr still show up in the cell output.
!apt-get update -qq && apt-get install -y -qq ghostscript > /dev/null

# ------------------ IMPORTS ------------------
# NOTE(review): `time` is imported but not referenced by the code in this cell.
from google.colab import drive
import os, subprocess, time

# ------------------ MOUNT DRIVE ------------------
drive.mount('/content/drive')

# ------------------ FOLDERS ------------------
# PDFs are read from INPUT_DIR; every compressed variant is written to OUTPUT_DIR.
INPUT_DIR  = "/content/drive/MyDrive/pdf_input"
OUTPUT_DIR = "/content/drive/MyDrive/pdf_output"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------ USER SETTINGS ------------------
START_DPI = 150        # First image resolution compress_max() tries (before 150, 96, 72, 40)
TARGET_MB = None        # None: try every combination and keep the smallest; a number (MB): stop at the first output at or below it
USE_GRAYSCALE = True    # True: grayscale variants are tried before colour ones; False: colour only

# ------------------ HELPER FUNCTIONS ------------------
def compress(in_pdf, out_pdf, preset, dpi=None, gray=False):
    """Re-encode one PDF with a single Ghostscript ``pdfwrite`` run.

    Args:
        in_pdf: Path of the PDF to read.
        out_pdf: Path the rewritten PDF is saved to.
        preset: Value passed to ``-dPDFSETTINGS`` (for example ``"/screen"``).
        dpi: When truthy, used as the colour, gray and mono image resolution;
            when falsy, no resolution switches are passed.
        gray: When true, the gray colour-conversion switches are added.

    Raises:
        subprocess.CalledProcessError: if ``gs`` exits with a non-zero status
            (the call uses ``check=True``).
    """
    resolution_switches = []
    if dpi:
        resolution_switches = [
            f"-dColorImageResolution={dpi}",
            f"-dGrayImageResolution={dpi}",
            f"-dMonoImageResolution={dpi}",
        ]

    gray_switches = (
        [
            "-sColorConversionStrategy=Gray",
            "-dProcessColorModel=/DeviceGray",
            "-sColorConversionStrategyForImages=Gray",
        ]
        if gray
        else []
    )

    # All switches come first; the output file and the input file are last.
    gs_args = [
        "gs",
        "-sDEVICE=pdfwrite",
        "-dCompatibilityLevel=1.4",
        f"-dPDFSETTINGS={preset}",
        "-dNOPAUSE",
        "-dQUIET",
        "-dBATCH",
        *resolution_switches,
        *gray_switches,
        f"-sOutputFile={out_pdf}",
        in_pdf,
    ]
    subprocess.run(gs_args, check=True)

def compress_max(in_pdf, base_name):
    """Try preset / DPI / colour-mode combinations and report the smallest output.

    For each combination, ``compress()`` writes
    ``<base_name>_<preset>_<dpi>_<gray|color>.pdf`` into ``OUTPUT_DIR``.
    Every variant is left on disk; only the smallest one is reported.

    Args:
        in_pdf: Path of the PDF to compress.
        base_name: Stem used to build the output file names.

    Returns:
        Tuple ``(out_file, size_in_bytes, preset, dpi, gray)`` describing the
        smallest file produced. When ``TARGET_MB`` is set, the search stops at
        the first output whose size is at or below the target.

    Raises:
        subprocess.CalledProcessError: propagated from ``compress()`` when a
            Ghostscript run fails.
    """
    qualities = ["/screen", "/ebook", "/printer", "/prepress"]
    # START_DPI may equal one of the fixed fallbacks (the default 150 does).
    # dict.fromkeys drops the repeat while keeping order, so the same output
    # file is not regenerated by an identical Ghostscript run.
    dpis = list(dict.fromkeys([START_DPI, 150, 96, 72, 40]))
    gray_options = [True, False] if USE_GRAYSCALE else [False]
    target_bytes = TARGET_MB * 1024 * 1024 if TARGET_MB else None
    best = None

    for gray in gray_options:
        for preset in qualities:
            for dpi in dpis:
                out_file = os.path.join(OUTPUT_DIR,
                    f"{base_name}_{preset[1:]}_{dpi}_{'gray' if gray else 'color'}.pdf")
                compress(in_pdf, out_file, preset, dpi, gray)
                size = os.path.getsize(out_file)
                if best is None or size < best[1]:
                    best = (out_file, size, preset, dpi, gray)
                # Early exit: the first result that meets the target is also
                # the smallest so far, since no earlier one met it.
                if target_bytes and size <= target_bytes:
                    return best
    return best

# ------------------ MAIN ------------------
# Collect the input PDFs (case-insensitive extension match). Sorted so files
# are processed in a stable order; os.listdir order is arbitrary.
pdfs = sorted(f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".pdf"))
if not pdfs:
    raise RuntimeError(f"No PDFs found in {INPUT_DIR}. Upload PDF(s) and rerun.")

for pdf in pdfs:
    input_pdf = os.path.join(INPUT_DIR, pdf)
    base_name = os.path.splitext(pdf)[0]
    print(f"\nProcessing: {pdf}")
    # compress_max() runs the whole preset/DPI/colour search for this file.
    result = compress_max(input_pdf, base_name)
    out_pdf, size, preset, dpi, gray = result
    # The status marker had been mis-encoded (UTF-8 read as Mac Roman); the
    # intended check-mark emoji is restored.
    print(f"✅ Done: {out_pdf}")
    print(f"Preset: {preset}, DPI: {dpi}, Grayscale: {gray}")
    print(f"Original size: {round(os.path.getsize(input_pdf)/1024/1024,2)} MB")
    print(f"Compressed size: {round(size/1024/1024,2)} MB")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Processing: openair_book_complete.pdf


KeyboardInterrupt: 

In [None]:

# ------------------ INSTALL DEPENDENCIES ------------------
# Ghostscript supplies the `gs` binary used by compress_pdf(); pikepdf is used
# by merge_pdfs() / split_pdf(); ipywidgets provides the UI controls below.
# NOTE(review): packages are unpinned, and `%pip` (rather than `!pip`) is the
# form that targets the running kernel's environment — consider switching.
!apt-get update -qq && apt-get install -y -qq ghostscript > /dev/null
!pip install -q pikepdf ipywidgets

# ------------------ IMPORTS ------------------
# NOTE(review): `HTML` is imported but not referenced anywhere in this cell.
from google.colab import drive
import os, subprocess, time
import pikepdf
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# ------------------ MOUNT DRIVE ------------------
drive.mount('/content/drive')

# ------------------ FOLDERS ------------------
# PDFs are read from INPUT_DIR; every result is written to OUTPUT_DIR.
BASE_DIR = "/content/drive/MyDrive"
INPUT_DIR = os.path.join(BASE_DIR, "pdf_input")
OUTPUT_DIR = os.path.join(BASE_DIR, "pdf_output")
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------ USER SETTINGS ------------------
# Compression settings (read by compress_pdf() each time it is called)
START_DPI = 150        # First image resolution tried (before 150, 96, 72, 40)
USE_GRAYSCALE = True   # True: grayscale variants are tried before colour ones; False: colour only
TARGET_MB = None       # None: try every combination and keep the smallest; a number (MB): stop at the first output at or below it

# ------------------ HELPER FUNCTIONS ------------------
def compress_pdf(input_pdf, base_name):
    """Try preset / DPI / colour-mode combinations and report the smallest output.

    Each combination is one Ghostscript ``pdfwrite`` run that writes
    ``<base_name>_<preset>_<dpi>_<gray|color>.pdf`` into ``OUTPUT_DIR``.
    Every variant is left on disk; only the smallest one is reported.

    Args:
        input_pdf: Path of the PDF to compress.
        base_name: Stem used to build the output file names.

    Returns:
        Tuple ``(out_file, size_in_bytes, preset, dpi, gray)`` describing the
        smallest file produced. When ``TARGET_MB`` is set, the search stops at
        the first output whose size is at or below the target.

    Raises:
        subprocess.CalledProcessError: if a ``gs`` run exits with a non-zero
            status.
    """
    qualities = ["/screen", "/ebook", "/printer", "/prepress"]
    # START_DPI may equal one of the fixed fallbacks (the default 150 does).
    # dict.fromkeys drops the repeat while keeping order, so the same output
    # file is not regenerated by an identical Ghostscript run.
    dpis = list(dict.fromkeys([START_DPI, 150, 96, 72, 40]))
    gray_options = [True, False] if USE_GRAYSCALE else [False]
    target_bytes = TARGET_MB * 1024 * 1024 if TARGET_MB else None
    best = None

    for gray in gray_options:
        for preset in qualities:
            for dpi in dpis:
                out_file = os.path.join(OUTPUT_DIR,
                    f"{base_name}_{preset[1:]}_{dpi}_{'gray' if gray else 'color'}.pdf")
                cmd = [
                    "gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4",
                    f"-dPDFSETTINGS={preset}", "-dNOPAUSE", "-dQUIET", "-dBATCH",
                    f"-dColorImageResolution={dpi}",
                    f"-dGrayImageResolution={dpi}",
                    f"-dMonoImageResolution={dpi}",
                ]
                if gray:
                    cmd += ["-sColorConversionStrategy=Gray",
                            "-dProcessColorModel=/DeviceGray",
                            "-sColorConversionStrategyForImages=Gray"]
                # Ghostscript applies a switch only to input files named AFTER
                # it on the command line. The gray switches were previously
                # appended after the input file and therefore had no effect,
                # so the output file and input path must come last.
                cmd += [f"-sOutputFile={out_file}", input_pdf]
                subprocess.run(cmd, check=True)
                size = os.path.getsize(out_file)
                if best is None or size < best[1]:
                    best = (out_file, size, preset, dpi, gray)
                # Early exit: the first result that meets the target is also
                # the smallest so far, since no earlier one met it.
                if target_bytes and size <= target_bytes:
                    return best
    return best

def merge_pdfs(pdf_list, out_name):
    """Concatenate the pages of several PDFs into one new file.

    Args:
        pdf_list: Paths of the source PDFs; their pages are appended in this order.
        out_name: Path the combined PDF is saved to.
    """
    combined = pikepdf.Pdf.new()
    for source_path in pdf_list:
        # Each source is opened only for as long as its pages are being appended.
        with pikepdf.open(source_path) as source:
            combined.pages.extend(source.pages)
    combined.save(out_name)

def split_pdf(input_pdf, pages=None):
    """Write pages of ``input_pdf`` into new PDF files under ``OUTPUT_DIR``.

    pages=None    -> every page is saved as its own file ``<stem>_page_<n>.pdf``
                     (n counts from 1).
    pages=[0,2,3] -> the listed 0-based pages are saved, in the given order,
                     into a single file ``<stem>_extract.pdf``.

    ``<stem>`` is the input file name without directory or extension.
    """
    with pikepdf.open(input_pdf) as source:
        stem = os.path.splitext(os.path.basename(input_pdf))[0]

        if pages is not None:
            # Extraction mode: one output holding just the requested pages.
            target_path = os.path.join(OUTPUT_DIR,
                                       f"{stem}_extract.pdf")
            selection = pikepdf.Pdf.new()
            for index in pages:
                selection.pages.append(source.pages[index])
            selection.save(target_path)
            return

        # Split mode: one single-page output per page of the source.
        for number, page in enumerate(source.pages, start=1):
            target_path = os.path.join(OUTPUT_DIR,
                                       f"{stem}_page_{number}.pdf")
            one_page = pikepdf.Pdf.new()
            one_page.pages.append(page)
            one_page.save(target_path)

def parse_ranges(ranges_str, max_pages):
    """Parse a 1-based page spec such as "1-3,5" into 0-based page indices.

    The spec is a comma-separated list of single pages ("5") and inclusive
    ranges ("1-3"). Whitespace around items is ignored, and empty items
    (empty input, trailing or doubled commas) are skipped. Order and
    duplicates are preserved, so "3,1,2" yields [2, 0, 1].

    Args:
        ranges_str: The page spec entered by the user.
        max_pages: Number of pages in the document; indices outside
            ``0 <= index < max_pages`` are dropped from the result.

    Returns:
        List of 0-based page indices.

    Raises:
        ValueError: if an item is not an integer or an "a-b" pair of integers.
    """
    pages = []
    for part in ranges_str.split(","):
        part = part.strip()
        if not part:
            # Tolerate "" and stray commas instead of failing in int("").
            continue
        if "-" in part:
            first, _, last = part.partition("-")
            # Clip the range before materialising it so an input such as
            # "1-1000000000" cannot build a huge throw-away list; the result
            # is the same as filtering afterwards.
            start = max(int(first) - 1, 0)
            stop = min(int(last), max_pages)
            pages.extend(range(start, stop))
        else:
            pages.append(int(part) - 1)
    return [p for p in pages if 0 <= p < max_pages]

# ------------------ WIDGETS ------------------
# Each dropdown option is a (label shown to the user, value) pair; on_run()
# dispatches on the value via action_dropdown.value.
# NOTE(review): "split_ranges" and "extract" are handled identically by
# on_run() (same parse_ranges + split_pdf calls); only the printed message differs.
action_dropdown = widgets.Dropdown(
    options=[
        ("Compress PDF", "compress"),
        ("Merge PDFs", "merge"),
        ("Split PDF (all pages)", "split_all"),
        ("Split PDF (page ranges)", "split_ranges"),
        ("Extract / reorder pages", "extract")
    ],
    description="Action:"
)
# Free-text page spec; only read by the range/extract actions.
ranges_text = widgets.Text(value="1-3,5", description="Pages / ranges:")
run_button = widgets.Button(description="Run", button_style="success")
# Output area that on_run() clears and prints into.
out = widgets.Output()

display(action_dropdown, ranges_text, run_button, out)

def on_run(b):
    """Run-button callback: perform the action selected in the dropdown.

    Reads the PDFs in ``INPUT_DIR`` and writes results to ``OUTPUT_DIR``.
    "compress" and "merge" use every PDF found; the split / extract actions
    use only the first PDF in sorted order. All messages go to the ``out``
    widget.

    Args:
        b: The clicked button, passed in by ipywidgets; not used.
    """
    with out:
        clear_output(wait=True)
        # Sorted so "the first PDF" and the merge order are deterministic;
        # os.listdir returns names in arbitrary order.
        pdf_files = sorted(
            os.path.join(INPUT_DIR, f)
            for f in os.listdir(INPUT_DIR)
            if f.lower().endswith(".pdf")
        )
        if not pdf_files:
            print("❌ No PDFs in pdf_input folder.")
            return

        action = action_dropdown.value

        if action == "compress":
            for pdf in pdf_files:
                base = os.path.splitext(os.path.basename(pdf))[0]
                result = compress_pdf(pdf, base)
                out_file, size, preset, dpi, gray = result
                print(f"✅ Compressed: {out_file}")
                print(f"Preset={preset}, DPI={dpi}, Grayscale={gray}, Size={round(size/1024/1024,2)} MB")

        elif action == "merge":
            # Timestamp keeps successive merges from overwriting each other.
            ts = int(time.time())
            out_name = os.path.join(OUTPUT_DIR, f"merged_{ts}.pdf")
            merge_pdfs(pdf_files, out_name)
            print(f"✅ Merged PDF saved as: {out_name}")

        elif action == "split_all":
            split_pdf(pdf_files[0])
            print(f"✅ Split every page from {pdf_files[0]} into separate PDFs in output folder")

        elif action in ("split_ranges", "extract"):
            first_pdf = pdf_files[0]
            # Clip against the real page count. The previous hard-coded 1000
            # let out-of-range pages through to split_pdf(), which then
            # failed when indexing the document.
            with pikepdf.open(first_pdf) as src:
                page_count = len(src.pages)
            try:
                pages = parse_ranges(ranges_text.value, page_count)
            except ValueError:
                print(f"❌ Could not parse pages / ranges: {ranges_text.value!r} (use a form like 1-3,5,7-9)")
                return
            if not pages:
                # Avoid writing an empty PDF when nothing was selected.
                print(f"❌ No pages selected by {ranges_text.value!r}; {first_pdf} has {page_count} page(s).")
                return
            split_pdf(first_pdf, pages)
            if action == "split_ranges":
                print(f"✅ Extracted ranges {ranges_text.value} from {first_pdf}")
            else:
                print(f"✅ Extracted/reordered pages {ranges_text.value} from {first_pdf}")

# Register on_run as the click handler for the Run button.
run_button.on_click(on_run)

# Usage notes printed under the widgets. The heading's pin emoji had been
# mis-encoded (UTF-8 bytes read as Mac Roman) and is restored here.
print("📌 Instructions:")
print("1) Put PDFs in pdf_input folder.")
print("2) Select an action from dropdown.")
print("3) If using ranges/extract, enter pages like 1-3,5,7-9.")
print("4) Click Run. Output saved in pdf_output folder.")

In [None]:
# ------------------ FAST PDF COMPRESSOR ------------------
# Single-pass variant: one Ghostscript run per PDF with the fixed settings
# below, instead of searching over presets and resolutions.
# Install Ghostscript (provides the `gs` binary); only the install step's
# stdout is discarded, so apt warnings on stderr still appear in the output.
!apt-get update -qq && apt-get install -y -qq ghostscript > /dev/null
from google.colab import drive
import os, subprocess

# Mount Drive
drive.mount('/content/drive')

# PDFs are read from INPUT_DIR; each result is written to OUTPUT_DIR.
INPUT_DIR  = "/content/drive/MyDrive/pdf_input"
OUTPUT_DIR = "/content/drive/MyDrive/pdf_output"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Settings
DPI = 96             # Fast, decent quality; passed as the colour, gray and mono image resolution
PRESET = "/screen"   # Fastest compression; passed to gs as -dPDFSETTINGS
USE_GRAYSCALE = False # Set True for scanned PDFs; adds the gray colour-conversion switches

# Process all PDFs in the input folder (case-insensitive extension match).
# Sorted so files are processed in a stable order; os.listdir order is arbitrary.
pdfs = sorted(f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".pdf"))
if not pdfs:
    raise RuntimeError(f"No PDFs found in {INPUT_DIR}")

for pdf in pdfs:
    input_pdf = os.path.join(INPUT_DIR, pdf)
    base = os.path.splitext(pdf)[0]
    out_pdf = os.path.join(OUTPUT_DIR, f"{base}_compressed.pdf")

    cmd = [
        "gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4",
        f"-dPDFSETTINGS={PRESET}", "-dNOPAUSE", "-dQUIET", "-dBATCH",
        f"-dColorImageResolution={DPI}",
        f"-dGrayImageResolution={DPI}",
        f"-dMonoImageResolution={DPI}",
    ]
    if USE_GRAYSCALE:
        cmd += ["-sColorConversionStrategy=Gray",
                "-dProcessColorModel=/DeviceGray",
                "-sColorConversionStrategyForImages=Gray"]
    # Ghostscript applies a switch only to input files named AFTER it on the
    # command line. The gray switches were previously appended after the
    # input file and therefore had no effect, so the output file and input
    # path must come last.
    cmd += [f"-sOutputFile={out_pdf}", input_pdf]

    # check=True raises CalledProcessError if gs fails, stopping the loop.
    subprocess.run(cmd, check=True)
    # The status marker had been mis-encoded (UTF-8 read as Mac Roman); the
    # intended check-mark emoji is restored.
    print(f"✅ Compressed: {out_pdf}")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Compressed: /content/drive/MyDrive/pdf_output/openair_book_complete_compressed.pdf
