In [1]:
import os
import fitz  # PyMuPDF
import tempfile
import shutil

def get_all_pdfs(root_dir):
    """
    Walk a directory tree and gather the path of every PDF file in it.

    A file counts as a PDF when its name ends with ``.pdf``, compared
    case-insensitively.

    Args:
        root_dir (str): Directory at which the walk starts.

    Returns:
        list: One ``os.path.join(dirpath, filename)`` entry per matching
        file, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith('.pdf')
    ]


def shrink_pdf_vector(pdf_path, target_size="A4"):
    """
    Shrink oversized PDF pages onto a standard page size, in place.

    Each page that is wider or taller than the target size is drawn,
    uniformly scaled down, onto a new page of exactly the target size. The
    scaled content is centred horizontally and aligned to the top edge so
    that headers stay at the top of the page. Pages that already fit are
    copied unchanged. Pages reporting a non-positive width or height are
    reported and left out of the output. The rebuilt document is written
    to a temporary file which then replaces ``pdf_path``.

    Args:
        pdf_path (str): Path to the original PDF file; it is overwritten.
        target_size (str): 'A4' or 'Letter'

    Raises:
        ValueError: If ``target_size`` is not one of the supported sizes.
    """
    page_sizes = {
        "A4": (595.276, 841.890),
        "Letter": (612.0, 792.0),
    }

    if target_size not in page_sizes:
        raise ValueError("Unsupported page size. Choose 'A4' or 'Letter'.")

    target_width, target_height = page_sizes[target_size]

    open_docs = []
    tmp_path = None
    try:
        doc = fitz.open(pdf_path)
        open_docs.append(doc)
        # Second handle on the same file, used only as the show_pdf_page source.
        doc_source = fitz.open(pdf_path)
        open_docs.append(doc_source)
        new_doc = fitz.open()
        open_docs.append(new_doc)

        for page_number, page in enumerate(doc):
            rect = page.rect

            # Dimensions are numbers, never None; guard against degenerate
            # pages explicitly so the divisions below cannot fail.
            if rect.width <= 0 or rect.height <= 0:
                print(f"⚠️ Skipping page with invalid dimensions in {pdf_path}")
                continue

            # Uniform scale factor, capped at 1.0 so pages are never enlarged.
            scale = min(target_width / rect.width, target_height / rect.height, 1.0)

            if scale < 1.0:
                new_page = new_doc.new_page(width=target_width, height=target_height)

                scaled_width = rect.width * scale
                scaled_height = rect.height * scale
                dx = (target_width - scaled_width) / 2  # center horizontally

                # The fourth positional parameter of show_pdf_page is
                # keep_proportion, not a matrix, so placement has to be
                # expressed through the destination rectangle. Starting it at
                # y=0 aligns the content to the top to preserve header position.
                destination = fitz.Rect(dx, 0, dx + scaled_width, scaled_height)
                new_page.show_pdf_page(destination, doc_source, page_number)
            else:
                new_doc.insert_pdf(doc, from_page=page_number, to_page=page_number)

        # Reserve a temp file name; the handle is closed before saving to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_path = tmp_file.name
        new_doc.save(tmp_path)
    except BaseException:
        # Do not leave a stray temp file behind when building/saving fails.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        raise
    finally:
        # Always release the handles, and do so before the original file is
        # replaced below.
        for handle in reversed(open_docs):
            handle.close()

    shutil.move(tmp_path, pdf_path)
    print(f"✅ Vector-shrunk: {pdf_path}")



def main():
    """
    Shrink every PDF found under the configured root folder to A4.

    Each file is processed independently: a failure on one file is printed
    and the remaining files are still processed.
    """
    root_folder = r"/home/cdsw/Final extraction"  # Change as needed

    for pdf_path in get_all_pdfs(root_folder):
        try:
            shrink_pdf_vector(pdf_path, target_size="A4")
        except Exception as exc:
            # Top-level boundary: report and move on to the next file.
            print(f"❌ Failed on {pdf_path}: {exc}")


# Run the batch only when executed as a script, not on import.
if __name__ == "__main__":
    main()


✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - UNION LEVEL.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - SWITZERLAND.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - SWEDEN.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - SPAIN.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - SLOVENIA.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - SLOVAKIA.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - ROMANIA.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - PORTUGAL.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - POLAND.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - NORWAY.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - NETHERLANDS.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - MALTA.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - LUXEMBOURG.pdf
✅ Vector-shrunk: /home/cdsw/Final extraction/AMR 2024 - LITHUANIA.pdf
✅ Vector-shrunk: /home/cdsw/Final e