In [1]:
import os
import re
import unicodedata

from PyPDF2 import PdfReader, PdfWriter


In [2]:
def save_new_pdf(pdf_writer, arrest_number, output_dir="output_pdf"):
    """Write the pages held by ``pdf_writer`` to ``<output_dir>/arrest_number_<n>.pdf``.

    Args:
        pdf_writer: Object exposing ``write(binary_stream)``; the callers in
            this notebook pass a ``PdfWriter``.
        arrest_number: Identifier interpolated into the output file name.
        output_dir: Directory that receives the file. Created (including any
            missing parents) when it does not exist yet. Defaults to
            ``"output_pdf"`` relative to the current working directory.

    Returns:
        None. The saved path is reported on stdout.

    Note:
        An existing file with the same name is overwritten.
    """
    # exist_ok=True replaces the former exists()-then-makedirs() pair, which
    # could raise if the directory appeared between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    # Create the output file name based on the arrest number
    new_filename = f"{output_dir}/arrest_number_{arrest_number}.pdf"

    # Save the pages to the new PDF
    with open(new_filename, "wb") as output_pdf:
        pdf_writer.write(output_pdf)

    print(f"Saved: {new_filename}")

def extract_arrest_number(text):
    """Return the digits of the first "Arrêt n° <digits>" heading in ``text``.

    Args:
        text: Page text to search. ``None`` or an empty string is accepted.

    Returns:
        The number as a string exactly as written (leading zeros preserved),
        or ``None`` when no heading is found.
    """
    if not text:
        return None

    # Extracted PDF text may carry "ê" as "e" + combining circumflex; NFC
    # folds that into the single code point the pattern below expects.
    normalized = unicodedata.normalize("NFC", text)

    # Accept the degree sign (°) and the look-alike masculine ordinal (º),
    # with optional whitespace around "n"/"N".
    match = re.search(r'Arrêt\s*[nN][°º]\s*(\d+)', normalized)
    if match:
        return match.group(1)  # Return the number part
    return None

def split_pdf_by_arret(pdf_to_split):
    """Split a PDF into one file per "Arrêt n° <number>" section.

    Pages are read in order. A page on which ``extract_arrest_number`` finds a
    number different from the current one starts a new section; the section
    collected so far is handed to ``save_new_pdf``. Pages without a heading,
    or repeating the current number, are appended to the current section.

    Args:
        pdf_to_split: Path or stream passed straight to ``PdfReader``.

    Returns:
        None. Each section is written through ``save_new_pdf``.

    Notes:
        * Pages that precede the first detected heading are not written.
        * A page repeating the current number no longer starts a new section;
          doing so would save to the same file name again and discard the
          pages collected earlier for that number.
    """
    reader = PdfReader(pdf_to_split)
    pdf_writer = None  # Writer for the section being collected
    arrest_number = None  # Number of the section being collected

    for page in reader.pages:
        # Treat a missing extraction result as "no heading on this page".
        text = page.extract_text() or ""

        # Single source of truth for heading detection and number parsing.
        found_number = extract_arrest_number(text)

        if found_number is not None and found_number != arrest_number:
            # Flush the previous section before starting the next one.
            if pdf_writer is not None:
                save_new_pdf(pdf_writer, arrest_number)

            arrest_number = found_number
            pdf_writer = PdfWriter()

        # Add the current page to the current section's writer
        if pdf_writer is not None:
            pdf_writer.add_page(page)

    # Save the last section if any heading was found at all
    if pdf_writer is not None:
        save_new_pdf(pdf_writer, arrest_number)


In [4]:
# Set the path to your PDF file
# NOTE(review): hard-coded absolute Windows path; this cell only runs where
# the file exists at exactly this location -- adjust before re-running.
pdf_path = "D:/all-arrest-2009.docx.pdf"
# Writes one PDF per detected "Arrêt n°" section (by default under output_pdf/)
# and prints a "Saved: ..." line for each file.
split_pdf_by_arret(pdf_path)
print("Done")

Saved: output_pdf/arrest_number_01.pdf
Saved: output_pdf/arrest_number_02.pdf
Saved: output_pdf/arrest_number_04.pdf
Saved: output_pdf/arrest_number_05.pdf
Saved: output_pdf/arrest_number_06.pdf
Saved: output_pdf/arrest_number_07.pdf
Saved: output_pdf/arrest_number_08.pdf
Saved: output_pdf/arrest_number_14.pdf
Saved: output_pdf/arrest_number_18.pdf
Saved: output_pdf/arrest_number_27.pdf
Saved: output_pdf/arrest_number_29.pdf
Saved: output_pdf/arrest_number_30.pdf
Saved: output_pdf/arrest_number_35.pdf
Saved: output_pdf/arrest_number_37.pdf
Saved: output_pdf/arrest_number_41.pdf
Saved: output_pdf/arrest_number_42.pdf
Saved: output_pdf/arrest_number_43.pdf
Saved: output_pdf/arrest_number_44.pdf
Saved: output_pdf/arrest_number_45.pdf
Saved: output_pdf/arrest_number_46.pdf
Saved: output_pdf/arrest_number_57.pdf
Saved: output_pdf/arrest_number_66.pdf
Saved: output_pdf/arrest_number_67.pdf
Saved: output_pdf/arrest_number_68.pdf
Saved: output_pdf/arrest_number_69.pdf
Saved: output_pdf/arrest_