In [None]:
from pathlib import Path
from typing import List, Union
from pypdf import PdfReader, PdfWriter
from loguru import logger # Or use print if logger is not configured in the notebook


def parse_page_ranges(pages: str) -> List[int]:
    """
    Turns a 1-based page specification into sorted, de-duplicated 0-based indices.

    The specification is a comma-separated list whose items are either a single
    page number ("7") or an inclusive range ("3-5"). Empty items are ignored.
    Items that cannot be parsed, that reference pages <= 0, or whose range start
    is greater than its end are skipped with a warning.

    Args:
        pages (str): A string representing page ranges, e.g. "1,3-5,7".
    Returns:
        list[int]: Ascending 0-based page indices without duplicates. An empty
            list is returned (with a warning) when `pages` is not a string.
    """
    if not isinstance(pages, str):
        logger.warning(f"Invalid input type for pages: {type(pages)}. Expected string.")
        return []

    # A set de-duplicates as we go; ordering is applied once at the end.
    collected = set()
    tokens = (chunk.strip() for chunk in pages.split(","))

    for token in filter(None, tokens):
        if "-" not in token:
            # Single page number.
            try:
                number = int(token)
            except ValueError:
                logger.warning(f"Skipping invalid page number format: '{token}'")
                continue
            if number <= 0:
                logger.warning(f"Skipping invalid page number (<= 0): '{token}'")
                continue
            collected.add(number - 1)
            continue

        # Inclusive range "start-end"; only the first dash separates the bounds,
        # so a negative end such as "1--3" is parsed and then rejected below.
        low_text, high_text = token.split("-", 1)
        try:
            low, high = int(low_text), int(high_text)
        except ValueError:
            logger.warning(f"Skipping invalid page range format: '{token}'")
            continue
        if not 0 < low <= high:
            logger.warning(f"Skipping invalid page range (start/end <= 0 or start > end): '{token}'")
            continue
        # 1-based inclusive [low, high] maps to 0-based half-open [low - 1, high).
        collected.update(range(low - 1, high))

    return sorted(collected)

def extract_pdf_pages(
    input_pdf_path: Union[str, Path],
    output_pdf_path: Union[str, Path],
    pages_to_extract: Union[str, List[int]],
) -> bool:
    """
    Extracts specific pages from a PDF file and saves them to a new PDF file.

    Pages are written in ascending order with duplicates removed. Requested pages
    that fall outside the document are skipped with a warning.

    The result is first written to a temporary sibling file (``<name>.part``) and
    only moved onto ``output_pdf_path`` once writing has succeeded. A failure
    therefore never deletes or truncates a file that already existed at the
    output path; only the temporary file is cleaned up.

    Args:
        input_pdf_path (Union[str, Path]): The path to the source PDF file.
        output_pdf_path (Union[str, Path]): The path for the output PDF file.
            Missing parent directories are created.
        pages_to_extract (Union[str, List[int]]): The pages to extract.
            Can be a list of 0-based indices (e.g., [0, 2, 3, 4, 6])
            or a string like "1,3-5,7" (1-based page numbers).

    Returns:
        bool: True if extraction was successful (at least one page extracted), False otherwise
            (input file missing, unsupported `pages_to_extract`, no valid page,
            or any error while reading or writing; errors are logged, not raised).
    """
    input_path = Path(input_pdf_path)
    output_path = Path(output_pdf_path)

    if not input_path.is_file():
        logger.error(f"Error: Input PDF not found at {input_path}")
        return False

    # Path of the temporary file while it may exist on disk; None otherwise.
    tmp_path = None
    try:
        # Validate the page selection before paying for reading the PDF.
        if isinstance(pages_to_extract, str):
            page_indices = parse_page_ranges(pages_to_extract)
        elif isinstance(pages_to_extract, list) and all(isinstance(i, int) for i in pages_to_extract):
            # Use list of indices directly (assuming 0-based), unique and sorted.
            page_indices = sorted(set(pages_to_extract))
        else:
            logger.error("pages_to_extract must be a string (e.g., '1,3-5') or a list of integers (0-based indices).")
            return False

        reader = PdfReader(str(input_path))
        writer = PdfWriter()
        total_pages = len(reader.pages)

        valid_indices_extracted = []
        for i in page_indices:
            if 0 <= i < total_pages:
                writer.add_page(reader.pages[i])
                valid_indices_extracted.append(i)
            else:
                logger.warning(
                    f"Page index {i} (page number {i+1}) is out of bounds. "
                    f"The document has {total_pages} pages (indices 0 to {total_pages-1}). Skipping."
                )

        if not valid_indices_extracted:
            # Nothing was written, so any existing file at output_path is left alone.
            logger.warning("No valid pages were specified or found to extract. Output file not created.")
            return False

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write next to the destination so the final move stays on one filesystem,
        # then swap it in; the destination is only touched after a complete write.
        tmp_path = output_path.with_name(f"{output_path.name}.part")
        with open(tmp_path, "wb") as f_out:
            writer.write(f_out)
        tmp_path.replace(output_path)
        tmp_path = None  # Moved into place: nothing left to clean up.

        logger.info(
            f"Successfully extracted {len(valid_indices_extracted)} pages "
            f"(indices: {valid_indices_extracted}) to {output_path}"
        )
        return True

    except Exception as e:
        # logger.exception records the traceback; no extra arguments are passed,
        # so the already-formatted message is not run through str.format again.
        logger.exception(f"An error occurred during PDF extraction: {e}")
        # Remove only the temporary file this call created, never the destination.
        if tmp_path is not None and tmp_path.exists():
            try:
                tmp_path.unlink()
                logger.info(f"Removed potentially incomplete output file: {tmp_path}")
            except OSError as unlink_error:
                logger.warning(f"Could not remove incomplete output file {tmp_path}: {unlink_error}")
        return False

# Section name -> page range(s) in the source PLU document
# (presumably 1-based page numbers, as used in `pages` below):
# disposition_generale 7-19, 252-269
# zones_urbaines 20-196
# zones_a_urbaniser 197-223
# zones_agricoles 224-236
# zones_naturelles_et_forestieres 237-251

# Example usage (this can live in its own notebook cell):
pdf_input = '/Users/florentlin/Documents/Code/plu/data/external/aix_les_bains/plu.pdf'
pdf_output = '/Users/florentlin/Documents/Code/plu/data/external/aix_les_bains/zones_naturelles_et_forestieres.pdf'
pages = '237-251'  # 1-based inclusive range: the zones_naturelles_et_forestieres section

success = extract_pdf_pages(pdf_input, pdf_output, pages)
print(f"Pages extracted successfully to {pdf_output}" if success else "PDF extraction failed.")