In [0]:
#%run ./init

In [0]:
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
from IPython.display import display

# Helper function to convert a DBFS URI to a local path
def dbfs_to_local_path(dbfs_path):
    """
    Map a DBFS URI onto its local mount-style path.

    A path that starts with "dbfs:/" has that prefix swapped for "/dbfs/"
    (e.g. "dbfs:/mnt/x" -> "/dbfs/mnt/x"); any other path is returned as-is.
    """
    scheme = "dbfs:/"
    if not dbfs_path.startswith(scheme):
        return dbfs_path
    remainder = dbfs_path[len(scheme):]
    return "/dbfs/" + remainder

def convert_pdf_to_images(pdf_path, dpi=300):
    """
    Render a PDF with pdf2image and return whatever `convert_from_path` yields.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution forwarded to `convert_from_path`.

    Returns:
        The result of `convert_from_path` (used by callers as a sequence of
        page images, one per page).
    """
    return convert_from_path(pdf_path, dpi=dpi)

def show_image_in_notebook(pil_image, title=""):
    """
    Display a PIL image inline via IPython's `display`.

    Args:
        pil_image: The image object handed to `display`.
        title: Accepted for call-site convenience but currently unused;
            nothing is rendered for it.
    """
    display(pil_image)

def detect_small_blue_boxes(
    pil_image,
    lower_blue=np.array([100, 100, 50]),
    upper_blue=np.array([130, 255, 255]),
    min_area=1000,
    max_area=200000,
    max_width=None,
    max_height=200
):
    """
    Detect distinct blue boxes (headers/cells) in a page image.

    Pipeline:
      1) HSV thresholding between `lower_blue` and `upper_blue`
      2) 2x2 morphological open to remove speckle noise
      3) 2x2 morphological close to fill tiny gaps
      4) Connected-component labelling, one bounding box per component

    Args:
        pil_image: PIL image of the page; converted to RGB internally.
        lower_blue: Lower HSV bound passed to `cv2.inRange`.
        upper_blue: Upper HSV bound passed to `cv2.inRange`.
        min_area: Smallest accepted bounding-box area (w * h).
        max_area: Largest accepted bounding-box area (w * h).
        max_width: Optional upper limit on box width; `None` disables the check.
        max_height: Upper limit on box height.

    Returns:
        boxes (list): Bounding boxes (x, y, w, h) as plain Python ints,
        sorted top-to-bottom, then left-to-right.

    Note:
        The area limits apply to the bounding-box area, not to the number of
        blue pixels inside the component.
    """
    rgb = np.array(pil_image.convert("RGB"))
    # Convert straight from RGB; going through BGR first yields the same HSV.
    hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)
    mask = cv2.inRange(hsv, lower_blue, upper_blue)

    # The same 2x2 rectangular kernel serves both morphology passes.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    # Open removes speckle noise; close then fills tiny gaps.
    mask_opened = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=1)
    mask_closed = cv2.morphologyEx(mask_opened, cv2.MORPH_CLOSE, kernel, iterations=1)

    # connectedComponentsWithStats reports each component's bounding box in a
    # single pass, avoiding a full-image scan per label.
    num_labels, _labels, stats, _centroids = cv2.connectedComponentsWithStats(mask_closed)

    boxes = []
    for label in range(1, num_labels):  # label 0 is the background
        # Cast to plain ints so callers get ordinary Python numbers.
        x = int(stats[label, cv2.CC_STAT_LEFT])
        y = int(stats[label, cv2.CC_STAT_TOP])
        w = int(stats[label, cv2.CC_STAT_WIDTH])
        h = int(stats[label, cv2.CC_STAT_HEIGHT])
        area = w * h  # bounding-box area, deliberately not the pixel count

        if area < min_area or area > max_area:
            continue
        if h > max_height:
            continue
        if max_width is not None and w > max_width:
            continue

        boxes.append((x, y, w, h))

    # Sort top-to-bottom, then left-to-right
    boxes.sort(key=lambda b: (b[1], b[0]))
    return boxes

def filter_header_boxes(boxes, target_height=31, tolerance=5):
    """
    Keep only the boxes whose height is within `tolerance` of `target_height`.

    Args:
        boxes: Iterable of (x, y, w, h) boxes.
        target_height: Expected header height.
        tolerance: Maximum allowed absolute deviation from `target_height`.

    Returns:
        List of matching (x, y, w, h) tuples ordered by y, then x.
    """
    matching = [
        (x, y, w, h)
        for (x, y, w, h) in boxes
        if abs(h - target_height) <= tolerance
    ]
    return sorted(matching, key=lambda box: (box[1], box[0]))

def crop_sections_by_headers(pil_image, headers, row_tolerance=10):
    """
    Slice the page into sections delimited by header boxes.

    Consecutive headers whose y differs by at most `row_tolerance` form one
    row. Each row spans vertically from its top (smallest header y) to the
    top of the next row, or to the page bottom for the last row. Within a
    row, each section runs from a header's x to the next header's x, or to
    the page's right edge for the last header. If the first row does not
    start at y=0, a full-width "pre-header" section is emitted first.

    Args:
        pil_image: Page image exposing `width`, `height` and `crop(box)`.
        headers: Header boxes (x, y, w, h), expected in top-to-bottom order.
        row_tolerance: Max y difference between neighbours in the same row.

    Returns:
        (sections, section_coords): the cropped images and their matching
        (left, top, right, bottom) boxes. With no headers, the whole page is
        returned as a single section.
    """
    page_w, page_h = pil_image.width, pil_image.height

    # No headers: the whole page is the only section.
    if not headers:
        return [pil_image], [(0, 0, page_w, page_h)]

    # Walk neighbouring pairs; a y jump beyond the tolerance opens a new row.
    rows = [[headers[0]]]
    for previous, current in zip(headers, headers[1:]):
        if abs(current[1] - previous[1]) <= row_tolerance:
            rows[-1].append(current)
        else:
            rows.append([current])

    # Top edge of every row, followed by the page bottom as the final edge.
    edges = [min(hdr[1] for hdr in row) for row in rows]
    edges.append(page_h)

    section_coords = []

    # Everything above the first row becomes the "pre-header" slice.
    if edges[0] > 0:
        section_coords.append((0, 0, page_w, edges[0]))

    for row, top, bottom in zip(rows, edges, edges[1:]):
        # Left edges in left-to-right order; each right edge is the next
        # header's left edge, or the page width for the last header.
        lefts = sorted(hdr[0] for hdr in row)
        rights = lefts[1:] + [page_w]
        for left, right in zip(lefts, rights):
            section_coords.append((left, top, right, bottom))

    sections = [pil_image.crop(box) for box in section_coords]
    return sections, section_coords

def crop_and_display_save_boxes(pil_image, boxes, output_folder_dbfs, page_index, prefix):
    """
    Crop every box out of the page, show it inline, and write it as a PNG.

    Args:
        pil_image: Page image the boxes refer to.
        boxes: Iterable of (x, y, w, h) boxes to crop.
        output_folder_dbfs: Output folder; created through `dbutils.fs.mkdirs`
            and translated with `dbfs_to_local_path` for the actual writes.
        page_index: Zero-based page index; file names use page_index + 1.
        prefix: Label used in log lines and, lower-cased with spaces replaced
            by underscores, in the file names.
    """
    # `dbutils` is not defined in this file; presumably supplied by the
    # Databricks notebook runtime -- TODO confirm.
    dbutils.fs.mkdirs(output_folder_dbfs)
    local_dir = dbfs_to_local_path(output_folder_dbfs)

    page_number = page_index + 1
    slug = prefix.lower().replace(' ', '_')

    for box_number, (x, y, w, h) in enumerate(boxes, start=1):
        crop_img = pil_image.crop((x, y, x + w, y + h))
        print(f"{prefix} Box {box_number} at (x={x}, y={y}, w={w}, h={h})")
        show_image_in_notebook(crop_img)

        save_path = f"{local_dir}/page_{page_number}_{slug}_box_{box_number}.png"
        crop_img.save(save_path)
        print(f"Saved {prefix} cropped image to: {save_path}\n")

def main():
    """
    Run the blue-box detection and section-cropping pipeline on every PDF page.

    For each page this:
      1) detects blue boxes and saves a debug image with the boxes outlined,
      2) keeps the header-height boxes and saves each one as its own crop,
      3) crops the page into sections based on those headers and saves them.

    Paths are hard-coded below; adjust them to your environment. Output
    folders are created once, before the page loop.
    """
    # The PDF is read through the local /dbfs mount; the output folders are
    # given as DBFS URIs for directory creation.
    pdf_path = "/dbfs/mnt/mini-proj-dd/DDR.pdf"
    output_folder_boxes_dbfs = "dbfs:/mnt/mini-proj-dd/cropped_blue_boxes"
    output_folder_sections_dbfs = "dbfs:/mnt/mini-proj-dd/cropped_sections"

    # Convert PDF pages to images
    pages = convert_pdf_to_images(pdf_path, dpi=300)

    # Creating the folders and resolving their local paths does not depend on
    # the page, so do it once here rather than on every iteration.
    # `dbutils` is not defined in this file; presumably supplied by the
    # Databricks notebook runtime -- TODO confirm.
    dbutils.fs.mkdirs(output_folder_boxes_dbfs)
    dbutils.fs.mkdirs(output_folder_sections_dbfs)
    output_folder_boxes_local = dbfs_to_local_path(output_folder_boxes_dbfs)
    output_folder_sections_local = dbfs_to_local_path(output_folder_sections_dbfs)

    for page_index, pil_image in enumerate(pages):
        print(f"\n--- Processing Page {page_index+1} ---")

        # Step 1: Detect distinct blue boxes
        boxes = detect_small_blue_boxes(
            pil_image,
            lower_blue=np.array([100, 100, 50]),
            upper_blue=np.array([130, 255, 255]),
            min_area=1000,
            max_area=200000,
            max_width=None,
            max_height=200
        )
        print(f"Detected {len(boxes)} blue boxes on Page {page_index+1}.")

        # Draw bounding boxes on a copy of the page for debugging
        debug_img = np.array(pil_image.convert("RGB"))
        for (x, y, w, h) in boxes:
            cv2.rectangle(debug_img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        debug_pil = Image.fromarray(debug_img)
        print("Displaying bounding boxes on the page:")
        show_image_in_notebook(debug_pil)

        # Save debug image
        boxes_image_path = f"{output_folder_boxes_local}/page_{page_index+1}_blue_boxes.png"
        debug_pil.save(boxes_image_path)
        print(f"Saved blue boxes image to: {boxes_image_path}\n")

        # Step 2: Keep only header-like boxes (height ~31 px)
        header_boxes = filter_header_boxes(boxes, target_height=31, tolerance=5)
        print(f"Filtered {len(header_boxes)} header boxes on Page {page_index+1}.")

        # Crop & save each header box
        crop_and_display_save_boxes(pil_image, header_boxes, output_folder_boxes_dbfs, page_index, prefix="Header")

        # Step 3: Crop sections under each header + the "pre-header" top slice
        sections, section_coords = crop_sections_by_headers(pil_image, header_boxes, row_tolerance=10)
        print(f"Cropped {len(sections)} sections from Page {page_index+1}.")

        # Display and save each section
        for i, section in enumerate(sections):
            print(f"Section {i+1} coords: {section_coords[i]}")
            show_image_in_notebook(section)
            section_filename = f"{output_folder_sections_local}/page_{page_index+1}_section_{i+1}.png"
            section.save(section_filename)
            print(f"Saved section image to: {section_filename}\n")

# Run the pipeline only when this code executes as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()


In [0]:
# import os
# import cv2
# import numpy as np
# from pdf2image import convert_from_path
# from PIL import Image
# from IPython.display import display

# def convert_pdf_to_images(pdf_path, dpi=300):
#     """
#     Convert each page of a PDF into a list of PIL Images.
#     """
#     pages = convert_from_path(pdf_path, dpi=dpi)
#     return pages

# def show_image_in_notebook(pil_image, title=""):
#     """
#     Display a PIL image inline.
#     """
#     display(pil_image)

# def detect_small_blue_boxes(
#     pil_image,
#     lower_blue=np.array([100, 100, 50]),
#     upper_blue=np.array([130, 255, 255]),
#     min_area=1000,
#     max_area=200000,
#     max_width=None,
#     max_height=200
# ):
#     """
#     Detect distinct blue boxes (headers/cells) in the image using:
#       1) HSV thresholding for blue
#       2) Small morphological open to remove noise
#       3) (Optional) minimal morphological close
#       4) Connected Components to separate side-by-side regions

#     Returns:
#         boxes (list): List of bounding boxes (x, y, w, h) for each distinct region.
#     """
#     img_bgr = np.array(pil_image.convert("RGB"))
#     img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_RGB2BGR)

#     hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
#     mask = cv2.inRange(hsv, lower_blue, upper_blue)

#     # Small morphological open to remove speckle noise
#     kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
#     mask_opened = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel_open, iterations=1)

#     # Very small morphological close (optional) to fill tiny gaps
#     kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
#     mask_closed = cv2.morphologyEx(mask_opened, cv2.MORPH_CLOSE, kernel_close, iterations=1)

#     # Connected Components
#     num_labels, labels = cv2.connectedComponents(mask_closed)

#     boxes = []
#     for label in range(1, num_labels):
#         ys, xs = np.where(labels == label)
#         x_min, x_max = np.min(xs), np.max(xs)
#         y_min, y_max = np.min(ys), np.max(ys)
#         w = x_max - x_min + 1
#         h = y_max - y_min + 1
#         area = w * h

#         if area < min_area or area > max_area:
#             continue
#         if h > max_height:
#             continue
#         if max_width is not None and w > max_width:
#             continue

#         boxes.append((x_min, y_min, w, h))

#     # Sort top-to-bottom, then left-to-right
#     boxes.sort(key=lambda b: (b[1], b[0]))
#     return boxes

# def filter_header_boxes(boxes, target_height=31, tolerance=5):
#     """
#     Filter out header boxes based on a target height.
#     """
#     header_boxes = []
#     for (x, y, w, h) in boxes:
#         if abs(h - target_height) <= tolerance:
#             header_boxes.append((x, y, w, h))
#     header_boxes.sort(key=lambda b: (b[1], b[0]))
#     return header_boxes

# def crop_sections_by_headers(pil_image, headers, row_tolerance=10):
#     """
#     Crop the page into sections based on header positions, 
#     including one extra "pre-header" section from y=0 to the first header.

#     1) Group headers by "rows" if their y-coordinates are within `row_tolerance`.
#     2) For each row, sort by x (left-to-right).
#     3) The vertical boundary is from this row's top to the next row's top
#        (or the bottom of the page for the last row).
#     4) Horizontally, we split from each header's x to the next header's x
#        (or page right edge if last in row).
#     5) If the top of the first row is > 0, we also create a "pre-header" section
#        from y=0 to y=that row's top.
#     """
#     width, height = pil_image.width, pil_image.height
#     sections = []
#     section_coords = []

#     if not headers:
#         # If no headers, return the entire page as one section
#         sections.append(pil_image)
#         section_coords.append((0, 0, width, height))
#         return sections, section_coords

#     # 1) Group headers into rows
#     rows = []
#     current_row = [headers[0]]
#     for i in range(1, len(headers)):
#         x, y, w, h = headers[i]
#         px, py, pw, ph = headers[i - 1]
#         if abs(y - py) <= row_tolerance:
#             current_row.append(headers[i])
#         else:
#             rows.append(current_row)
#             current_row = [headers[i]]
#     if current_row:
#         rows.append(current_row)

#     # For each row, figure out top boundary
#     row_tops = []
#     for row_headers in rows:
#         row_top = min(hdr[1] for hdr in row_headers)  # min y in that row
#         row_tops.append(row_top)
#     row_tops.append(height)  # last row extends to bottom

#     # --- NEW PART: "Pre-header" section from top of page down to first row ---
#     first_row_top = row_tops[0]
#     if first_row_top > 0:
#         # Crop everything from y=0 to y=first_row_top
#         pre_section = pil_image.crop((0, 0, width, first_row_top))
#         sections.append(pre_section)
#         section_coords.append((0, 0, width, first_row_top))

#     # 2) Now create sections row by row
#     for r_idx, row_headers in enumerate(rows):
#         this_row_top = row_tops[r_idx]
#         next_row_top = row_tops[r_idx + 1]

#         # Sort headers in this row left->right
#         row_headers.sort(key=lambda b: b[0])

#         # 3) Split horizontally for each header
#         for j in range(len(row_headers)):
#             x, y, w, h = row_headers[j]
#             left = x
#             if j < len(row_headers) - 1:
#                 right = row_headers[j + 1][0]
#             else:
#                 right = width

#             cropped = pil_image.crop((left, this_row_top, right, next_row_top))
#             sections.append(cropped)
#             section_coords.append((left, this_row_top, right, next_row_top))

#     return sections, section_coords

# def crop_and_display_save_boxes(pil_image, boxes, output_folder, page_index, prefix):
#     """
#     Crop each detected box from the image, display it, and save to disk.
#     """
#     if not os.path.exists(output_folder):
#         os.makedirs(output_folder)

#     for i, (x, y, w, h) in enumerate(boxes):
#         cropped = pil_image.crop((x, y, x + w, y + h))
#         print(f"{prefix} Box {i+1} at (x={x}, y={y}, w={w}, h={h})")
#         show_image_in_notebook(cropped)
#         filename = f"page_{page_index+1}_{prefix.lower().replace(' ', '_')}_box_{i+1}.png"
#         save_path = os.path.join(output_folder, filename)
#         cropped.save(save_path)
#         print(f"Saved {prefix} cropped image to: {save_path}\n")

# def main():
#     # Paths (adjust to your environment)
#     pdf_path = "/dbfs/mnt/mini-proj-dd/DDR.pdf"
#     output_folder_boxes = "/dbfs/mnt/mini-proj-dd/cropped_blue_boxes"
#     output_folder_sections = "/dbfs/mnt/mini-proj-dd/cropped_sections"
    
#     # Convert PDF pages to images
#     pages = convert_pdf_to_images(pdf_path, dpi=300)
    
#     for page_index, pil_image in enumerate(pages):
#         print(f"\n--- Processing Page {page_index+1} ---")

#         # Step 1: Detect distinct blue boxes
#         boxes = detect_small_blue_boxes(
#             pil_image,
#             lower_blue=np.array([100, 100, 50]),
#             upper_blue=np.array([130, 255, 255]),
#             min_area=1000,
#             max_area=200000,
#             max_width=None,
#             max_height=200
#         )
#         print(f"Detected {len(boxes)} blue boxes on Page {page_index+1}.")

#         # Draw bounding boxes for debug
#         debug_img = np.array(pil_image.convert("RGB"))
#         for (x, y, w, h) in boxes:
#             cv2.rectangle(debug_img, (x, y), (x + w, y + h), (255, 0, 0), 2)
#         debug_pil = Image.fromarray(debug_img)
#         print("Displaying bounding boxes on the page:")
#         show_image_in_notebook(debug_pil)

#         # Save debug image
#         if not os.path.exists(output_folder_boxes):
#             os.makedirs(output_folder_boxes)
#         boxes_image_path = os.path.join(output_folder_boxes, f"page_{page_index+1}_blue_boxes.png")
#         debug_pil.save(boxes_image_path)
#         print(f"Saved blue boxes image to: {boxes_image_path}\n")

#         # Step 2: Filter out header boxes (height ~31 px)
#         header_boxes = filter_header_boxes(boxes, target_height=31, tolerance=5)
#         print(f"Filtered {len(header_boxes)} header boxes on Page {page_index+1}.")

#         # Crop & save each header box
#         crop_and_display_save_boxes(pil_image, header_boxes, output_folder_boxes, page_index, prefix="Header")

#         # Step 3: Crop sections under each header + the "pre-header" top slice
#         sections, section_coords = crop_sections_by_headers(pil_image, header_boxes, row_tolerance=10)
#         print(f"Cropped {len(sections)} sections from Page {page_index+1}.")

#         # Display & save each section
#         if not os.path.exists(output_folder_sections):
#             os.makedirs(output_folder_sections)
#         for i, section in enumerate(sections):
#             print(f"Section {i+1} coords: {section_coords[i]}")
#             show_image_in_notebook(section)
#             section_filename = os.path.join(output_folder_sections, f"page_{page_index+1}_section_{i+1}.png")
#             section.save(section_filename)
#             print(f"Saved section image to: {section_filename}\n")

# if __name__ == "__main__":
#     main()
