In [0]:
%run ./init

In [0]:
import os
import cv2
import pytesseract
import numpy as np
import pandas as pd
import logging
import json
import matplotlib.pyplot as plt
import math

# ---------------------------------------------------------------------
# Logger Setup
# ---------------------------------------------------------------------
# Named logger used by the functions defined below.
logger = logging.getLogger("bit_infoExtractor")
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object for the same name, so a handler
# is attached only when none exists yet; re-running this cell would
# otherwise add another handler to the same logger.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Output format is just "LEVEL: message".
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)

# ---------------------------------------------------------------------
# Utility: show_image
# ---------------------------------------------------------------------
def show_image(title, img, cmap=None, size=(10,10)):
    """
    Display ``img`` in a new matplotlib figure of ``size`` with ``title``.

    When ``cmap`` is given it is passed straight to ``imshow``. Otherwise an
    array with three dimensions is converted from BGR to RGB before display,
    and any other array is drawn with the "gray" colormap. Axes are hidden.
    """
    plt.figure(figsize=size)
    if cmap:
        pixels, colormap = img, cmap
    elif len(img.shape) == 3:
        # OpenCV stores colour images as BGR; matplotlib expects RGB.
        pixels, colormap = cv2.cvtColor(img, cv2.COLOR_BGR2RGB), None
    else:
        pixels, colormap = img, "gray"
    plt.imshow(pixels, cmap=colormap)
    plt.title(title)
    plt.axis("off")
    plt.show()

# ---------------------------------------------------------------------
# Utility: annotate_ocr_results
# ---------------------------------------------------------------------
def annotate_ocr_results(img, roi_texts):
    """
    Show a copy of ``img`` with every OCR box outlined and labelled.

    ``roi_texts`` yields ``(x, y, w, h, text)`` tuples. The input image is
    left untouched; all drawing happens on a copy, which is then displayed
    through ``show_image``.
    """
    box_color = (0, 255, 0)
    label_color = (0, 0, 255)
    canvas = img.copy()
    for left, top, width, height, label in roi_texts:
        corner_a = (left, top)
        corner_b = (left + width, top + height)
        cv2.rectangle(canvas, corner_a, corner_b, box_color, 2)
        # The label's origin is 10 px above the box's top-left corner.
        cv2.putText(canvas, label, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, label_color, 2, cv2.LINE_AA)
    show_image("Annotated OCR Results", canvas)

# ---------------------------------------------------------------------
# Utility: safe_read_image
# ---------------------------------------------------------------------
def safe_read_image(img_path):
    """
    Load an image with OpenCV from a local path or a ``dbfs:`` path.

    A leading ``dbfs:`` scheme is rewritten to the ``/dbfs`` prefix. Only the
    leading scheme is rewritten; a later "dbfs:" inside the path is kept.

    Args:
        img_path: Path string, optionally starting with ``dbfs:``.

    Returns:
        The image object returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: The resolved path does not exist.
        ValueError: The file exists but ``cv2.imread`` returned ``None``.
    """
    scheme = "dbfs:"
    if img_path.startswith(scheme):
        # Slice off the prefix instead of str.replace, which would also
        # rewrite any "dbfs:" occurring further along in the path.
        local_path = "/dbfs" + img_path[len(scheme):]
    else:
        local_path = img_path

    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")

    img = cv2.imread(local_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        raise ValueError(f"Failed to load image: {local_path}")
    return img

# ---------------------------------------------------------------------
# Utility: preprocess_image
# ---------------------------------------------------------------------
def preprocess_image(img, debug=False):
    """
    Produce a binary image from a BGR input.

    The image is converted to grayscale and then binarised with a Gaussian
    adaptive threshold (block size 15, constant 9). With ``debug`` set, both
    intermediate images are displayed.

    Returns the thresholded image.
    """
    block_size, offset = 15, 9

    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if debug:
        show_image("1) Grayscale", grayscale, cmap="gray")

    binary = cv2.adaptiveThreshold(
        grayscale, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, block_size, offset
    )
    if debug:
        show_image("2) Thresholded", binary, cmap="gray")

    return binary

# ---------------------------------------------------------------------
# Utility: detect_text_regions
# ---------------------------------------------------------------------
def detect_text_regions(thresh_img, debug=False):
    """
    Find candidate text boxes in a thresholded image.

    External contours are extracted, each is reduced to its bounding
    rectangle, and only rectangles with width > 30 and height > 15 are kept.
    The survivors are returned as ``(x, y, w, h)`` tuples ordered
    top-to-bottom, then left-to-right. With ``debug`` set, the boxes are
    drawn on a colour copy of the input and displayed.
    """
    min_w, min_h = 30, 15
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    candidates = (cv2.boundingRect(contour) for contour in contours)
    rois = sorted(
        ((x, y, w, h) for (x, y, w, h) in candidates if w > min_w and h > min_h),
        key=lambda box: (box[1], box[0]),
    )

    if debug:
        overlay = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
        for bx, by, bw, bh in rois:
            cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), (0,255,0), 2)
        show_image("3) Detected Regions", overlay)

    return rois

# ---------------------------------------------------------------------
# Utility: perform_ocr_on_rois
# ---------------------------------------------------------------------
def perform_ocr_on_rois(img, rois, debug=False):
    """
    Run Tesseract on every region and return ``(x, y, w, h, text)`` tuples.

    Each region is grown by 5 px on every side (clipped to the image),
    converted to grayscale, dilated, adaptively thresholded and upscaled 2x
    before OCR with page-segmentation mode 6 and a character whitelist.
    When Tesseract returns nothing, the text becomes "I" if the box's
    width/height ratio is below 0.3, otherwise "[BLANK]".
    """
    pad = 5
    tess_config = (
        "--psm 6 "
        "-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz0123456789.,:;-_/\\\\"
    )
    # NOTE(review): a 1x1 kernel looks like it makes the dilation a no-op — confirm intent.
    unit_kernel = np.ones((1,1), np.uint8)

    results = []
    for idx, (x, y, w, h) in enumerate(rois):
        # Padded crop, clamped so it never leaves the image.
        left = max(x - pad, 0)
        top = max(y - pad, 0)
        right = min(x + w + pad, img.shape[1])
        bottom = min(y + h + pad, img.shape[0])
        patch = img[top:bottom, left:right]

        patch_gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
        patch_dilated = cv2.dilate(patch_gray, unit_kernel, iterations=1)
        patch_binary = cv2.adaptiveThreshold(
            patch_dilated, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 15, 9
        )
        patch_scaled = cv2.resize(patch_binary, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)

        text = pytesseract.image_to_string(patch_scaled, config=tess_config).strip()

        if not text:
            # Empty OCR: a very narrow box is assumed to be the letter "I".
            text = "I" if (w / h) < 0.3 else "[BLANK]"

        results.append((x, y, w, h, text))
        if debug:
            logger.info(f"OCR Box {idx} => {text}")

    return results

# ---------------------------------------------------------------------
# Utility: build_bit_info_dict_from_rois (Dynamic Segmentation)
# ---------------------------------------------------------------------
def build_bit_info_dict_from_rois(roi_texts, debug=False):
    """
    Segment OCR tokens into the 21 BIT DETAILS columns by x-coordinate.

    Approach:
      1. Group OCR results into rows: a new row starts whenever a token's y
         differs from the previous token's y by more than 10.
      2. Keep each token's left x-coordinate and text, sorted left to right.
      3. Treat rows 0-2 as headers and rows 3+ as data. The global min and
         max x of the data rows are split into 21 equal-width columns
         (22 boundaries).
      4. Assign every token of every row to the column whose interval holds
         its x. Tokens on or outside the outer boundaries are clamped to the
         first/last column, so the right-most data token (x == max x) is
         kept rather than silently dropped.
      5. Map the 21 column values of each data row to the final column names.

    Args:
        roi_texts: Iterable of ``(x, y, w, h, text)`` tuples. Rows are split
            on consecutive y values, so the input is expected to be ordered
            top-to-bottom.
        debug: When true, log the grouped rows, column assignments, parsed
            rows and a DataFrame preview.

    Returns:
        ``(records, df)``: a list of one dict per data row and the matching
        DataFrame. ``([], empty DataFrame)`` when there are no data rows.

    Side effects:
        Writes ``bit_info_data.csv`` and ``bit_info_data.json`` to the
        current working directory when data rows exist.
    """
    # Same logger object the notebook configures at module level; looking it
    # up by name keeps this function usable on its own.
    log = logging.getLogger("bit_infoExtractor")

    # Final column names (21 columns expected)
    final_columns = [
        "Bit #", "Size", "Make", "Model", "Serial #",         # 5
        "Nozzle-(Number x Size)", "Nozzle-TFA",                # 2
        "Depth-In", "Depth-Out", "Depth-Feet", "Depth-ROP",    # 4
        "Hours-Total", "Hours-On Btm",                         # 2
        "Dull Grade-I", "Dull Grade-O1", "Dull Grade-D", "Dull Grade-L",
        "Dull Grade-B", "Dull Grade-G", "Dull Grade-O2", "Dull Grade-RP"  # 8
    ]
    n_cols = len(final_columns)
    n_header_rows = 3
    row_tolerance = 10

    # Group tokens into rows by y coordinate
    rows = []
    prev_y = None
    for (x, y, _w, _h, text) in roi_texts:
        same_row = prev_y is None or abs(y - prev_y) <= row_tolerance
        if not same_row or not rows:
            rows.append([])
        rows[-1].append((x, text))
        prev_y = y

    # Sort tokens in each row by their x coordinate
    for row in rows:
        row.sort(key=lambda tok: tok[0])

    if debug:
        for i, row in enumerate(rows):
            log.info(f"Row {i}: {[text for (x, text) in row]}")

    data_rows = rows[n_header_rows:]
    if not data_rows:
        log.warning("No data rows available for dynamic segmentation.")
        return [], pd.DataFrame()

    # Column boundaries come from the x extent of the data rows only.
    all_x = [x for row in data_rows for (x, _) in row]
    min_x, max_x = min(all_x), max(all_x)
    boundaries = np.linspace(min_x, max_x, n_cols + 1)

    def column_index(x):
        # Degenerate extent (all data tokens share one x): no spacing to
        # measure against, so everything goes in the first column.
        if max_x == min_x:
            return 0
        # side="right" yields the largest i with boundaries[i] <= x; clamping
        # folds x == max_x (and out-of-range header tokens) into a valid column.
        idx = int(np.searchsorted(boundaries, x, side="right")) - 1
        return min(max(idx, 0), n_cols - 1)

    def assign_tokens_to_columns(row_tokens):
        cols = [""] * n_cols
        for (x, text) in row_tokens:
            i = column_index(x)
            cols[i] = f"{cols[i]} {text}" if cols[i] else text
        return cols

    # Process all rows (headers included, for the debug log) using the boundaries
    processed_rows = [assign_tokens_to_columns(row) for row in rows]
    if debug:
        for i, cols in enumerate(processed_rows):
            log.info(f"Processed Row {i}: {cols}")

    # Build structured data only from the data rows
    structured_data = []
    for cols in processed_rows[n_header_rows:]:
        row_dict = dict(zip(final_columns, cols))
        structured_data.append(row_dict)
        if debug:
            log.info(f"Parsed row => {row_dict}")

    df = pd.DataFrame(structured_data)
    if debug:
        log.info("DataFrame Preview:")
        log.info(df.head())

    df.to_csv("bit_info_data.csv", index=False)
    log.info("Data saved successfully as CSV.")

    structured_data_json = df.to_dict(orient='records')
    with open("bit_info_data.json", "w") as json_file:
        json.dump(structured_data_json, json_file, indent=4)
    log.info("Data saved successfully in JSON format.")

    return structured_data_json, df

# ---------------------------------------------------------------------
# Main Pipeline
# ---------------------------------------------------------------------
def main_bit_info_pipeline():
    """
    Run the BIT DETAILS extraction end to end on one hard-coded image.

    Loads the image, thresholds it, detects regions, OCRs them, shows the
    annotated image, builds the structured rows, logs and prints the result,
    then writes JSON and CSV into the output folder. If the image cannot be
    loaded, the error is logged and the function returns early.
    """
    # Replace with your actual image path
    source_path = "/dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png"
    results_dir = "/dbfs/mnt/mini-proj-dd/final_bit_info_results"

    try:
        image = safe_read_image(source_path)
        logger.info("Image loaded successfully.")
    except Exception as load_error:
        logger.error(load_error)
        return

    # Steps 1-3: threshold -> region detection -> OCR
    binary = preprocess_image(image, debug=True)
    boxes = detect_text_regions(binary, debug=True)
    ocr_cells = perform_ocr_on_rois(image, boxes, debug=True)

    # Step 4: visual check of what was recognised where
    annotate_ocr_results(image, ocr_cells)

    # Step 5: tokens -> table rows
    records, table = build_bit_info_dict_from_rois(ocr_cells, debug=True)

    # Step 6: log and print the result
    report = {"BIT DETAILS": records}
    report_json = json.dumps(report, indent=4)
    logger.info(report_json)
    print(table)

    # Step 7: persist JSON and CSV side by side
    os.makedirs(results_dir, exist_ok=True)
    json_target = os.path.join(results_dir, "bit_info_data.json")
    csv_target = os.path.join(results_dir, "bit_info_data.csv")
    with open(json_target, "w") as out:
        out.write(report_json)
    table.to_csv(csv_target, index=False)
    logger.info("Data saved successfully in output folder.")

# Entry point: run the pipeline only when this code executes as the main
# module, not when it is imported.
if __name__ == "__main__":
    main_bit_info_pipeline()


In [0]:
import os
import re
import cv2
import pytesseract
import numpy as np
import pandas as pd
import logging
import json
import matplotlib.pyplot as plt
import math

# ---------------------------------------------------------------------
# Logger Setup
# ---------------------------------------------------------------------
# Named logger used by the functions defined below.
logger = logging.getLogger("bit_infoExtractor")
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object for the same name, so a handler
# is attached only when none exists yet; re-running this cell would
# otherwise add another handler to the same logger.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Output format is just "LEVEL: message".
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)

# ---------------------------------------------------------------------
# Utility: show_image
# ---------------------------------------------------------------------
def show_image(title, img, cmap=None, size=(10,10)):
    """
    Open a matplotlib figure of ``size`` and draw ``img`` under ``title``.

    An explicit ``cmap`` wins. Without one, arrays with three dimensions are
    converted BGR -> RGB for display and all other arrays use the "gray"
    colormap. Axes are switched off.
    """
    plt.figure(figsize=size)
    if not cmap:
        has_channels = len(img.shape) == 3
        if has_channels:
            # OpenCV stores colour images as BGR; matplotlib expects RGB.
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        else:
            plt.imshow(img, cmap="gray")
    else:
        plt.imshow(img, cmap=cmap)
    plt.title(title)
    plt.axis("off")
    plt.show()

# ---------------------------------------------------------------------
# annotate_ocr_results
# ---------------------------------------------------------------------
def annotate_ocr_results(img, roi_texts):
    """
    Display a copy of ``img`` overlaid with the OCR boxes and their text.

    ``roi_texts`` yields ``(x, y, w, h, text)`` tuples. The original image
    is not modified.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    overlay = img.copy()
    for entry in roi_texts:
        bx, by, bw, bh, caption = entry
        cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), (0,255,0), 2)
        # Caption origin is 5 px above the box's top-left corner.
        caption_origin = (bx, by - 5)
        cv2.putText(overlay, caption, caption_origin,
                    font, 0.5, (0, 0, 255), 2, cv2.LINE_AA)
    show_image("Annotated OCR Results", overlay)

# ---------------------------------------------------------------------
# safe_read_image
# ---------------------------------------------------------------------
def safe_read_image(img_path):
    """
    Load an image with OpenCV from a local path or a ``dbfs:`` path.

    A leading ``dbfs:`` scheme is rewritten to the ``/dbfs`` prefix. Only the
    leading scheme is rewritten; a later "dbfs:" inside the path is kept.

    Args:
        img_path: Path string, optionally starting with ``dbfs:``.

    Returns:
        The image object returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: The resolved path does not exist.
        ValueError: The file exists but ``cv2.imread`` returned ``None``.
    """
    scheme = "dbfs:"
    if img_path.startswith(scheme):
        # Slice off the prefix instead of str.replace, which would also
        # rewrite any "dbfs:" occurring further along in the path.
        local_path = "/dbfs" + img_path[len(scheme):]
    else:
        local_path = img_path

    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")

    img = cv2.imread(local_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        raise ValueError(f"Failed to load image: {local_path}")
    return img

# ---------------------------------------------------------------------
# preprocess_image
# ---------------------------------------------------------------------
def preprocess_image(img, debug=False):
    """
    Produce a binary image from a BGR input.

    Pipeline: grayscale conversion, morphological closing with a 2x2
    rectangular kernel, then Gaussian adaptive thresholding (block size 15,
    constant 9). With ``debug`` set, each stage is displayed as it is
    produced.

    Returns the thresholded image.
    """
    def preview(label, stage):
        # Show an intermediate stage only in debug mode.
        if debug:
            show_image(label, stage, cmap="gray")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    preview("1) Grayscale", gray)

    # Closing is applied before thresholding to help keep thin strokes (like 'I').
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))
    closed = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, closing_kernel)
    preview("2) After Morphological Closing", closed)

    thresh = cv2.adaptiveThreshold(
        closed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, 9
    )
    preview("3) Thresholded", thresh)
    return thresh

# ---------------------------------------------------------------------
# detect_text_regions
# ---------------------------------------------------------------------
def detect_text_regions(thresh_img, debug=True):
    """
    Detect text regions (bounding boxes) in a thresholded image.

    External contours are reduced to bounding rectangles; only rectangles
    with width > 30 and height > 15 are kept.

    Args:
        thresh_img: Single-channel thresholded image.
        debug: When true, log the number of boxes and display them drawn on
            a colour copy of ``thresh_img``.

    Returns:
        List of ``(x, y, w, h)`` tuples sorted top-to-bottom, then
        left-to-right.
    """
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    rois = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
    rois.sort(key=lambda b: (b[1], b[0]))

    if debug:
        logger.info(f"[detect_text_regions] Found {len(rois)} bounding boxes passing size filter.")
        # The overlay is built only when it will be shown, so the colour
        # conversion and drawing are skipped in non-debug runs.
        debug_img = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
        for (x, y, w, h) in rois:
            cv2.rectangle(debug_img, (x, y), (x+w, y+h), (0,255,0), 2)
        show_image("3) Detected Text Regions", debug_img, size=(12,12))
    return rois

# ---------------------------------------------------------------------
# perform_ocr_on_rois
# ---------------------------------------------------------------------
def perform_ocr_on_rois(img, rois, debug=True):
    """
    OCR every region and return ``(x, y, w, h, text)`` tuples.

    Each region is first read with Tesseract in page-segmentation mode 6.
    If that yields nothing and the box is narrower than 20 px, the crop is
    dilated with a 2x2 kernel and re-read in mode 10 (single character).
    Regions that remain empty get the text "[BLANK]".

    Args:
        img: Source image; regions are cropped directly from it.
        rois: Sequence of ``(x, y, w, h)`` boxes.
        debug: When true, log each OCR result and show all crops in a
            5-column grid of subplots titled with the recognised text.

    Returns:
        List of ``(x, y, w, h, text)`` in the order of ``rois``.
    """
    results = []
    n = len(rois)

    axes = []
    if debug and n > 0:
        cols = 5
        rows = math.ceil(n / cols)
        # squeeze=False makes subplots always return a 2-D array of Axes, so
        # flattening works for a single row too. The previous `[axes]`
        # wrapping produced a one-element list holding the whole array.
        _, axes_grid = plt.subplots(rows, cols, figsize=(15, 3 * rows), squeeze=False)
        axes = axes_grid.flatten()

    # Both passes share one whitelist; only the page-segmentation mode differs.
    whitelist = (
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789_./\\-+"
    )
    normal_config = "--psm 6 -c tessedit_char_whitelist=" + whitelist
    single_char_config = "--psm 10 -c tessedit_char_whitelist=" + whitelist

    for i, (x, y, w, h) in enumerate(rois):
        roi = img[y:y+h, x:x+w]

        # --- First pass OCR ---
        text = pytesseract.image_to_string(roi, config=normal_config).strip()
        if debug:
            logger.info(f"[perform_ocr_on_rois] ROI {i} (x={x}, y={y}, w={w}, h={h}) => First pass: '{text}'")

        if (not text) and (w < 20):
            # Blank result on a narrow box: retry in single-character mode.
            if debug:
                logger.info(f"[perform_ocr_on_rois] ROI {i} => First pass blank & w<20; attempting second pass.")

            # NOTE(review): cv2.dilate grows bright regions; on dark-on-light
            # text this would thin the strokes — confirm the image polarity.
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))
            roi_dilated = cv2.dilate(roi, kernel, iterations=1)

            text2 = pytesseract.image_to_string(roi_dilated, config=single_char_config).strip()
            if debug:
                logger.info(f"[perform_ocr_on_rois] ROI {i} => Second pass result: '{text2}'")

            text = text2 if text2 else "[BLANK]"
        elif not text:
            # Blank but not narrow: keep as blank
            text = "[BLANK]"

        results.append((x, y, w, h, text))

        if debug:
            # The grid holds rows * cols >= n axes, so axes[i] always exists.
            roi_rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
            axes[i].imshow(roi_rgb)
            axes[i].set_title(f"ROI {i+1}\n{text[:30]}...")
            axes[i].axis("off")

    if debug and n > 0:
        # Hide the unused axes in the last grid row
        for spare in axes[n:]:
            spare.axis("off")
        plt.tight_layout()
        plt.show()

    return results

# ---------------------------------------------------------------------
# build_bit_info_dict_from_rois
# ---------------------------------------------------------------------
def build_bit_info_dict_from_rois(roi_texts, debug=True):
    """
    Convert OCR boxes into one record per data row of the table.

    Boxes are grouped into rows (a new row starts when y differs from the
    previous box's y by more than 3), each row is joined left-to-right into
    one string, and rows 3 onward are split on whitespace into exactly 21
    tokens (padded with "[BLANK]" or truncated) that map onto the fixed
    column names. Row 0 is the title; rows 1 and 2 are header rows and are
    only logged.

    Returns ``(records, df)``, or ``([], empty DataFrame)`` when fewer than
    three rows are found. Otherwise also writes ``bit_info_data.csv`` and
    ``bit_info_data.json`` to the current working directory.
    """
    y_gap_limit = 3  # keep it small to avoid merging separate lines

    grouped_rows = []
    last_y = None
    for (x, y, w, h, text) in roi_texts:
        continues_row = last_y is None or abs(y - last_y) <= y_gap_limit
        if not continues_row or not grouped_rows:
            grouped_rows.append([])
        grouped_rows[-1].append((x, y, w, h, text))
        last_y = y

    row_strings = []
    for i, row_cells in enumerate(grouped_rows):
        ordered = sorted(row_cells, key=lambda c: c[0])  # left->right
        texts = [cell[4] for cell in ordered]
        line = " ".join(texts).replace("\n", " ").strip()
        if debug:
            logger.info(f"[build_bit_info_dict_from_rois] Row {i} => {line}")
        row_strings.append(line)

    if len(row_strings) < 3:
        logger.warning("[build_bit_info_dict_from_rois] Not enough rows found for this layout.")
        return [], pd.DataFrame()

    # Row 0 is the title; rows 1 and 2 are headers; everything after is data.
    super_header_line = row_strings[1]
    sub_header_line = row_strings[2]
    data_lines = row_strings[3:]

    if debug:
        logger.info(f"[build_bit_info_dict_from_rois] Super Headers => {super_header_line}")
        logger.info(f"[build_bit_info_dict_from_rois] Sub Headers => {sub_header_line}")
        logger.info(f"[build_bit_info_dict_from_rois] Data Lines => {data_lines}")

    final_columns = [
        "Bit #", "Size", "Make", "Model", "Serial #",         # 5
        "Nozzle-(Number x Size)", "Nozzle-TFA",               # 2
        "Depth-In", "Depth-Out", "Depth-Feet", "Depth-ROP",   # 4
        "Hours-Total", "Hours-On Btm",                        # 2
        "Dull Grade-I", "Dull Grade-O1", "Dull Grade-D", "Dull Grade-L",
        "Dull Grade-B", "Dull Grade-G", "Dull Grade-O2", "Dull Grade-RP"  # 8
    ]
    expected_token_count = len(final_columns)

    structured_data = []
    for line in data_lines:
        # Pad generously, then cut: yields exactly expected_token_count tokens.
        filler = ["[BLANK]"] * expected_token_count
        tokens = (line.split() + filler)[:expected_token_count]
        row_dict = dict(zip(final_columns, tokens))
        if debug:
            logger.info(f"[build_bit_info_dict_from_rois] Parsed row => {row_dict}")
        structured_data.append(row_dict)

    df = pd.DataFrame(structured_data)

    if debug:
        logger.info("[build_bit_info_dict_from_rois] DataFrame Preview:")
        logger.info(df.head().to_string())

    df.to_csv("bit_info_data.csv", index=False)
    logger.info("Data saved successfully as CSV.")

    structured_data_json = df.to_dict(orient='records')
    with open("bit_info_data.json", "w") as json_file:
        json.dump(structured_data_json, json_file, indent=4)
    logger.info("Data saved successfully in JSON format.")

    return structured_data_json, df

# ---------------------------------------------------------------------
# main_bit_info_pipeline
# ---------------------------------------------------------------------
def main_bit_info_pipeline():
    """
    Extract the BIT DETAILS table from one hard-coded image and save it.

    Stages: load, preprocess, detect boxes, OCR, annotate, build structured
    rows, log/print, write JSON and CSV to the output folder. A load failure
    is logged and ends the run early.
    """
    # Replace with your actual path
    source_path = "/dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png"
    results_dir = "/dbfs/mnt/mini-proj-dd/final_bit_info_results"

    try:
        image = safe_read_image(source_path)
        logger.info("Image loaded successfully.")
    except Exception as load_error:
        logger.error(load_error)
        return

    # 1-3) Preprocess, detect bounding boxes, OCR
    binary = preprocess_image(image, debug=True)
    boxes = detect_text_regions(binary, debug=True)
    ocr_cells = perform_ocr_on_rois(image, boxes, debug=True)

    # 4) Annotate and show OCR results on the image
    annotate_ocr_results(image, ocr_cells)

    # 5) Build structured data
    records, table = build_bit_info_dict_from_rois(ocr_cells, debug=True)

    # 6) Show final JSON in logs
    report = {"BIT DETAILS": records}
    report_json = json.dumps(report, indent=4)
    logger.info(report_json)
    print(table)

    # 7) Save final results
    os.makedirs(results_dir, exist_ok=True)
    json_target = os.path.join(results_dir, "bit_info_data.json")
    csv_target = os.path.join(results_dir, "bit_info_data.csv")
    with open(json_target, "w") as out:
        out.write(report_json)
    table.to_csv(csv_target, index=False)
    logger.info("Data saved successfully in output folder.")

# Run the pipeline only when this code executes as the main module, not when
# it is imported.
if __name__ == "__main__":
    main_bit_info_pipeline()


In [0]:
import os
import re
import cv2
import pytesseract
import numpy as np
import pandas as pd
import logging
import json
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------
# Logger Setup
# ---------------------------------------------------------------------
# Named logger used by the functions defined below.
logger = logging.getLogger("bit_infoExtractor")
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object for the same name, so a handler
# is attached only when none exists yet; re-running this cell would
# otherwise add another handler to the same logger.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Output format is just "LEVEL: message".
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)

# ---------------------------------------------------------------------
# Utility: show_image
# ---------------------------------------------------------------------
def show_image(title, img, cmap=None, size=(10,10)):
    """
    Render ``img`` in a fresh matplotlib figure of ``size``, titled ``title``.

    With ``cmap`` supplied it is used as given. Without it, an array with
    three dimensions is converted from BGR to RGB first, and any other array
    falls back to the "gray" colormap. The axes are hidden.
    """
    plt.figure(figsize=size)
    if cmap:
        frame, colormap = img, cmap
    elif len(img.shape) == 3:
        # OpenCV stores colour images as BGR; matplotlib expects RGB.
        frame, colormap = cv2.cvtColor(img, cv2.COLOR_BGR2RGB), None
    else:
        frame, colormap = img, "gray"
    plt.imshow(frame, cmap=colormap)
    plt.title(title)
    plt.axis("off")
    plt.show()

# ---------------------------------------------------------------------
# New Utility: annotate_ocr_results
# ---------------------------------------------------------------------
def annotate_ocr_results(img, roi_texts):
    """
    Show a copy of ``img`` with every OCR box outlined and labelled.

    ``roi_texts`` yields ``(x, y, w, h, text)`` tuples. Drawing is done on a
    copy, so the caller's image is unchanged.
    """
    outline_color = (0, 255, 0)
    text_color = (0, 0, 255)
    canvas = img.copy()
    for left, top, width, height, label in roi_texts:
        cv2.rectangle(canvas, (left, top), (left + width, top + height), outline_color, 2)
        # Place the OCR text 10 px above the bounding box; adjust as needed.
        label_origin = (left, top - 10)
        cv2.putText(canvas, label, label_origin, cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, text_color, 2, cv2.LINE_AA)
    show_image("Annotated OCR Results", canvas)

# ---------------------------------------------------------------------
# safe_read_image
# ---------------------------------------------------------------------
def safe_read_image(img_path):
    """
    Load an image with OpenCV from a local path or a ``dbfs:`` path.

    A leading ``dbfs:`` scheme is rewritten to the ``/dbfs`` prefix. Only the
    leading scheme is rewritten; a later "dbfs:" inside the path is kept.

    Args:
        img_path: Path string, optionally starting with ``dbfs:``.

    Returns:
        The image object returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: The resolved path does not exist.
        ValueError: The file exists but ``cv2.imread`` returned ``None``.
    """
    scheme = "dbfs:"
    if img_path.startswith(scheme):
        # Slice off the prefix instead of str.replace, which would also
        # rewrite any "dbfs:" occurring further along in the path.
        local_path = "/dbfs" + img_path[len(scheme):]
    else:
        local_path = img_path

    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")

    img = cv2.imread(local_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        raise ValueError(f"Failed to load image: {local_path}")
    return img

# ---------------------------------------------------------------------
# preprocess_image
# ---------------------------------------------------------------------
def preprocess_image(img, debug=False):
    """
    Turn a BGR image into a binary one.

    The input is converted to grayscale and binarised with a Gaussian
    adaptive threshold (block size 15, constant 9). In debug mode each stage
    is displayed as soon as it is produced.

    Returns the thresholded image.
    """
    def preview(label, stage):
        # Show an intermediate stage only in debug mode.
        if debug:
            show_image(label, stage, cmap="gray")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    preview("1) Grayscale", gray)

    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, 9
    )
    preview("2) Thresholded", thresh)
    return thresh

# ---------------------------------------------------------------------
# detect_text_regions
# ---------------------------------------------------------------------
def detect_text_regions(thresh_img, debug=False):
    """
    Locate candidate text boxes in a thresholded image.

    The bounding rectangle of every external contour is computed; only those
    with width > 30 and height > 15 are kept. The result is a list of
    ``(x, y, w, h)`` tuples ordered top-to-bottom, then left-to-right. In
    debug mode the boxes are drawn on a colour copy of the input and shown.
    """
    def is_large_enough(box):
        return box[2] > 30 and box[3] > 15

    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    rois = []
    for contour in contours:
        bx, by, bw, bh = cv2.boundingRect(contour)
        box = (bx, by, bw, bh)
        if is_large_enough(box):
            rois.append(box)

    # Sort key is (y, x): top->bottom, then left->right
    rois.sort(key=lambda box: (box[1], box[0]))

    if debug:
        overlay = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
        for (bx, by, bw, bh) in rois:
            cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), (0,255,0), 2)
        show_image("3) Detected Regions", overlay)

    return rois

# ---------------------------------------------------------------------
# perform_ocr_on_rois
# ---------------------------------------------------------------------
def perform_ocr_on_rois(img, rois, debug=False):
    """
    OCR each region of ``img`` and return ``(x, y, w, h, text)`` tuples.

    Every box is cropped as-is and read by Tesseract in page-segmentation
    mode 6. An empty result is replaced by the placeholder "[BLANK]". In
    debug mode each recognised string is logged.
    """
    # A character whitelist could be appended to this config if needed.
    tess_config = "--psm 6"

    results = []
    for idx, box in enumerate(rois):
        x, y, w, h = box
        crop = img[y:y+h, x:x+w]
        recognized = pytesseract.image_to_string(crop, config=tess_config).strip()
        if not recognized:
            recognized = "[BLANK]"
        results.append((x, y, w, h, recognized))
        if debug:
            logger.info(f"OCR Box {idx}: {recognized}")
    return results

# ---------------------------------------------------------------------
# build_bit_info_dict_from_rois
# ---------------------------------------------------------------------
def build_bit_info_dict_from_rois(roi_texts, debug=False):
    """
    Parse the multi-row-header table layout into one record per data row.

    Expected row layout after grouping:
      Row 0  => table title (skipped)
      Row 1  => super headers (logged only)
      Row 2  => sub-headers (logged only)
      Row 3+ => data rows

    Boxes are grouped into rows whenever consecutive y values differ by no
    more than 10. Each row is joined left-to-right into one string, and
    every data row is split on whitespace into exactly 21 tokens (padded
    with "" or truncated) that map onto the fixed column names
    (5 bit data + 2 nozzle + 4 depth + 2 hours + 8 dull grade).

    Returns ``(records, df)``, or ``([], empty DataFrame)`` when fewer than
    three rows are found. Otherwise also writes ``bit_info_data.csv`` and
    ``bit_info_data.json`` to the current working directory.
    """
    # Step 1) Group bounding boxes by y-coordinate
    y_gap_limit = 10
    grouped_rows = []
    last_y = None
    for (x, y, w, h, text) in roi_texts:
        continues_row = last_y is None or abs(y - last_y) <= y_gap_limit
        if not continues_row or not grouped_rows:
            grouped_rows.append([])
        grouped_rows[-1].append((x, y, w, h, text))
        last_y = y

    # Step 2) Flatten each row group into a single left-to-right string
    row_strings = []
    for i, row_cells in enumerate(grouped_rows):
        ordered = sorted(row_cells, key=lambda c: c[0])
        line = " ".join(cell[4] for cell in ordered).replace("\n", " ").strip()
        row_strings.append(line)
        if debug:
            logger.info(f"Row {i} => {line}")

    # Step 3) Need at least title + two header rows for this layout
    if len(row_strings) < 3:
        logger.warning("Not enough rows found for this layout.")
        return [], pd.DataFrame()

    super_header_line = row_strings[1]
    sub_header_line = row_strings[2]
    data_lines = row_strings[3:]  # everything after row 2

    if debug:
        logger.info(f"Super Headers => {super_header_line}")
        logger.info(f"Sub Headers => {sub_header_line}")
        logger.info(f"Data Lines => {data_lines}")

    # Step 4) Fixed output columns, grouped by super header
    final_columns = [
        "Bit #", "Size", "Make", "Model", "Serial #",         # 5
        "Nozzle-(Number x Size)", "Nozzle-TFA",               # 2
        "Depth-In", "Depth-Out", "Depth-Feet", "Depth-ROP",   # 4
        "Hours-Total", "Hours-On Btm",                        # 2
        "Dull Grade-I", "Dull Grade-O1", "Dull Grade-D", "Dull Grade-L", 
        "Dull Grade-B", "Dull Grade-G", "Dull Grade-O2", "Dull Grade-RP"  # 8
    ]
    token_count = 21

    # Step 5) One record per data line, always exactly 21 tokens
    structured_data = []
    for line in data_lines:
        # Pad generously, then cut: yields exactly token_count tokens.
        tokens = (line.split() + [""] * token_count)[:token_count]
        row_dict = dict(zip(final_columns, tokens))
        structured_data.append(row_dict)
        if debug:
            logger.info(f"Parsed row => {row_dict}")

    # Step 6) Convert to DataFrame and JSON
    df = pd.DataFrame(structured_data)
    if debug:
        logger.info("DataFrame Preview:")
        logger.info(df.head())

    df.to_csv("bit_info_data.csv", index=False)
    logger.info("Data saved successfully as CSV.")

    structured_data_json = df.to_dict(orient='records')
    with open("bit_info_data.json", "w") as json_file:
        json.dump(structured_data_json, json_file, indent=4)
    logger.info("Data saved successfully in JSON format.")

    return structured_data_json, df

# ---------------------------------------------------------------------
# main_bit_info_pipeline
# ---------------------------------------------------------------------
def main_bit_info_pipeline():
    """
    Extract the BIT DETAILS table from one hard-coded image and save it.

    Stages: load, preprocess, detect boxes, OCR, annotate, build structured
    rows, log/print, write JSON and CSV to the output folder. A load failure
    is logged and ends the run early.
    """
    # Replace with your actual path
    source_path = "/dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png"
    results_dir = "/dbfs/mnt/mini-proj-dd/final_bit_info_results"

    try:
        image = safe_read_image(source_path)
        logger.info("Image loaded successfully.")
    except Exception as load_error:
        logger.error(load_error)
        return

    # 1-3) Preprocess, detect bounding boxes, OCR
    binary = preprocess_image(image, debug=True)
    boxes = detect_text_regions(binary, debug=True)
    ocr_cells = perform_ocr_on_rois(image, boxes, debug=True)

    # Annotate and show OCR results on the image
    annotate_ocr_results(image, ocr_cells)

    # 4) Build structured data (tailored to the table layout)
    records, table = build_bit_info_dict_from_rois(ocr_cells, debug=True)

    # 5) Show final JSON in logs
    report = {"BIT DETAILS": records}
    report_json = json.dumps(report, indent=4)
    logger.info(report_json)
    print(table)

    # 6) Save final results
    os.makedirs(results_dir, exist_ok=True)
    json_target = os.path.join(results_dir, "bit_info_data.json")
    csv_target = os.path.join(results_dir, "bit_info_data.csv")
    with open(json_target, "w") as out:
        out.write(report_json)
    table.to_csv(csv_target, index=False)
    logger.info("Data saved successfully in output folder.")

# Run the pipeline only when this code executes as the main module, not when
# it is imported.
if __name__ == "__main__":
    main_bit_info_pipeline()
