In [0]:
def parse_pumps_table(ocr_text):
    """
    Parses the pumps table from OCR text using a regex.
    Expected format: Number BOMCO TRIPLEX [HHP] Efficiency Stroke(in) Liner(in) P-Rating P-Limit SPM_Rating SPM_Limit
    """
    pump_pattern = re.compile(
        r"^(\d+)?\s*"               # Number (optional)
        r"(BOMCO)\s+(TRIPLEX)\s+"    # Model, Type
        r"(\d+)?\s*"                # HHP (optional)
        r"(\d+)\s+"                 # Efficiency
        r"([\d.]+)\s+"              # Stroke\(in\)
        r"([\d.]+)\s+"              # Liner\(in\)
        r"(\d+)\s+"                 # P-Rating\(psi\)
        r"(\d+)\s+"                 # P-Limit\(psi\)
        r"(\d+)\s+"                 # SPM Rating
        r"(\d+)\s*$",               # SPM Limit
        re.IGNORECASE
    )
    pumps = []
    lines = ocr_text.splitlines()
    for line in lines:
        line = line.strip()
        match = pump_pattern.match(line)
        if match:
            number, model, pump_type, hhp, efficiency, stroke, liner, p_rating, p_limit, spm_rating, spm_limit = match.groups()
            pumps.append({
                "Number": number if number else "",
                "Model": model,
                "Type": pump_type,
                "HHP": hhp if hhp else "",
                "Efficiency": efficiency,
                "Stroke(in)": stroke,
                "Liner(in)": liner,
                "P-Rating(psi)": p_rating,
                "P-Limit(psi)": p_limit,
                "SPM Rating": spm_rating,
                "SPM Limit": spm_limit
            })
    return pumps
    
def parse_drilling_circ_rates(ocr_text):
    """
    Parses drilling/circ rates from OCR text.
    This version splits the text into segments starting with "Drilling/Circ Rate <n>"
    then combines the lines in each segment and applies a regex with DOTALL.
    """
    circ_rates = []
    
    # Split the OCR text into segments where each segment begins with "Drilling/Circ Rate" followed by a digit
    segments = re.split(r"(?=Drilling/Circ Rate \d+)", ocr_text)
    
    # Regex pattern to capture the numbers:
    # Group 1: Rate ID (the number after "Drilling/Circ Rate")
    # Group 2: Pressure (number preceding "PS!" or "PSI")
    # Group 3: SPM value (number after "@")
    # Group 4: Gal/Stoke value
    # Group 5: GPM value
    # Group 6: BPM value
    # Group 7: DC value
    # Group 8: DP value
    pattern = re.compile(
        r"Drilling/Circ Rate\s+(\d+).*?"       # Rate ID
        r"(\d+)\s+PS[!I].*?"                   # Pressure
        r"@\s*(\d+).*?"                        # SPM value
        r"([\d.]+)\s+Gal/Stoke.*?"              # Gal/Stoke
        r"([\d.]+)\s+GPM.*?"                    # GPM
        r"([\d.]+)\s+BPM.*?"                    # BPM
        r"([\d.]+)\s+DC.*?"                     # DC
        r"([\d.]+)\s+DP",                      # DP
        re.IGNORECASE | re.DOTALL
    )
    
    # Process each segment individually
    for seg in segments:
        seg = seg.strip()
        if not seg.startswith("Drilling/Circ Rate"):
            continue  # Skip any header or unrelated segments
        # Replace newline characters with spaces to form a continuous string
        seg_clean = " ".join(seg.splitlines())
        match = pattern.search(seg_clean)
        if match:
            rate_id, pressure, spm, gal_stroke, gpm, bpm, dc, dp = match.groups()
            circ_rates.append({
                "RateID": rate_id,
                "Pressure(PSI)": pressure,
                "SPM": spm,
                "Gal/Stoke": gal_stroke,
                "GPM": gpm,
                "BPM": bpm,
                "DC": dc,
                "DP": dp
            })
        else:
            # Optional: log a warning if no match is found for a segment
            print(f"Warning: No match found in segment:\n{seg_clean}")
            
    return circ_rates

def process_pumps(pumps_img_path, debug=True):
    """
    Processes the pumps section:
      - Reads image using PIL,
      - Performs OCR,
      - Parses pumps table and drilling/circ rates,
      - Returns combined results as JSON and a DataFrame.
    """
    pil_img = safe_read_image(pumps_img_path)
    ocr_text = perform_ocr(pil_img)
    if debug:
        logger.info("Pumps OCR Text:\n" + ocr_text)
    pumps = parse_pumps_table(ocr_text)
    circ_rates = parse_drilling_circ_rates(ocr_text)
    final_data = {"Pumps": pumps, "DrillingCircRates": circ_rates}
    df_pumps = pd.DataFrame(pumps)
    df_circ = pd.DataFrame(circ_rates)
    if not df_pumps.empty and not df_circ.empty:
        df = pd.concat([df_pumps, df_circ], axis=0, ignore_index=False)
    elif not df_pumps.empty:
        df = df_pumps
    else:
        df = df_circ
    return final_data, df

In [0]:
# --------------------------------------------------------
# 5) Personnel Extraction process
# --------------------------------------------------------
def detect_text_regions_personnel(thresh_img, debug=True):
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    debug_img = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
            cv2.rectangle(debug_img, (x, y), (x+w, y+h), (0, 255, 0), 2)
    rois.sort(key=lambda b: (b[1], b[0]))
    logger.debug(f"Detected {len(rois)} text regions for personnel.")
    # if debug:
    #     show_image("Detected Personnel Text Regions", debug_img, size=(12, 12))
    return rois

def perform_ocr_on_rois_personnel(img, rois, debug=True):
    results = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config='--psm 6').strip()
        if not text:
            text = "[BLANK]"
        results.append((x, y, w, h, text))
        logger.debug(f"ROI bbox=({x}, {y}, {w}, {h}), text: '{text}'")
    return results

def group_rois_by_row(roi_results, threshold=20):
    roi_with_center = [(x, y, w, h, text, y + h/2) for (x, y, w, h, text) in roi_results]
    roi_with_center.sort(key=lambda r: r[5])
    groups = []
    current_group = []
    current_center = None
    for roi in roi_with_center:
        x, y, w, h, text, y_center = roi
        if current_center is None:
            current_center = y_center
            current_group.append((x, y, w, h, text))
        elif abs(y_center - current_center) < threshold:
            current_group.append((x, y, w, h, text))
        else:
            groups.append(current_group)
            current_group = [(x, y, w, h, text)]
            current_center = y_center
    if current_group:
        groups.append(current_group)
    logger.debug(f"Grouped ROIs into {len(groups)} rows.")
    return groups

def preprocess_personnel_data_from_rows(groups):
    personnel_data = []
    header_lines = {
        "personnel", 
        "company contractor no. personnel daily hours cumulative hours",
        "ssn"
    }
    for group in groups:
        group.sort(key=lambda r: r[0])
        row_text = " ".join([r[4] for r in group]).strip()
        logger.debug(f"Processing row: '{row_text}'")
        if row_text.lower() in header_lines:
            logger.debug("Skipping header row.")
            continue
        tokens = row_text.split()
        numeric_tokens = re.findall(r'\d+(?:\.\d+)?', row_text)
        logger.debug(f"Row tokens: {tokens}")
        logger.debug(f"Numeric tokens found: {numeric_tokens}")
        if tokens[0].lower().startswith("totals"):
            if len(numeric_tokens) >= 2:
                try:
                    daily_hours = int(float(numeric_tokens[0]))
                    cumulative_hours = numeric_tokens[1]
                except ValueError as e:
                    logger.error(f"Error parsing Totals row: {row_text} => {e}")
                    continue
                row_dict = {
                    "Company": "",
                    "Contractor": "",
                    "No. Personnel": "Totals",
                    "Daily Hours": daily_hours,
                    "Cumulative Hours": cumulative_hours
                }
                logger.info(f"Totals row parsed: {row_dict}")
                personnel_data.append(row_dict)
            else:
                logger.warning(f"Totals row without sufficient numbers: {row_text}")
            continue
        if len(numeric_tokens) >= 3:
            try:
                no_personnel = int(float(numeric_tokens[-3]))
                daily_hours = int(float(numeric_tokens[-2]))
                cumulative_hours = int(float(numeric_tokens[-1]))
                logger.debug(f"Extracted: no_personnel={no_personnel}, daily_hours={daily_hours}, cumulative_hours={cumulative_hours}")
            except ValueError as e:
                logger.error(f"Error converting numbers in row: {row_text} => {e}")
                continue
            pattern = (r'\s*' + re.escape(numeric_tokens[-3]) +
                       r'\s+' + re.escape(numeric_tokens[-2]) +
                       r'\s+' + re.escape(numeric_tokens[-1]) + r'\s*$')
            text_only = re.sub(pattern, '', row_text).strip()
        elif len(numeric_tokens) == 1:
            try:
                cumulative_hours = int(float(numeric_tokens[0]))
                logger.debug(f"Single numeric token, cumulative_hours: {cumulative_hours}")
            except ValueError as e:
                logger.error(f"Error converting single number in row: {row_text} => {e}")
                continue
            no_personnel = None
            daily_hours = None
            pattern = r'\s*' + re.escape(numeric_tokens[0]) + r'\s*$'
            text_only = re.sub(pattern, '', row_text).strip()
        else:
            logger.warning(f"Row has unexpected number of numeric tokens: {row_text}")
            continue

        if "service company" in text_only.lower():
            parts = re.split(r'(?i)service company', text_only, maxsplit=1)
            company = parts[0].strip()
            contractor = "Service Company"
        else:
            company = text_only
            contractor = "Service Company"
        
        row_dict = {
            "Company": company,
            "Contractor": contractor,
            "No. Personnel": no_personnel,
            "Daily Hours": daily_hours,
            "Cumulative Hours": cumulative_hours
        }
        logger.info(f"Parsed row: {row_dict}")
        personnel_data.append(row_dict)
    return {"PERSONNEL": personnel_data}

def process_personnel(personnel_img_path, debug=True):
    """Processes the personnel section: read image, detect ROIs, OCR, group and parse rows."""
    img = read_cropped_section_image(personnel_img_path)
    # if debug:
        # show_image("Original Personnel Image", img, size=(10,10))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh_img = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 11, 2)
    # if debug:
        # show_image("Thresholded Personnel Image", cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR), size=(8,8))
    rois = detect_text_regions_personnel(thresh_img, debug=debug)
    roi_results = perform_ocr_on_rois_personnel(img, rois, debug=debug)
    grouped_rows = group_rois_by_row(roi_results, threshold=20)
    data_dict = preprocess_personnel_data_from_rows(grouped_rows)
    df = pd.DataFrame(data_dict["PERSONNEL"]) if data_dict["PERSONNEL"] else pd.DataFrame(
        columns=["Company", "Contractor", "No. Personnel", "Daily Hours", "Cumulative Hours"])
    return data_dict, df

In [0]:
def build_casing_dict_from_rois(roi_texts, expected_headers, debug=False):
    """
    Groups OCR results for CASING into rows and returns a list of dictionaries.
    """
    row_tolerance = 10
    rows = []
    current_row = []
    prev_y = None
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        rows.append(current_row)
    row_strings = []
    for i, row in enumerate(rows):
        row.sort(key=lambda cell: cell[0])
        line = " ".join(cell[4] for cell in row).strip()
        row_strings.append(line)
        if debug:
            logger.info(f"Grouped Row {i}: {line}")
    all_lines = []
    for line in row_strings:
        for sub in line.split("\n"):
            sub = sub.strip()
            if sub:
                all_lines.append(sub)
    if debug:
        logger.info(f"All extracted lines: {all_lines}")
    data_lines = []
    for line in all_lines:
        tokens = re.split(r'\s{2,}', line)
        if len(tokens) == 1:
            tokens = line.split()
        if debug:
            logger.info(f"Processing line: '{line}' -> tokens: {tokens}")
        lower_tokens = [t.lower() for t in tokens]
        if "type" in lower_tokens and "size" in lower_tokens:
            logger.info(f"Skipping header line: {tokens}")
            continue
        if len(tokens) < len(expected_headers):
            logger.warning(f"Line has fewer tokens than expected: {tokens}")
            tokens = tokens + [""] * (len(expected_headers) - len(tokens))
        else:
            tokens = tokens[:len(expected_headers)]
        data_lines.append(tokens)
    casing_list = [{expected_headers[i]: tokens[i] for i in range(len(expected_headers))}
                   for tokens in data_lines]
    if debug:
        logger.info(f"Final casing list: {casing_list}")
    return casing_list

def process_casing(img_path, debug=False):
    img = safe_read_image(img_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    logger.info(f"Detected {len(rois)} text regions for CASING")
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    expected_headers = ["Type", "Size", "Weight", "Grade", "Connection", "Top MD", "Bottom MD", "TOC"]
    casing_list = build_casing_dict_from_rois(roi_texts, expected_headers, debug=debug)
    df = pd.DataFrame(casing_list)
    logger.info(f"CASING DataFrame shape: {df.shape}")
    return {"CASING": casing_list}, df

# --- BOP Pipeline ---
def perform_ocr(img):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, config="--psm 6")
    return text.strip()

def extract_bop_info(ocr_text):
    patterns = {
        "Last BOP Test Date": r"Last BOP Test Date\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Last BOP Drill": r"Last BOP Drill\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Next BOP Test": r"Next BOP Test\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})"
    }
    result = {}
    for key, regex in patterns.items():
        match = re.search(regex, ocr_text, re.IGNORECASE)
        result[key] = match.group(1) if match else ""
    return result

def process_bop(img_path, debug=False):
    img = safe_read_image(img_path)
    ocr_text = perform_ocr(img)
    logger.info("BOP OCR extraction complete.")
    bop_info = extract_bop_info(ocr_text)
    df = pd.DataFrame(list(bop_info.items()), columns=["Key", "Value"])
    logger.info(f"BOP DataFrame shape: {df.shape}")
    return {"BOP": bop_info}, df

# --- CONSUMABLES Pipeline ---
def build_consumables_dict_from_rois(roi_texts, debug=False):
    row_tolerance = 10
    rows = []
    current_row = []
    prev_y = None
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        rows.append(current_row)
    grouped_rows = []
    for i, row in enumerate(rows):
        row.sort(key=lambda cell: cell[0])
        line = " ".join(cell[4] for cell in row).strip()
        grouped_rows.append(line)
        if debug:
            logger.info(f"Grouped Row {i}: {line}")
    data_rows = []
    for line in grouped_rows:
        lower_line = line.lower()
        if ("consumable" in lower_line and "received" in lower_line) or "nun" in lower_line:
            continue
        if len(line.split()) < 5:
            continue
        data_rows.append(line)
    if debug:
        logger.info(f"Data rows to parse: {data_rows}")
    consumables_list = []
    for line in data_rows:
        tokens = re.split(r'\s+', line)
        if len(tokens) > 5:
            first = " ".join(tokens[:-4])
            tokens = [first] + tokens[-4:]
        if len(tokens) != 5:
            logger.warning(f"Skipping row (unexpected token count): {tokens}")
            continue
        row_dict = {
            "Consumable": tokens[0],
            "Daily Received (gal)": tokens[1],
            "Daily Used (gal)": tokens[2],
            "Cumulative Used (gal)": tokens[3],
            "Daily on Hand (gal)": tokens[4]
        }
        consumables_list.append(row_dict)
    return consumables_list

def process_consumables(img_path, debug=False):
    img = safe_read_image(img_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    logger.info(f"Detected {len(rois)} text regions for CONSUMABLES")
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    consumables_list = build_consumables_dict_from_rois(roi_texts, debug=debug)
    df = pd.DataFrame(consumables_list)
    logger.info(f"CONSUMABLES DataFrame shape: {df.shape}")
    return {"CONSUMABLES": consumables_list}, df


In [0]:

# ---------------------------------------------------------------------
# build_bit_info_dict_from_rois
# ---------------------------------------------------------------------
def build_bit_info_dict_from_rois(roi_texts, debug=True):
    """
    Custom parsing for the multi-row header layout:
      Row 0 => Table Title (e.g. "DRILL BITS ...")
      Row 1 => Super Headers: "Bit Data  Nozzles  Depth  Hours  Dull Grade"
      Row 2 => Sub-headers:   "Bit # Size Make Model Serial #  Number x Size TFA  In Out Feet ROP  Total On Btm  I oO D L B G oO RP"
      Row 3 => Data row #1
      Row 4 => Data row #2
      ...
    We'll parse row 1 and row 2 to define column groups. Then parse each subsequent row in chunks.
    """
    # Step 1) Group bounding boxes by y-coordinate
    row_tolerance = 10
    grouped_rows = []
    current_row = []
    prev_y = None

    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            grouped_rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        grouped_rows.append(current_row)

    # Step 2) Convert each row group into a single string
    row_strings = []
    for i, row_cells in enumerate(grouped_rows):
        row_cells.sort(key=lambda c: c[0])  # left->right
        line = " ".join(cell[4] for cell in row_cells)
        line = line.replace("\n", " ").strip()  # flatten
        row_strings.append(line)
        if debug:
            logger.info(f"Row {i} => {line}")

    # We expect something like:
    # Row 0 => "DRILL BITS DRILL BITS [BLANK]"
    # Row 1 => "Bit Data Nozzles Depth Hours Dull Grade"
    # Row 2 => "Bit # Size Make Model Serial # Number x Size TFA In Out Feet ROP Total On Btm I oO D L B G oO RP"
    # Row 3 => "4 6.750 BAKER DD40+TWS 5355166 6X12 0.66 ..."
    # Row 4 => "3 9.875 REED TKS6-H1 A308739 7X12 0.77 ..."

    # Step 3) Identify the row indices for:
    #  - Title (row 0)
    #  - Super headers (row 1)
    #  - Sub-headers (row 2)
    #  - Data rows (row 3, 4, ...)
    if len(row_strings) < 3:
        logger.warning("Not enough rows found for this layout.")
        return [], pd.DataFrame()

    # We'll skip row 0 (table title).
    super_header_line = row_strings[1] if len(row_strings) > 1 else ""
    sub_header_line   = row_strings[2] if len(row_strings) > 2 else ""
    data_lines        = row_strings[3:]  # everything after row 2

    if debug:
        logger.info(f"Super Headers => {super_header_line}")
        logger.info(f"Sub Headers => {sub_header_line}")
        logger.info(f"Data Lines => {data_lines}")

    # Step 4) Define the "super header" groups and sub-headers
    # We'll do a simpler approach: we know how many tokens each group has:
    #  Bit Data => 5, Nozzles => 2, Depth => 4, Hours => 2, Dull Grade => 8 (Total = 21)
    final_columns = [
        "Bit #", "Size", "Make", "Model", "Serial #",         # 5
        "Nozzle-(Number x Size)", "Nozzle-TFA",               # 2
        "Depth-In", "Depth-Out", "Depth-Feet", "Depth-ROP",   # 4
        "Hours-Total", "Hours-On Btm",                        # 2
        "Dull Grade-I", "Dull Grade-O1", "Dull Grade-D", "Dull Grade-L", 
        "Dull Grade-B", "Dull Grade-G", "Dull Grade-O2", "Dull Grade-RP"  # 8
    ]

    # Step 5) Parse each data row in chunks of 21 tokens
    structured_data = []
    for line in data_lines:
        tokens = line.split()
        # We expect 21 tokens per data row; pad or truncate if necessary
        if len(tokens) < 21:
            tokens += [""] * (21 - len(tokens))
        elif len(tokens) > 21:
            tokens = tokens[:21]

        row_dict = {}
        for col_idx, col_name in enumerate(final_columns):
            row_dict[col_name] = tokens[col_idx] if col_idx < len(tokens) else ""

        structured_data.append(row_dict)
        if debug:
            logger.info(f"Parsed row => {row_dict}")

    # Step 6) Convert to DataFrame and JSON
    df = pd.DataFrame(structured_data)
    if debug:
        logger.info("DataFrame Preview:")
        logger.info(df.head())

    df.to_csv("bit_info_data.csv", index=False)
    logger.info("Data saved successfully as CSV.")

    structured_data_json = df.to_dict(orient='records')
    with open("bit_info_data.json", "w") as json_file:
        json.dump(structured_data_json, json_file, indent=4)
    logger.info("Data saved successfully in JSON format.")

    return structured_data_json, df

# ---------------------------------------------------------------------
# main_bit_info_pipeline
# ---------------------------------------------------------------------
def process_drill_bits(img_path, debug=True):
    """
    Main pipeline for extracting the BIT DETAILS table from your layout.
    """
    # Replace with your actual path
    #bit_info_img_path = "/dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png"

    try:
        img = safe_read_image(img_path)
        logger.info("Image loaded successfully.")
    except Exception as e:
        logger.error(e)
        return

    # 1) Preprocess
    thresh_img = preprocess_image(img, debug=True)

    # 2) Detect bounding boxes
    rois = detect_text_regions(thresh_img, debug=True)

    # 3) Perform OCR
    roi_texts = perform_ocr_on_rois(img, rois, debug=True)
    
    # --- New Step: Annotate and show OCR results on the image ---
    # annotate_ocr_results(img, roi_texts)

    # 4) Build structured data (tailored to your table layout)
    bit_info_list, df = build_bit_info_dict_from_rois(roi_texts, debug=True)

    # 5) Show final JSON in logs
    final_output = {"BIT DETAILS": bit_info_list}
    logger.info(json.dumps(final_output, indent=4))
    print(df)

    # 6) Save final results
    output_folder = "/dbfs/mnt/mini-proj-dd/final_bit_info_results"
    os.makedirs(output_folder, exist_ok=False)
    with open(os.path.join(output_folder, "bit_info_data.json"), "w") as f:
        json.dump(final_output, f, indent=4)
    df.to_csv(os.path.join(output_folder, "bit_info_data.csv"), index=False)
    logger.info("Data saved successfully in output folder.")


In [0]:
def build_dir_info_dict_from_rois(roi_texts, debug=True):
    all_texts = [t[4] for t in roi_texts]
    daily_cum_idx = next((i for i, txt in enumerate(all_texts)
                           if "daily" in txt.lower() and "cumulative" in txt.lower()), None)
    if daily_cum_idx is None:
        logger.warning("Could not find 'Daily Cumulative' bounding box.")
        return {}, pd.DataFrame()
    cat_idx = daily_cum_idx + 1
    if cat_idx >= len(all_texts):
        logger.warning("No bounding box after 'Daily Cumulative'.")
        return {}, pd.DataFrame()
    categories_box = all_texts[cat_idx]
    lines = [ln.strip() for ln in categories_box.split("\n") if ln.strip()]
    if len(lines) < 5:
        logger.warning(f"Expected 5 category lines, got {len(lines)}: {lines}")
    def safe_get(idx):
        return all_texts[idx] if 0 <= idx < len(all_texts) else ""
    structured = []
    for i in range(4):
        cat_name = lines[i] if i < len(lines) else f"Unknown Category {i+1}"
        daily_box = safe_get(cat_idx + 1 + (i * 2))
        cum_box = safe_get(cat_idx + 2 + (i * 2))
        structured.append({
            "Category": cat_name,
            "Daily": "" if daily_box == "[BLANK]" else daily_box,
            "Cumulative": "" if cum_box == "[BLANK]" else cum_box
        })
    last_box = safe_get(cat_idx + 9)
    last_cat = lines[4] if len(lines) >= 5 else "Rotating Footage"
    remainder = last_box.replace(last_cat, "").strip()
    tokens = remainder.split()
    daily_val = tokens[0] if len(tokens) >= 2 else ""
    cum_val = tokens[1] if len(tokens) >= 2 else ""
    structured.append({
        "Category": last_cat,
        "Daily": "" if daily_val == "[BLANK]" else daily_val,
        "Cumulative": "" if cum_val == "[BLANK]" else cum_val
    })
    df = pd.DataFrame(structured)
    logger.info(f"DIR INFO DataFrame shape: {df.shape}")
    return {"DIR INFO": structured}, df

def process_dir_info(section_path, debug=True):
    img = safe_read_image(section_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    return build_dir_info_dict_from_rois(roi_texts, debug=debug)

In [0]:
import os
import re
import cv2
import pytesseract
import numpy as np
import pandas as pd
import logging
import json
import math
#from matplotlib import pyplot as plt  # Commented out since we do not show images
from PIL import Image

# -------------------------------
# Logger Setup
# -------------------------------
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("SectionExtractor")

# -------------------------------
# Utility Functions
# -------------------------------
def dbfs_to_local_path(dbfs_path):
    """Convert a DBFS URI (e.g. "dbfs:/mnt/xxx") to a local path ("/dbfs/mnt/xxx")."""
    if dbfs_path.startswith("dbfs:/"):
        return "/dbfs/" + dbfs_path[len("dbfs:/"):]
    return dbfs_path

def sanitize_section_name(section):
    """Convert section name to safe file name."""
    return section.lower().replace(" ", "_").replace("/", "_").replace(":", "")

def safe_read_image_cv2(image_path):
    local_path = dbfs_to_local_path(image_path)
    logger.info(f"Reading image from: {local_path}")
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"Image file not found at {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        raise ValueError(f"OpenCV failed to read image: {local_path}")
    return img

def safe_read_image_pil(image_path):
    local_path = dbfs_to_local_path(image_path)
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"Image file not found at {local_path}")
    return Image.open(local_path)

# (Note: show_image function omitted since we do not display images.)

# -------------------------------
# Preprocessing Functions
# -------------------------------
def preprocess_image(image, debug=False):
    """Convert image to grayscale and apply adaptive thresholding."""
    if len(image.shape) == 2:
        gray = image
    elif len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        raise ValueError("Unexpected number of channels in input image.")
    # (Image display omitted.)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 15, 9)
    return thresh

# -------------------------------
# Generic OCR Functions
# -------------------------------
def detect_text_regions(thresh_img, debug=False):
    """Detect text regions from thresholded image."""
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
    rois.sort(key=lambda b: (b[1], b[0]))
    return rois

def perform_ocr_on_rois(img, rois, debug=False):
    """Perform OCR on each region; returns list of (x,y,w,h,text)."""
    results = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip() or "[BLANK]"
        results.append((x, y, w, h, text))
        logger.debug(f"OCR Box ({x},{y},{w},{h}): {text}")
    return results

# -------------------------------
# DDR Pipeline
# -------------------------------
def process_daily_drilling_report(image_path, debug=True):
    logger.info(f"Processing DDR image from: {image_path}")
    img = safe_read_image_cv2(image_path)
    # Coordinates for the top-right DDR region; adjust as needed.
    coords = (1600, 0, 950, 185)
    debug_path = os.path.join(dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results"), "debug_top_right.png")
    # Draw bounding box and crop (no image display)
    x, y, w, h = coords
    cropped = img[y:y+h, x:x+w]
    # Preprocess using DDR-specific method
    if len(cropped.shape) == 2:
        gray = cropped
    else:
        gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    equalized = cv2.equalizeHist(gray)
    blurred = cv2.GaussianBlur(equalized, (5,5), 0)
    processed = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 11, 2)
    ocr_text = pytesseract.image_to_string(processed, config="--psm 6").strip()
    logger.info("DDR OCR extraction complete.")
    logger.debug(f"OCR Text: {ocr_text}")
    expected_keys = ["Report Date", "Report Num", "Rig"]
    # Extract key-value pairs from text.
    combined = " ".join(line.strip() for line in ocr_text.splitlines() if line.strip())
    combined = re.sub(r'\s+', ' ', combined)
    extracted = {}
    for i, key in enumerate(expected_keys):
        if i < len(expected_keys) - 1:
            next_key = expected_keys[i+1]
            pattern = rf'{re.escape(key)}\s*:\s*(.*?)(?=\s*{re.escape(next_key)}\s*:|$)'
        else:
            pattern = rf'{re.escape(key)}\s*:\s*(.*)'
        match = re.search(pattern, combined, re.IGNORECASE)
        extracted[key] = match.group(1).strip() if match and match.group(1).strip() else None
    if extracted.get("Report Num"):
        m = re.search(r'^(\d+)\.?$', extracted["Report Num"])
        extracted["Report Num"] = m.group(1) if m else extracted["Report Num"]
    df = pd.DataFrame(list(extracted.items()), columns=["Key", "Value"])
    return {"DAILY DRILLING REPORT": extracted}, df

# -------------------------------
# WELL/JOB INFORMATION Pipeline
# -------------------------------
def process_well_job_info(section_path, debug=True):
    img = safe_read_image(section_path)
    ocr_text = perform_ocr(img)
    logger.info("Well/Job OCR extraction complete.")
    expected_keys = [
        "Well Name", "Job Name", "Supervisor(s)", "Field", "Sec/Twn/Rng", "Phone",
        "AFE #", "API #", "Email", "Contractor", "Elevation", "RKB",
        "Spud Date", "Days from Spud", "Days on Loc", "MD/TVD", "24 Hr Footage",
        "Present Operations", "Activity Planned"
    ]
    combined = " ".join(line.strip() for line in ocr_text.splitlines() if line.strip())
    combined = re.sub(r'\s+', ' ', combined)
    result = {}
    for i, key in enumerate(expected_keys):
        if i < len(expected_keys) - 1:
            next_key = expected_keys[i+1]
            pattern = re.escape(key) + r'\s*:\s*(.*?)(?=\s*' + re.escape(next_key) + r'\s*:|$)'
        else:
            pattern = re.escape(key) + r'\s*:\s*(.*)'
        match = re.search(pattern, combined, re.IGNORECASE)
        result[key] = match.group(1).strip() if match else ""
    df = pd.DataFrame(list(result.items()), columns=["Key", "Value"])
    logger.info(f"WELL/JOB DataFrame shape: {df.shape}")
    return {"WELL/JOB INFORMATION": result}, df
    
# -------------------------------
# MUD Pipeline
# -------------------------------
def read_cropped_section_image(section_path):
    local_path = dbfs_to_local_path(section_path)
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        raise ValueError(f"OpenCV failed to load image: {local_path}")
    return img

def preprocess_image_mud(img, debug=True):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 15, 9)
    return thresh

def detect_text_regions_mud(thresh_img, debug=True):
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
    rois.sort(key=lambda b: (b[1], b[0]))
    return rois

def perform_ocr_on_rois_mud(img, rois, debug=True):
    results = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip() or "[BLANK]"
        results.append((x, y, w, h, text))
    return results

def parse_value_row_tokens(expected_headers, tokens):
    expected_token_count = (len(expected_headers) - 1) + 3
    if len(tokens) < expected_token_count:
        tokens += ["[BLANK]"] * (expected_token_count - len(tokens))
    elif len(tokens) > expected_token_count:
        tokens = tokens[:expected_token_count]
    result = {}
    idx = 0
    for header in expected_headers:
        if header == "GELS (10s/10m/30m)":
            gels_tokens = tokens[idx:idx+3]
            result[header] = {"10s": gels_tokens[0], "10m": gels_tokens[1], "30m": gels_tokens[2]}
            idx += 3
        else:
            result[header] = tokens[idx]
            idx += 1
    return result

def build_mud_dict_from_rois(roi_texts, expected_headers):
    row_tolerance = 10
    rows = []
    current_row = []
    prev_y = None
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        rows.append(current_row)
    row_strings = []
    for i, row in enumerate(rows):
        row.sort(key=lambda cell: cell[0])
        line = " ".join(cell[4] for cell in row)
        row_strings.append(line)
    # Identify header and data rows (this logic may need adjustment)
    header1_line = None
    value1_line = None
    header2_line = None
    value2_line = None
    for i, r_text in enumerate(row_strings):
        if "type" in r_text.lower() and not header1_line:
            header1_line = r_text
            if i+1 < len(row_strings):
                value1_line = row_strings[i+1]
        elif header1_line and not header2_line and any(kw in r_text.lower() for kw in ["rpm", "mud", "loss", "comments"]):
            header2_line = r_text
            if i+1 < len(row_strings):
                value2_line = row_strings[i+1]
            break
    if value1_line is None:
        logger.error("No data row found for Mud section!")
        return {}
    tokens1 = value1_line.split()
    tokens2 = value2_line.split() if value2_line else []
    combined_tokens = tokens1 + tokens2
    return parse_value_row_tokens(expected_headers, combined_tokens)

def process_mud(image_path, debug=True):
    img = read_cropped_section_image(image_path)
    thresh_img = preprocess_image_mud(img, debug=debug)
    rois = detect_text_regions_mud(thresh_img, debug=debug)
    roi_texts = perform_ocr_on_rois_mud(img, rois, debug=debug)
    expected_headers = [
        "Type", "Weight In", "Weight Out", "pH", "CAKE",
        "GELS (10s/10m/30m)", "Oil/Water", "FV", "ES", "PV",
        "YP", "CL", "Ca", "LGS", "WL", "HTHP Loss", "3 RPM",
        "6 RPM", "Mud Pits and Hole Volume", "24 Hr Loss",
        "Total Loss", "Comments"
    ]
    mud_dict = build_mud_dict_from_rois(roi_texts, expected_headers)
    # If mud_dict is a dict, convert to DataFrame of key-value pairs; else assume list.
    if isinstance(mud_dict, dict):
        df = pd.DataFrame(list(mud_dict.items()), columns=["Key", "Value"])
    else:
        df = pd.DataFrame(mud_dict)
    return {"MUD": mud_dict}, df

# ---------------------------------------------------------------------
# 3) Survey Extraction process
# ---------------------------------------------------------------------
def build_survey_dict_from_rois(roi_texts, expected_headers):
    row_tolerance = 10
    rows = []
    current_row = []
    prev_y = None
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        rows.append(current_row)
    
    row_strings = []
    for i, row in enumerate(rows):
        row.sort(key=lambda cell: cell[0])
        line = " ".join(cell[4] for cell in row)
        row_strings.append(line)
        logger.info(f"Grouped Row {i}: {line}")
    
    all_lines = []
    for line in row_strings:
        for subline in line.split("\n"):
            subline = subline.strip()
            if subline:
                all_lines.append(subline)
    logger.info(f"All extracted lines: {all_lines}")
    
    data_lines = []
    for line in all_lines:
        tokens = re.split(r'\s{2,}', line)
        if len(tokens) == 1:
            tokens = line.split()
        lower_tokens = [t.lower() for t in tokens]
        if "md" in lower_tokens and "inclination" in lower_tokens:
            logger.info(f"Skipping header line: {tokens}")
            continue
        if len(tokens) < len(expected_headers):
            logger.warning(f"Line has fewer tokens than expected: {tokens}")
            continue
        tokens = tokens[:len(expected_headers)]
        data_lines.append(tokens)
    
    survey_list = []
    for tokens in data_lines:
        row_dict = {expected_headers[i]: tokens[i] for i in range(len(expected_headers))}
        survey_list.append(row_dict)
    return survey_list

def sort_survey_data(survey_list):
    def md_value(row):
        try:
            return float(row["MD"].replace(",", ""))
        except Exception:
            return 0
    sorted_list = sorted(survey_list, key=md_value, reverse=False)
    filtered_list = [row for row in sorted_list if not row["MD"].upper().startswith("SURVEY")]
    return filtered_list

def process_survey(survey_img_path, debug=True):
    expected_headers = ["MD", "Inclination", "Azimuth", "DLS", "TVD"]
    img = safe_read_image(survey_img_path)
    # if debug:
        # show_image("Original Survey Image", img, size=(12,12))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, 9
    )
    # if debug:
        # show_image("Adaptive Threshold", thresh, cmap="gray")
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
    rois.sort(key=lambda b: (b[1], b[0]))
    if debug:
        debug_img = cv2.cvtColor(thresh, cv2.COLOR_GRAY2BGR)
        for (x, y, w, h) in rois:
            cv2.rectangle(debug_img, (x, y), (x+w, y+h), (0,255,0), 2)
    roi_texts = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip()
        if not text:
            text = "[BLANK]"
        roi_texts.append((x, y, w, h, text))
        if debug:
            logger.info(f"OCR Box ({x},{y},{w},{h}): {text}")
    survey_list = build_survey_dict_from_rois(roi_texts, expected_headers)
    survey_list = sort_survey_data(survey_list)
    final_output = {"SURVEY DATA": survey_list}
    df = pd.DataFrame(survey_list)
    return final_output, df

# ---------------------------------------------------------------------
# 4) BOP Extraction process
# ---------------------------------------------------------------------
def process_bop(ocr_text, debug= True):
    pattern = {
        "Last BOP Test Date": r"Last BOP Test Date\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Last BOP Drill": r"Last BOP Drill\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Next BOP Test": r"Next BOP Test\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})"
    }
    result = {}
    for key, regex in pattern.items():
        match = re.search(regex, ocr_text, re.IGNORECASE)
        result[key] = match.group(1) if match else ""
    return result

def process_bop(section_path, debug=False):
    img = safe_read_image(section_path)
    ocr_text = perform_ocr(img)
    logger.info("BOP OCR extraction complete.")
    patterns = {
        "Last BOP Test Date": r"Last BOP Test Date\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Last BOP Drill": r"Last BOP Drill\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Next BOP Test": r"Next BOP Test\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})"
    }
    result = {}
    for key, regex in patterns.items():
        match = re.search(regex, ocr_text, re.IGNORECASE)
        result[key] = match.group(1) if match else ""
    df = pd.DataFrame(list(result.items()), columns=["Key", "Value"])
    logger.info(f"BOP DataFrame shape: {df.shape}")
    return {"BOP": result}, df

# ---------------------------------------------------------------------
# 6) BHA Extraction process
# ---------------------------------------------------------------------
def extract_bha_data(image_path):
    image = Image.open(image_path)
    ocr_text = pytesseract.image_to_string(image)
    patterns = {
        "Drill Pipe Detail": r"Drill Pipe Detail:\s*([^\n]+)",
        "Size": r"Size:\s*([\d.]+)\b",
        "Wt./Ft": r"Wt\./Ft:\s*([\d.]+)\b",
        "Connection": r"Connection:\s*([\w\d-]+)\b",
        "ID": r"ID:\s*([\d.]+)\b",
        "Drill Bit": r"Drill Bit:\s*([^\n;]+)",
        "Motor": r"Motor:\s*([^\n;]+)",
        "MWD Tool": r"MWD Tool:\s*([^\n;]+)",
        "Monel Collar": r"Monel Collar:\s*([^\n;]+)",
        "X-Over": r"X-Over:\s*([^\n;]+)",
        "Sub": r"Sub:\s*([^\n;]+)",
        "HWDP": r"HWDP:\s*([^\n;]+)",
        "Drill Pipe": r"Drill Pipe:\s*([\d.]+(?:\" DP)?)",
        "Reamer": r"Reamer:\s*([^\n;]+)",
        "Shock Sub": r"Shock Sub:\s*([^\n;]+)",
        "Total Length": r"Total Length:\s*(\d+)\b"
    }
    bha_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, ocr_text)
        if match:
            bha_data[key] = match.group(1).strip()
    if "Drill Pipe Detail" in bha_data:
        detail = bha_data["Drill Pipe Detail"]
        for remove_key in ["Size", "Wt./Ft", "Connection", "ID"]:
            if remove_key in bha_data:
                detail = re.sub(rf"{remove_key}:\s*{re.escape(bha_data[remove_key])}", "", detail).strip(",; ")
        bha_data["Drill Pipe Detail"] = detail
    structured_data = {
        "BHA": {
            "Drill Pipe Detail": bha_data.get("Drill Pipe Detail", ""),
            "Size": bha_data.get("Size", ""),
            "Wt./Ft": bha_data.get("Wt./Ft", ""),
            "Connection": bha_data.get("Connection", ""),
            "ID": bha_data.get("ID", ""),
            "BHA #4": {
                "Drill Bit": bha_data.get("Drill Bit", ""),
                "Motor": bha_data.get("Motor", ""),
                "MWD Tool": bha_data.get("MWD Tool", ""),
                "Monel Collar": bha_data.get("Monel Collar", ""),
                "X-Over": bha_data.get("X-Over", ""),
                "Sub": bha_data.get("Sub", ""),
                "HWDP": bha_data.get("HWDP", ""),
                "Drill Pipe": bha_data.get("Drill Pipe", ""),
                "Reamer": bha_data.get("Reamer", ""),
                "Shock Sub": bha_data.get("Shock Sub", "")
            },
            "Total Length": bha_data.get("Total Length", "")
        }
    }
    return structured_data

def process_bha(bha_img_path, debug=True):
    bha_json = extract_bha_data(bha_img_path)
    df = pd.json_normalize(bha_json["BHA"])
    return {"BHA": bha_json["BHA"]}, df

# -------------------------------
# BHA Pipeline
# -------------------------------
def process_bha(image_path, debug=True):
    bha_data = extract_bha_data(image_path)
    df = pd.DataFrame([bha_data.get("BHA", {})])
    return {"BHA": bha_data.get("BHA", {})}, df

def extract_bha_data(image_path):
    image = safe_read_image_pil(image_path)
    ocr_text = pytesseract.image_to_string(image)
    patterns = {
        "Drill Pipe Detail": r"Drill Pipe Detail:\s*([^\n]+)",
        "Size": r"Size:\s*([\d.]+)\b",
        "Wt./Ft": r"Wt\./Ft:\s*([\d.]+)\b",
        "Connection": r"Connection:\s*([\w\d-]+)\b",
        "ID": r"ID:\s*([\d.]+)\b",
        "Drill Bit": r"Drill Bit:\s*([^\n;]+)",
        "Motor": r"Motor:\s*([^\n;]+)",
        "MWD Tool": r"MWD Tool:\s*([^\n;]+)",
        "Monel Collar": r"Monel Collar:\s*([^\n;]+)",
        "X-Over": r"X-Over:\s*([^\n;]+)",
        "Sub": r"Sub:\s*([^\n;]+)",
        "HWDP": r"HWDP:\s*([^\n;]+)",
        "Drill Pipe": r"Drill Pipe:\s*([\d.]+(?:\" DP)?)",
        "Reamer": r"Reamer:\s*([^\n;]+)",
        "Shock Sub": r"Shock Sub:\s*([^\n;]+)",
        "Total Length": r"Total Length:\s*(\d+)\b"
    }
    bha_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, ocr_text)
        if match:
            bha_data[key] = match.group(1).strip()
    if "Drill Pipe Detail" in bha_data:
        detail = bha_data["Drill Pipe Detail"]
        for remove_key in ["Size", "Wt./Ft", "Connection", "ID"]:
            if remove_key in bha_data:
                detail = re.sub(rf"{remove_key}:\s*{re.escape(bha_data[remove_key])}", "", detail).strip(",; ")
        bha_data["Drill Pipe Detail"] = detail
    structured_data = {
        "BHA": {
            "Drill Pipe Detail": bha_data.get("Drill Pipe Detail", ""),
            "Size": bha_data.get("Size", ""),
            "Wt./Ft": bha_data.get("Wt./Ft", ""),
            "Connection": bha_data.get("Connection", ""),
            "ID": bha_data.get("ID", ""),
            "BHA #4": {
                "Drill Bit": bha_data.get("Drill Bit", ""),
                "Motor": bha_data.get("Motor", ""),
                "MWD Tool": bha_data.get("MWD Tool", ""),
                "Monel Collar": bha_data.get("Monel Collar", ""),
                "X-Over": bha_data.get("X-Over", ""),
                "Sub": bha_data.get("Sub", ""),
                "HWDP": bha_data.get("HWDP", ""),
                "Drill Pipe": bha_data.get("Drill Pipe", ""),
                "Reamer": bha_data.get("Reamer", ""),
                "Shock Sub": bha_data.get("Shock Sub", "")
            },
            "Total Length": bha_data.get("Total Length", "")
        }
    }
    return structured_data

# -------------------------------
# Bit Info Pipeline
# -------------------------------
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

def adaptive_threshold(image):
    return cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 15, 9)

def detect_text_regions_bit(image, debug=False):
    contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
    rois.sort(key=lambda b: (b[1], b[0]))
    return rois

def perform_ocr_on_rois_bit(img, rois, debug=False):
    results = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip() or "[BLANK]"
        results.append((x, y, w, h, text))
        logger.debug(f"Bit Info OCR Box ({x},{y},{w},{h}): {text}")
    return results

def build_bit_info_dict_from_rois(roi_texts, debug=False):
    row_tolerance = 10
    grouped_rows = []
    current_row = []
    prev_y = None
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            grouped_rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        grouped_rows.append(current_row)
    row_strings = []
    for i, row in enumerate(grouped_rows):
        row.sort(key=lambda cell: cell[0])
        line = " ".join(cell[4] for cell in row).replace("\n", " ").strip()
        row_strings.append(line)
        logger.info(f"Bit Info Row {i}: {line}")
    if len(row_strings) < 3:
        logger.warning("Not enough rows found for Bit Info layout.")
        return [], pd.DataFrame()
    data_lines = row_strings[3:]
    final_columns = [
        "Bit #", "Size", "Make", "Model", "Serial #",
        "Nozzle-(Number x Size)", "Nozzle-TFA",
        "Depth-In", "Depth-Out", "Depth-Feet", "Depth-ROP",
        "Hours-Total", "Hours-On Btm",
        "Dull Grade-I", "Dull Grade-O1", "Dull Grade-D", "Dull Grade-L", 
        "Dull Grade-B", "Dull Grade-G", "Dull Grade-O2", "Dull Grade-RP"
    ]
    structured_data = []
    for line in data_lines:
        tokens = line.split()
        if len(tokens) < 21:
            tokens += [""] * (21 - len(tokens))
        elif len(tokens) > 21:
            tokens = tokens[:21]
        row_dict = {final_columns[i]: tokens[i] for i in range(21)}
        structured_data.append(row_dict)
        logger.info(f"Bit Info Parsed row: {row_dict}")
    df = pd.DataFrame(structured_data)
    return structured_data, df

def process_bit_info(image_path, debug=False):
    img = safe_read_image_cv2(image_path)
    gray = get_grayscale(img)
    thresh = adaptive_threshold(gray)
    rois = detect_text_regions_bit(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois_bit(img, rois, debug=debug)
    bit_info_data, df = build_bit_info_dict_from_rois(roi_texts, debug=debug)
    return {"BIT INFO": bit_info_data}, df

# -------------------------------
# DAILY NUMBERS: OBSERVATION & INTERVENTION Pipeline
# -------------------------------
def process_obs_int(image_path, debug=False):
    img = safe_read_image_cv2(image_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    # Collect texts; filter out header words.
    types_list = []
    numbers_list = []
    for (_, _, _, _, text) in roi_texts:
        clean = text.strip()
        if clean.lower() in ["daily numbers: observation & intervention", "number", "[blank]"]:
            continue
        # If text can be converted to a number, assume it is a number.
        try:
            float(clean)
            numbers_list.append(clean)
        except ValueError:
            types_list.append(clean)
    # Use the maximum count from either list.
    count = max(len(types_list), len(numbers_list))
    # Pad lists if necessary.
    while len(types_list) < count:
        types_list.append("")
    while len(numbers_list) < count:
        numbers_list.append("")
    structured = [{"Type": types_list[i], "Number": numbers_list[i]} for i in range(count)]
    df = pd.DataFrame(structured)
    return {"DAILY NUMBERS: OBSERVATION & INTERVENTION": structured}, df

# -------------------------------
# Aggregator Main Process
# -------------------------------
def main():
    debug = True  # Set True for verbose logging.
    
    # Define image paths for all sections.
    image_paths = {
        "DAILY DRILLING REPORT": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_1.png",
        "WELL/JOB INFORMATION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png",
        "MUD": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png",
        "SURVEY DATA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png",
        "DIR INFO": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png",
        "DRILL BITS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png",
        "CASING": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_7.png",
        "BOP": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png",
        "PERSONNEL": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_9.png",
        "DAILY NUMBERS: OBSERVATION & INTERVENTION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png",
        "BHA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_11.png",
        "PUMPS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png",
        "COST DATA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png",
        "TIME BREAKDOWN": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_14.png",
        "CONSUMABLES": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_2_section_2.png"
    }
    
    # Define output folder.
    output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
    os.makedirs(output_folder, exist_ok=True)
    
    aggregated_json = {}
    aggregated_df = pd.DataFrame()
    
    # Define processing order as tuples: (Section Name, processing function, image path)
    processes = [
        ("DAILY DRILLING REPORT", process_daily_drilling_report, image_paths.get("DAILY DRILLING REPORT")),
        ("WELL/JOB INFORMATION", process_well_job_info, image_paths.get("WELL/JOB INFORMATION")),
        ("MUD", process_mud, image_paths.get("MUD")),
        ("SURVEY DATA", process_survey, image_paths.get("SURVEY DATA")),
        ("DIR INFO", process_dir_info, image_paths.get("DIR INFO")),
        ("DRILL BITS", process_drill_bits, image_paths.get("DRILL BITS")),
        ("CASING", process_casing, image_paths.get("CASING")),
        ("BOP", process_bop, image_paths.get("BOP")),
        ("PERSONNEL", process_personnel, image_paths.get("PERSONNEL")),
        ("DAILY NUMBERS: OBSERVATION & INTERVENTION", process_obs_int, image_paths.get("DAILY NUMBERS: OBSERVATION & INTERVENTION")),
        ("BHA", process_bha, image_paths.get("BHA")),
        ("PUMPS", process_pumps, image_paths.get("PUMPS")),
        ("COST DATA", process_cost_data, image_paths.get("COST DATA")),
        ("TIME BREAKDOWN", process_time_breakdown, image_paths.get("TIME BREAKDOWN")),
        ("CONSUMABLES", process_consumables, image_paths.get("CONSUMABLES"))
    ]
    
    for section, func, img_path in processes:
        try:
            logger.info(f"Processing section: {section}")
            data_json, df = func(img_path, debug)
            safe_section = sanitize_section_name(section)
            aggregated_json[section] = data_json.get(section, data_json)
            aggregated_df = pd.concat([aggregated_df, df], ignore_index=True)
            logger.info(f"{section} output:\n{json.dumps(data_json, indent=4)}")
            section_json_file = os.path.join(output_folder, f"{safe_section}.json")
            with open(section_json_file, "w") as f:
                json.dump(data_json, f, indent=4)
            section_csv_file = os.path.join(output_folder, f"{safe_section}.csv")
            df.to_csv(section_csv_file, index=False)
            logger.info(f"Saved {section} outputs to JSON and CSV.")
        except Exception as e:
            logger.error(f"{section} processing failed: {e}")
    
    agg_json_path = os.path.join(output_folder, "aggregated_data.json")
    with open(agg_json_path, "w") as f:
        json.dump(aggregated_json, f, indent=4)
    agg_csv_path = os.path.join(output_folder, "aggregated_data.csv")
    aggregated_df.to_csv(agg_csv_path, index=False)
    logger.info(f"Aggregated outputs saved to {agg_json_path} and {agg_csv_path}.")
    print("----- Aggregated JSON Output -----")
    print(json.dumps(aggregated_json, indent=4))

if __name__ == "__main__":
    main()


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-6103765517986980>, line 733[0m
[1;32m    730[0m     [38;5;28mprint[39m(json[38;5;241m.[39mdumps(aggregated_json, indent[38;5;241m=[39m[38;5;241m4[39m))
[1;32m    732[0m [38;5;28;01mif[39;00m [38;5;18m__name__[39m [38;5;241m==[39m [38;5;124m"[39m[38;5;124m__main__[39m[38;5;124m"[39m:
[0;32m--> 733[0m     main()

File [0;32m<command-6103765517986980>, line 701[0m, in [0;36mmain[0;34m()[0m
[1;32m    685[0m aggregated_df [38;5;241m=[39m pd[38;5;241m.[39mDataFrame()
[1;32m    687[0m [38;5;66;03m# Define processing order as tuples: (Section Name, processing function, image path)[39;00m
[1;32m    688[0m processes [38;5;241m=[39m [
[1;32m    689[0m     ([38;5;124m"[39m[38;5;124mDAILY DRILLING REPORT[39m[38;5;124m"[39m, process_daily_drilling_report,