In [0]:
#%run ./init

In [0]:
import re
import cv2
import numpy as np
import pandas as pd
import logging
import json
import os
import pytesseract
import pandas as pd
import logging
import json
from PIL import Image

In [0]:
# ---------------------------------------------------------------------
# Logger Setup
# ---------------------------------------------------------------------
logger = logging.getLogger("UnifiedExtractor")
logger.setLevel(logging.INFO)
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)

# ---------------------------------------------------------------------
# Utility Functions
# ---------------------------------------------------------------------
def dbfs_to_local_path(dbfs_path):
    """Convert a DBFS URI to a local path."""
    if dbfs_path.startswith("dbfs:/"):
        return "/dbfs/" + dbfs_path[len("dbfs:/"):]
    return dbfs_path


In [0]:
def read_cropped_section_image(section_path):
    """Read an image from a cropped section (handles DBFS paths)."""
    local_path = section_path
    if local_path.startswith("dbfs:"):
        local_path = local_path.replace("dbfs:", "")
    if local_path.startswith("/mnt/"):
        local_path = "/dbfs" + local_path
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        raise FileNotFoundError(f"OpenCV failed to load image: {local_path}")
    logger.info(f"Image loaded from {local_path} with shape {img.shape}")
    return img

In [0]:
# -----------------------------------------------------------------------------
# COMMON UTILITY FUNCTIONS
# -----------------------------------------------------------------------------
def safe_read_image(img_path):
    local_path = img_path if not img_path.startswith("dbfs:") else img_path.replace("dbfs:", "/dbfs")
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        raise ValueError(f"Failed to load image: {local_path}")
    logger.info(f"Image loaded from {local_path} with shape {img.shape}")
    return img

def preprocess_image(img, debug=False):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 15, 9)
    if debug:
        logger.info("Preprocessing completed (grayscale and threshold applied).")
    return thresh

def detect_text_regions(thresh_img, debug=False):
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
    rois.sort(key=lambda b: (b[1], b[0]))
    if debug:
        logger.info(f"Detected {len(rois)} text regions.")
    return rois

def perform_ocr_on_rois(img, rois, debug=False):
    results = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip() or "[BLANK]"
        results.append((x, y, w, h, text))
        if debug:
            logger.info(f"OCR Box ({x},{y},{w},{h}): {text}")
    return results

def perform_ocr(img):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray, config="--psm 6")

# def perform_ocr(img):
#     """
#     Performs OCR on the given PIL image using pytesseract.
#     Returns raw text.
#     """
#     text = pytesseract.image_to_string(img)
#     logger.info("OCR extraction complete.")
#     return text

def extract_key_value_from_text(text, expected_keys):
    combined = " ".join(line.strip() for line in text.splitlines() if line.strip())
    combined = re.sub(r'\s+', ' ', combined)
    result = {}
    for i, key in enumerate(expected_keys):
        if i < len(expected_keys) - 1:
            next_key = expected_keys[i+1]
            pattern = re.escape(key) + r'\s*:\s*(.*?)(?=\s*' + re.escape(next_key) + r'\s*:|$)'
        else:
            pattern = re.escape(key) + r'\s*:\s*(.*)'
        match = re.search(pattern, combined, re.IGNORECASE)
        result[key] = match.group(1).strip() if match and match.group(1).strip() else None
    return result


In [0]:
# -----------------------------------------------------------------------------
# PARSING HELPERS
# -----------------------------------------------------------------------------
def group_ocr_results(roi_texts, row_tolerance=10):
    rows = []
    current_row = []
    prev_y = None
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        rows.append(current_row)
    row_strings = []
    for row in rows:
        row.sort(key=lambda c: c[0])
        line = " ".join(cell[4] for cell in row)
        row_strings.append(line)
    return row_strings

In [0]:
def build_casing_dict_from_rois(roi_texts, expected_headers, debug=False):
    row_strings = group_ocr_results(roi_texts)
    all_lines = []
    for line in row_strings:
        for sub in line.split("\n"):
            sub = sub.strip()
            if sub:
                all_lines.append(sub)
    data_lines = []
    for line in all_lines:
        tokens = re.split(r'\s{2,}', line)
        if len(tokens) == 1:
            tokens = line.split()
        lower_tokens = [t.lower() for t in tokens]
        if "type" in lower_tokens and "size" in lower_tokens:
            logger.info(f"CASING - Skipping header line: {tokens}")
            continue
        if len(tokens) < len(expected_headers):
            logger.warning(f"CASING - Line has fewer tokens than expected: {tokens}")
            continue
        tokens = tokens[:len(expected_headers)]
        data_lines.append(tokens)
    casing_list = [{expected_headers[i]: tokens[i] for i in range(len(expected_headers))}
                   for tokens in data_lines]
    return casing_list

def process_casing_data(debug=False):
    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_7.png"
    img = safe_read_image(section_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    expected_headers = ["Type", "Size", "Weight", "Grade", "Connection", "Top MD", "Bottom MD", "TOC"]
    casing_list = build_casing_dict_from_rois(roi_texts, expected_headers, debug=debug)
    df = pd.DataFrame(casing_list)
    logger.info(f"CASING DataFrame shape: {df.shape}")
    return {"CASING": casing_list}, df

In [0]:
# -----------------------------------------------------------------------------
# process FUNCTIONS
# -----------------------------------------------------------------------------
def process_cost_data(debug=False):
    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png"
    img = safe_read_image(section_path)
    ocr_text = perform_ocr(img)
    logger.info("Cost OCR extraction complete.")
    expected_keys = [
        "Drilling AFE Amount", "Daily Drilling Cost", "Cumulative Drilling Cost",
        "Cumulative Well Cost", "Daily Mud Cost", "Cumulative Mud Cost"
    ]
    extracted = extract_key_value_from_text(ocr_text, expected_keys)
    df = pd.DataFrame(list(extracted.items()), columns=["Key", "Value"])
    logger.info(f"COST DataFrame shape: {df.shape}")
    return {"COST DATA": extracted}, df

In [0]:
def process_well_job_info(section_path, debug=False):
    img = safe_read_image(section_path)
    ocr_text = perform_ocr(img)
    logger.info("Well/Job OCR extraction complete.")
    expected_keys = [
        "Well Name", "Job Name", "Supervisor(s)", "Field", "Sec/Twn/Rng", "Phone",
        "AFE #", "API #", "Email", "Contractor", "Elevation", "RKB",
        "Spud Date", "Days from Spud", "Days on Loc", "MD/TVD", "24 Hr Footage",
        "Present Operations", "Activity Planned"
    ]
    combined = " ".join(line.strip() for line in ocr_text.splitlines() if line.strip())
    combined = re.sub(r'\s+', ' ', combined)
    result = {}
    for i, key in enumerate(expected_keys):
        if i < len(expected_keys) - 1:
            next_key = expected_keys[i+1]
            pattern = re.escape(key) + r'\s*:\s*(.*?)(?=\s*' + re.escape(next_key) + r'\s*:|$)'
        else:
            pattern = re.escape(key) + r'\s*:\s*(.*)'
        match = re.search(pattern, combined, re.IGNORECASE)
        result[key] = match.group(1).strip() if match else ""
    df = pd.DataFrame(list(result.items()), columns=["Key", "Value"])
    logger.info(f"WELL/JOB DataFrame shape: {df.shape}")
    return {"WELL/JOB INFORMATION": result}, df

In [0]:
def process_obs_int(section_path, debug=False):
    img = safe_read_image(section_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    header_str = "daily numbers: observation & intervention"
    all_texts = [t[4] for t in roi_texts]
    types_list, numbers_list = [], []
    for txt in all_texts:
        clean = txt.strip()
        if clean.lower() in [header_str, "number", "[blank]"]:
            continue
        try:
            float(clean)
            numbers_list.append("" if clean.lower() == "[blank]" else clean)
            continue
        except ValueError:
            pass
        if "\n" in clean:
            for line in clean.splitlines():
                line = line.strip()
                if line and line.lower() != "[blank]":
                    types_list.append(line)
        else:
            types_list.append(clean)
    expected_count = 5
    while len(numbers_list) < expected_count:
        numbers_list.append("")
    types_list = types_list[:expected_count]
    numbers_list = numbers_list[:expected_count]
    structured = [{"Type": types_list[i], "Number": numbers_list[i]} for i in range(expected_count)]
    df = pd.DataFrame(structured)
    logger.info(f"DAILY NUMBERS: OBSERVATION & INTERVENTION DataFrame shape: {df.shape}")
    return {"DAILY NUMBERS: OBSERVATION & INTERVENTION": structured}, df


In [0]:
def process_bop(section_path, debug=False):
    img = safe_read_image(section_path)
    ocr_text = perform_ocr(img)
    logger.info("BOP OCR extraction complete.")
    patterns = {
        "Last BOP Test Date": r"Last BOP Test Date\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Last BOP Drill": r"Last BOP Drill\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Next BOP Test": r"Next BOP Test\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})"
    }
    result = {}
    for key, regex in patterns.items():
        match = re.search(regex, ocr_text, re.IGNORECASE)
        result[key] = match.group(1) if match else ""
    df = pd.DataFrame(list(result.items()), columns=["Key", "Value"])
    logger.info(f"BOP DataFrame shape: {df.shape}")
    return {"BOP": result}, df

In [0]:
def build_dir_info_dict_from_rois(roi_texts, debug=False):
    all_texts = [t[4] for t in roi_texts]
    daily_cum_idx = next((i for i, txt in enumerate(all_texts)
                           if "daily" in txt.lower() and "cumulative" in txt.lower()), None)
    if daily_cum_idx is None:
        logger.warning("Could not find 'Daily Cumulative' bounding box.")
        return {}, pd.DataFrame()
    cat_idx = daily_cum_idx + 1
    if cat_idx >= len(all_texts):
        logger.warning("No bounding box after 'Daily Cumulative'.")
        return {}, pd.DataFrame()
    categories_box = all_texts[cat_idx]
    lines = [ln.strip() for ln in categories_box.split("\n") if ln.strip()]
    if len(lines) < 5:
        logger.warning(f"Expected 5 category lines, got {len(lines)}: {lines}")
    def safe_get(idx):
        return all_texts[idx] if 0 <= idx < len(all_texts) else ""
    structured = []
    for i in range(4):
        cat_name = lines[i] if i < len(lines) else f"Unknown Category {i+1}"
        daily_box = safe_get(cat_idx + 1 + (i * 2))
        cum_box = safe_get(cat_idx + 2 + (i * 2))
        structured.append({
            "Category": cat_name,
            "Daily": "" if daily_box == "[BLANK]" else daily_box,
            "Cumulative": "" if cum_box == "[BLANK]" else cum_box
        })
    last_box = safe_get(cat_idx + 9)
    last_cat = lines[4] if len(lines) >= 5 else "Rotating Footage"
    remainder = last_box.replace(last_cat, "").strip()
    tokens = remainder.split()
    daily_val = tokens[0] if len(tokens) >= 2 else ""
    cum_val = tokens[1] if len(tokens) >= 2 else ""
    structured.append({
        "Category": last_cat,
        "Daily": "" if daily_val == "[BLANK]" else daily_val,
        "Cumulative": "" if cum_val == "[BLANK]" else cum_val
    })
    df = pd.DataFrame(structured)
    logger.info(f"DIR INFO DataFrame shape: {df.shape}")
    return {"DIR INFO": structured}, df

def process_dir_info(section_path, debug=False):
    img = safe_read_image(section_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    return build_dir_info_dict_from_rois(roi_texts, debug=debug)

In [0]:
def build_consumables_dict_from_rois(roi_texts, debug=False):
    row_strings = group_ocr_results(roi_texts)
    data_rows = []
    for line in row_strings:
        lower_line = line.lower()
        if ("consumable" in lower_line and "received" in lower_line) or "nun" in lower_line:
            continue
        if len(line.split()) < 5:
            continue
        data_rows.append(line)
    consumables_list = []
    for line in data_rows:
        tokens = re.split(r'\s+', line)
        if len(tokens) > 5:
            first = " ".join(tokens[:-4])
            tokens = [first] + tokens[-4:]
        if len(tokens) != 5:
            logger.warning(f"CONSUMABLES - Skipping row (unexpected token count): {tokens}")
            continue
        consumables_list.append({
            "Consumable": tokens[0],
            "Daily Received (gal)": tokens[1],
            "Daily Used (gal)": tokens[2],
            "Cumulative Used (gal)": tokens[3],
            "Daily on Hand (gal)": tokens[4]
        })
    return consumables_list

def process_consumables_data(debug=False):
    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_2_section_2.png"
    img = safe_read_image(section_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    consumables_list = build_consumables_dict_from_rois(roi_texts, debug=debug)
    df = pd.DataFrame(consumables_list)
    logger.info(f"CONSUMABLES DataFrame shape: {df.shape}")
    return {"CONSUMABLES": consumables_list}, df


In [0]:
def extract_bha_data(image_path, debug=False):
    image = Image.open(image_path)
    ocr_text = pytesseract.image_to_string(image)
    patterns = {
        "Drill Pipe Detail": r"Drill Pipe Detail:\s*([^\n]+)",
        "Size": r"Size:\s*([\d.]+)\b",
        "Wt./Ft": r"Wt\./Ft:\s*([\d.]+)\b",
        "Connection": r"Connection:\s*([\w\d-]+)\b",
        "ID": r"ID:\s*([\d.]+)\b",
        "Drill Bit": r"Drill Bit:\s*([^\n;]+)",
        "Motor": r"Motor:\s*([^\n;]+)",
        "MWD Tool": r"MWD Tool:\s*([^\n;]+)",
        "Monel Collar": r"Monel Collar:\s*([^\n;]+)",
        "X-Over": r"X-Over:\s*([^\n;]+)",
        "Sub": r"Sub:\s*([^\n;]+)",
        "HWDP": r"HWDP:\s*([^\n;]+)",
        "Drill Pipe": r"Drill Pipe:\s*([\d.]+(?:\" DP)?)",
        "Reamer": r"Reamer:\s*([^\n;]+)",
        "Shock Sub": r"Shock Sub:\s*([^\n;]+)",
        "Total Length": r"Total Length:\s*(\d+)\b"
    }
    bha_data = {}
    for key, pat in patterns.items():
        match = re.search(pat, ocr_text)
        if match:
            bha_data[key] = match.group(1).strip()
    if "Drill Pipe Detail" in bha_data:
        detail = bha_data["Drill Pipe Detail"]
        for remove_key in ["Size", "Wt./Ft", "Connection", "ID"]:
            if remove_key in bha_data:
                detail = re.sub(rf"{remove_key}:\s*{re.escape(bha_data[remove_key])}", "", detail).strip(",; ")
        bha_data["Drill Pipe Detail"] = detail
    structured_data = {
        "BHA": {
            "Drill Pipe Detail": bha_data.get("Drill Pipe Detail", ""),
            "Size": bha_data.get("Size", ""),
            "Wt./Ft": bha_data.get("Wt./Ft", ""),
            "Connection": bha_data.get("Connection", ""),
            "ID": bha_data.get("ID", ""),
            "BHA #4": {
                "Drill Bit": bha_data.get("Drill Bit", ""),
                "Motor": bha_data.get("Motor", ""),
                "MWD Tool": bha_data.get("MWD Tool", ""),
                "Monel Collar": bha_data.get("Monel Collar", ""),
                "X-Over": bha_data.get("X-Over", ""),
                "Sub": bha_data.get("Sub", ""),
                "HWDP": bha_data.get("HWDP", ""),
                "Drill Pipe": bha_data.get("Drill Pipe", ""),
                "Reamer": bha_data.get("Reamer", ""),
                "Shock Sub": bha_data.get("Shock Sub", "")
            },
            "Total Length": bha_data.get("Total Length", "")
        }
    }
    if debug:
        logger.info("Extracted BHA data:")
        logger.info(json.dumps(structured_data, indent=4))
    return structured_data

def process_bha_data(debug=False):
    image_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_2_section_11.png"  # Adjust as needed
    bha_json = extract_bha_data(image_path, debug=debug)
    img = safe_read_image(image_path)
    ocr_text = perform_ocr(img)
    pump_data = parse_pumps_table(ocr_text)
    circ_data = parse_drilling_circ_rates(ocr_text)
    pumps_df = pd.DataFrame(pump_data)
    circ_df = pd.DataFrame(circ_data)
    bha_df = pd.DataFrame([bha_json])
    logger.info(f"BHA DataFrame shape: {bha_df.shape}")
    combined = {"BHA": bha_json, "Pumps": pump_data, "DrillingCircRates": circ_data}
    return combined, pumps_df, circ_df, bha_df

In [0]:
def process_mud_data(debug=False):
    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png"
    img = safe_read_image(section_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    expected_headers = [
        "Type", "Weight In", "Weight Out", "pH", "CAKE",
        "GELS (10s/10m/30m)", "Oil/Water", "FV", "ES", "PV",
        "YP", "CL", "Ca", "LGS", "WL", "HTHP Loss", "3 RPM",
        "6 RPM", "Mud Pits and Hole Volume", "24 Hr Loss",
        "Total Loss", "Comments"
    ]
    mud_dict = build_mud_dict_from_rois(roi_texts, expected_headers)
    return {"MUD": mud_dict}, pd.DataFrame(list(mud_dict.items()), columns=["Key", "Value"])

# ---------------------------------------------------------------------
# parse_value_row_tokens for mud section
# ---------------------------------------------------------------------
def parse_value_row_tokens(expected_headers, tokens):
    """
    Map a flat list of tokens to the expected headers.
    For "GELS (10s/10m/30m)", consume 3 tokens and create a sub-dictionary.
    Expected token count = (number of headers - 1) + 3.
    """
    expected_token_count = (len(expected_headers) - 1) + 3
    logger.info(f"Expected token count: {expected_token_count}, tokens extracted: {tokens}")
    
    # Pad or trim tokens as needed.
    if len(tokens) < expected_token_count:
        tokens += ["[BLANK]"] * (expected_token_count - len(tokens))
        logger.warning("Not enough tokens. Padding with [BLANK].")
    elif len(tokens) > expected_token_count:
        tokens = tokens[:expected_token_count]
        logger.warning("Too many tokens. Trimming the extra tokens.")
    
    result = {}
    idx = 0
    for header in expected_headers:
        if header == "GELS (10s/10m/30m)":
            gels_tokens = tokens[idx:idx+3]
            result[header] = {
                "10s": gels_tokens[0],
                "10m": gels_tokens[1],
                "30m": gels_tokens[2]
            }
            idx += 3
        else:
            result[header] = tokens[idx]
            idx += 1
    logger.info(f"Mapped dictionary: {result}")
    return result

# ---------------------------------------------------------------------
# build_mud_dict_from_rois
# ---------------------------------------------------------------------
def build_mud_dict_from_rois(roi_texts, expected_headers):
    """
    Group OCR results into rows based on the y coordinate.
    Identify header rows and corresponding data rows.
    
    In our case, we expect:
      - A header row (with labels) followed by a data row,
      - Then a second header row (for the remaining fields) followed by a second data row.
    
    We then combine the two data rows' tokens and map them to expected_headers.
    """
    row_tolerance = 10
    rows = []
    current_row = []
    prev_y = None

    # Group by row based on y coordinate.
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        rows.append(current_row)

    # Sort each row by x coordinate and log its text.
    row_strings = []
    for i, row_cells in enumerate(rows):
        row_cells.sort(key=lambda c: c[0])
        line_text = " ".join(cell[4] for cell in row_cells)
        row_strings.append(line_text)
        logger.info(f"Row {i} text: {line_text}")

    # Based on OCR output expectations:
    # Row 1: header row 1 (first set of labels)
    # Row 2: data row 1 (first set of values)
    # Row 3: header row 2 (remaining labels)
    # Row 4: data row 2 (remaining values)
    header1_line = None
    value1_line = None
    header2_line = None
    value2_line = None

    for i, r_text in enumerate(row_strings):
        if "Type" in r_text and not header1_line:
            header1_line = r_text
            if i + 1 < len(row_strings):
                value1_line = row_strings[i+1]
        elif header1_line and not header2_line and any(kw in r_text for kw in ["RPM", "Mud", "Loss", "Comments"]):
            header2_line = r_text
            if i + 1 < len(row_strings):
                value2_line = row_strings[i+1]
            break

    logger.info(f"Header1: {header1_line}")
    logger.info(f"Value1: {value1_line}")
    logger.info(f"Header2: {header2_line}")
    logger.info(f"Value2: {value2_line}")

    if value1_line is None:
        logger.error("No data row found for header1!")
        return {}

    # Split the data rows into tokens.
    tokens1 = value1_line.split()
    tokens2 = value2_line.split() if value2_line else []
    logger.info(f"Tokens from data row 1: {tokens1}")
    logger.info(f"Tokens from data row 2: {tokens2}")

    # Combine tokens from both data rows.
    combined_tokens = tokens1 + tokens2
    logger.info(f"Combined tokens: {combined_tokens}")

    # Map the tokens to the expected headers.
    return parse_value_row_tokens(expected_headers, combined_tokens)

In [0]:
def build_survey_dict_from_rois(roi_texts, expected_headers):
    row_strings = group_ocr_results(roi_texts)
    logger.info(f"SURVEY - Grouped Rows: {row_strings}")
    all_lines = []
    for line in row_strings:
        for subline in line.split("\n"):
            subline = subline.strip()
            if subline:
                all_lines.append(subline)
    logger.info(f"SURVEY - All extracted lines: {all_lines}")
    data_lines = []
    for line in all_lines:
        tokens = re.split(r'\s{2,}', line)
        if len(tokens) == 1:
            tokens = line.split()
        lower_tokens = [t.lower() for t in tokens]
        if "md" in lower_tokens and "inclination" in lower_tokens:
            logger.info(f"SURVEY - Skipping header line: {tokens}")
            continue
        if len(tokens) < len(expected_headers):
            logger.warning(f"SURVEY - Line has fewer tokens than expected: {tokens}")
            continue
        tokens = tokens[:len(expected_headers)]
        data_lines.append(tokens)
    logger.info(f"SURVEY - Data lines to parse: {data_lines}")
    survey_list = [{expected_headers[i]: tokens[i] for i in range(len(expected_headers))}
                   for tokens in data_lines]
    return survey_list

def sort_survey_data(survey_list):
    def md_value(row):
        try:
            return float(row["MD"].replace(",", ""))
        except Exception:
            return 0
    return sorted(survey_list, key=md_value, reverse=True)

def process_survey_data(debug=False):
    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png"
    img = safe_read_image(section_path)
    thresh = preprocess_image(img, debug=debug)
    rois = detect_text_regions(thresh, debug=debug)
    roi_texts = perform_ocr_on_rois(img, rois, debug=debug)
    expected_headers = ["MD", "Inclination", "Azimuth", "DLS", "TVD"]
    survey_list = build_survey_dict_from_rois(roi_texts, expected_headers)
    survey_list = sort_survey_data(survey_list)
    df = pd.DataFrame(survey_list)
    logger.info(f"SURVEY DataFrame shape: {df.shape}")
    return {"SURVEY": survey_list}, df


In [0]:
# ---------------------------------------------------------------------
# Pumps Extraction process
# ---------------------------------------------------------------------

def parse_pumps_table(ocr_text):
    """
    Parses the pumps table from OCR text using a regex.
    Expected format: Number BOMCO TRIPLEX [HHP] Efficiency Stroke(in) Liner(in) P-Rating P-Limit SPM_Rating SPM_Limit
    """
    pump_pattern = re.compile(
        r"^(\d+)?\s*"               # Number (optional)
        r"(BOMCO)\s+(TRIPLEX)\s+"    # Model, Type
        r"(\d+)?\s*"                # HHP (optional)
        r"(\d+)\s+"                 # Efficiency
        r"([\d.]+)\s+"              # Stroke\(in\)
        r"([\d.]+)\s+"              # Liner\(in\)
        r"(\d+)\s+"                 # P-Rating\(psi\)
        r"(\d+)\s+"                 # P-Limit\(psi\)
        r"(\d+)\s+"                 # SPM Rating
        r"(\d+)\s*$",               # SPM Limit
        re.IGNORECASE
    )
    pumps = []
    lines = ocr_text.splitlines()
    for line in lines:
        line = line.strip()
        match = pump_pattern.match(line)
        if match:
            number, model, pump_type, hhp, efficiency, stroke, liner, p_rating, p_limit, spm_rating, spm_limit = match.groups()
            pumps.append({
                "Number": number if number else "",
                "Model": model,
                "Type": pump_type,
                "HHP": hhp if hhp else "",
                "Efficiency": efficiency,
                "Stroke(in)": stroke,
                "Liner(in)": liner,
                "P-Rating(psi)": p_rating,
                "P-Limit(psi)": p_limit,
                "SPM Rating": spm_rating,
                "SPM Limit": spm_limit
            })
    return pumps

def parse_drilling_circ_rates(ocr_text):
    """
    Parses drilling/circ rates from OCR text.
    This version splits the text into segments starting with "Drilling/Circ Rate <n>"
    then combines the lines in each segment and applies a regex with DOTALL.
    """
    circ_rates = []
    
    # Split the OCR text into segments where each segment begins with "Drilling/Circ Rate" followed by a digit
    segments = re.split(r"(?=Drilling/Circ Rate \d+)", ocr_text)
    
    # Regex pattern to capture the numbers:
    # Group 1: Rate ID (the number after "Drilling/Circ Rate")
    # Group 2: Pressure (number preceding "PS!" or "PSI")
    # Group 3: SPM value (number after "@")
    # Group 4: Gal/Stoke value
    # Group 5: GPM value
    # Group 6: BPM value
    # Group 7: DC value
    # Group 8: DP value
    pattern = re.compile(
        r"Drilling/Circ Rate\s+(\d+).*?"       # Rate ID
        r"(\d+)\s+PS[!I].*?"                   # Pressure
        r"@\s*(\d+).*?"                        # SPM value
        r"([\d.]+)\s+Gal/Stoke.*?"              # Gal/Stoke
        r"([\d.]+)\s+GPM.*?"                    # GPM
        r"([\d.]+)\s+BPM.*?"                    # BPM
        r"([\d.]+)\s+DC.*?"                     # DC
        r"([\d.]+)\s+DP",                      # DP
        re.IGNORECASE | re.DOTALL
    )
    
    # Process each segment individually
    for seg in segments:
        seg = seg.strip()
        if not seg.startswith("Drilling/Circ Rate"):
            continue  # Skip any header or unrelated segments
        # Replace newline characters with spaces to form a continuous string
        seg_clean = " ".join(seg.splitlines())
        match = pattern.search(seg_clean)
        if match:
            rate_id, pressure, spm, gal_stroke, gpm, bpm, dc, dp = match.groups()
            circ_rates.append({
                "RateID": rate_id,
                "Pressure(PSI)": pressure,
                "SPM": spm,
                "Gal/Stoke": gal_stroke,
                "GPM": gpm,
                "BPM": bpm,
                "DC": dc,
                "DP": dp
            })
        else:
            # Optional: log a warning if no match is found for a segment
            print(f"Warning: No match found in segment:\n{seg_clean}")
            
    return circ_rates


def process_pumps(pumps_img_path, debug=False):
    """
    Processes the pumps section:
    #   - Reads image using PIL,
      - Performs OCR,
      - Parses pumps table and drilling/circ rates,
      - Returns combined results as JSON and a DataFrame.
    """
    pil_img = read_pil_image(pumps_img_path)
    ocr_text = perform_ocr(pil_img)
    if debug:
        logger.info("Pumps OCR Text:\n" + ocr_text)
    pumps = parse_pumps_table(ocr_text)
    circ_rates = parse_drilling_circ_rates(ocr_text)
    final_data = {"Pumps": pumps, "DrillingCircRates": circ_rates}
    df_pumps = pd.DataFrame(pumps)
    df_circ = pd.DataFrame(circ_rates)
    if not df_pumps.empty and not df_circ.empty:
        df = pd.concat([df_pumps, df_circ], axis=0, ignore_index=True)
    elif not df_pumps.empty:
        df = df_pumps
    else:
        df = df_circ
    return final_data, df

# # ---------------------------------------------------------------------
# # Main process Function
# # ---------------------------------------------------------------------
# def main():
#     # Define image paths (adjust as needed)
#     survey_img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png"
#     mud_img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png"
#     pumps_img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    
#     # Process Survey Section
#     try:
#         survey_json, survey_df = process_survey(survey_img_path, debug=True)
#         logger.info("===== SURVEY DATA =====")
#         logger.info(json.dumps(survey_json, indent=4))
#     except Exception as e:
#         logger.error(f"Survey processing failed: {e}")
    
#     # Process Mud Section
#     try:
#         mud_json, mud_df = process_mud(mud_img_path, debug=True)
#         logger.info("===== MUD DATA =====")
#         logger.info(json.dumps(mud_json, indent=4))
#     except Exception as e:
#         logger.error(f"Mud processing failed: {e}")
    
#     # Process Pumps Section
#     try:
#         pumps_json, pumps_df = process_pumps(pumps_img_path, debug=True)
#         logger.info("===== PUMPS DATA =====")
#         logger.info(json.dumps(pumps_json, indent=4))
#     except Exception as e:
#         logger.error(f"Pumps processing failed: {e}")
    
#     # Save outputs to disk (example output folder)
#     output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
#     os.makedirs(output_folder, exist_ok=True)
    
#     # Save Survey Data
#     survey_json_path = os.path.join(output_folder, "survey_data.json")
#     survey_csv_path = os.path.join(output_folder, "survey_data.csv")
#     with open(survey_json_path, "w") as f:
#         json.dump(survey_json, f, indent=4)
#     survey_df.to_csv(survey_csv_path, index=False)
#     logger.info(f"Survey data saved to {survey_json_path} and {survey_csv_path}")
    
#     # Save Mud Data
#     mud_json_path = os.path.join(output_folder, "mud_data.json")
#     mud_csv_path = os.path.join(output_folder, "mud_data.csv")
#     with open(mud_json_path, "w") as f:
#         json.dump(mud_json, f, indent=4)
#     mud_df.to_csv(mud_csv_path, index=False)
#     logger.info(f"Mud data saved to {mud_json_path} and {mud_csv_path}")
    
#     # Save Pumps Data
#     pumps_json_path = os.path.join(output_folder, "pumps_data.json")
#     pumps_csv_path = os.path.join(output_folder, "pumps_data.csv")
#     with open(pumps_json_path, "w") as f:
#         json.dump(pumps_json, f, indent=4)
#     pumps_df.to_csv(pumps_csv_path, index=False)
#     logger.info(f"Pumps data saved to {pumps_json_path} and {pumps_csv_path}")

# if __name__ == "__main__":
#     main()

In [0]:

def run_process(name, pipe, debug=False):
    """Runs an individual process and logs/saves its textual outputs."""
    logger.info(f"Running process: {name}")
    try:
        # Execute the process function and get JSON and DataFrame outputs.
        output_json, output_df = pipe["func"](None)
        
        # Define output folder and ensure sequential processing
        output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
        os.makedirs(output_folder, exist_ok=True)
        
        # Save JSON output
        json_path = os.path.join(output_folder, pipe["json"])
        with open(json_path, "w") as f:
            json.dump(output_json, f, indent=4)
        logger.info(f"{name} JSON data saved to {json_path}")
        
        # Save CSV output
        csv_path = os.path.join(output_folder, pipe["csv"])
        output_df.to_csv(csv_path, index=False)
        logger.info(f"{name} CSV data saved to {csv_path}")
        
        # Log textual output
        logger.info(f"{name} output JSON:\n{json.dumps(output_json, indent=4)}")
    except Exception as e:
        logger.error(f"Error in process '{name}': {e}")

def process_pumps_data(dummy_arg=None):
    pumps_img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    return process_pumps(pumps_img_path, debug=False)

# -----------------------------------------------------------------------------
# Main Function (Merged with processs Dictionary)
# -----------------------------------------------------------------------------
def main():
    debug = False  # Set True for detailed logging
    processs = {
        "cost_data": {
            "func": process_cost_data,
            "csv": "cost_data.csv",
            "json": "cost_data.json"
        },
        "well_job": {
            "func": lambda d: process_well_job_info("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png", d),
            "csv": "well_job_data.csv",
            "json": "well_job_data.json"
        },
        "obs_int": {
            "func": lambda d: process_obs_int("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png", d),
            "csv": "obs_int_data.csv",
            "json": "obs_int_data.json"
        },
        "bop": {
            "func": lambda d: process_bop("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png", d),
            "csv": "bop_data.csv",
            "json": "bop_data.json"
        },
        "dir_info": {
            "func": lambda d: process_dir_info("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png", d),
            "csv": "dir_info_data.csv",
            "json": "dir_info_data.json"
        },
        "survey": {
            "func": process_survey_data,
            "csv": "survey_data.csv",
            "json": "survey_data.json"
        },
        "casing": {
            "func": process_casing_data,
            "csv": "casing_data.csv",
            "json": "casing_data.json"
        },
        "consumables": {
            "func": process_consumables_data,
            "csv": "consumables_data.csv",
            "json": "consumables_data.json"
        },
        "mud": {
            "func": process_mud_data,
            "csv": "mud_data.csv",
            "json": "mud_data.json"
        },
        "bha": {
            "func": process_bha_data,
            "csv": "bha_data.csv",
            "json": "bha_data.json"
        },
        "pumps": {
            "func": process_pumps_data,
            "csv": "pumps_data.csv",
            "json": "pumps_data.json"
        }
    }
    
    # Process processs sequentially in the order defined in the dictionary.
    for name, pipe in processs.items():
        run_process(name, pipe, debug)

if __name__ == "__main__":
    main()


INFO: Running process: cost_data
INFO:UnifiedExtractor:Running process: cost_data
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png with shape (105, 2502, 3)
INFO:UnifiedExtractor:Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png with shape (105, 2502, 3)
INFO: Cost OCR extraction complete.
INFO:UnifiedExtractor:Cost OCR extraction complete.
INFO: COST DataFrame shape: (6, 2)
INFO:UnifiedExtractor:COST DataFrame shape: (6, 2)
INFO: cost_data JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.json
INFO:UnifiedExtractor:cost_data JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.json
INFO: cost_data CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.csv
INFO:UnifiedExtractor:cost_data CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.csv
INFO: cost_data output JSON:
{
    "COST DATA": {
        "Drilling AFE Amount": null,
        "Daily Drilling Cost": "$167,006.

In [0]:

# --------------------------------------------------------
# 5) Personnel Extraction process
# --------------------------------------------------------
def detect_text_regions_personnel(thresh_img, debug=False):
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    debug_img = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
            cv2.rectangle(debug_img, (x, y), (x+w, y+h), (0, 255, 0), 2)
    rois.sort(key=lambda b: (b[1], b[0]))
    logger.debug(f"Detected {len(rois)} text regions for personnel.")
    if debug:
        show_image("Detected Personnel Text Regions", debug_img, size=(12, 12))
    return rois

def perform_ocr_on_rois_personnel(img, rois, debug=False):
    results = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config='--psm 6').strip()
        if not text:
            text = "[BLANK]"
        results.append((x, y, w, h, text))
        logger.debug(f"ROI bbox=({x}, {y}, {w}, {h}), text: '{text}'")
    return results

def group_rois_by_row(roi_results, threshold=20):
    roi_with_center = [(x, y, w, h, text, y + h/2) for (x, y, w, h, text) in roi_results]
    roi_with_center.sort(key=lambda r: r[5])
    groups = []
    current_group = []
    current_center = None
    for roi in roi_with_center:
        x, y, w, h, text, y_center = roi
        if current_center is None:
            current_center = y_center
            current_group.append((x, y, w, h, text))
        elif abs(y_center - current_center) < threshold:
            current_group.append((x, y, w, h, text))
        else:
            groups.append(current_group)
            current_group = [(x, y, w, h, text)]
            current_center = y_center
    if current_group:
        groups.append(current_group)
    logger.debug(f"Grouped ROIs into {len(groups)} rows.")
    return groups

def preprocess_personnel_data_from_rows(groups):
    personnel_data = []
    header_lines = {
        "personnel", 
        "company contractor no. personnel daily hours cumulative hours",
        "ssn"
    }
    for group in groups:
        group.sort(key=lambda r: r[0])
        row_text = " ".join([r[4] for r in group]).strip()
        logger.debug(f"Processing row: '{row_text}'")
        if row_text.lower() in header_lines:
            logger.debug("Skipping header row.")
            continue
        tokens = row_text.split()
        numeric_tokens = re.findall(r'\d+(?:\.\d+)?', row_text)
        logger.debug(f"Row tokens: {tokens}")
        logger.debug(f"Numeric tokens found: {numeric_tokens}")
        if tokens[0].lower().startswith("totals"):
            if len(numeric_tokens) >= 2:
                try:
                    daily_hours = int(float(numeric_tokens[0]))
                    cumulative_hours = numeric_tokens[1]
                except ValueError as e:
                    logger.error(f"Error parsing Totals row: {row_text} => {e}")
                    continue
                row_dict = {
                    "Company": "",
                    "Contractor": "",
                    "No. Personnel": "Totals",
                    "Daily Hours": daily_hours,
                    "Cumulative Hours": cumulative_hours
                }
                logger.info(f"Totals row parsed: {row_dict}")
                personnel_data.append(row_dict)
            else:
                logger.warning(f"Totals row without sufficient numbers: {row_text}")
            continue
        if len(numeric_tokens) >= 3:
            try:
                no_personnel = int(float(numeric_tokens[-3]))
                daily_hours = int(float(numeric_tokens[-2]))
                cumulative_hours = int(float(numeric_tokens[-1]))
                logger.debug(f"Extracted: no_personnel={no_personnel}, daily_hours={daily_hours}, cumulative_hours={cumulative_hours}")
            except ValueError as e:
                logger.error(f"Error converting numbers in row: {row_text} => {e}")
                continue
            pattern = (r'\s*' + re.escape(numeric_tokens[-3]) +
                       r'\s+' + re.escape(numeric_tokens[-2]) +
                       r'\s+' + re.escape(numeric_tokens[-1]) + r'\s*$')
            text_only = re.sub(pattern, '', row_text).strip()
        elif len(numeric_tokens) == 1:
            try:
                cumulative_hours = int(float(numeric_tokens[0]))
                logger.debug(f"Single numeric token, cumulative_hours: {cumulative_hours}")
            except ValueError as e:
                logger.error(f"Error converting single number in row: {row_text} => {e}")
                continue
            no_personnel = None
            daily_hours = None
            pattern = r'\s*' + re.escape(numeric_tokens[0]) + r'\s*$'
            text_only = re.sub(pattern, '', row_text).strip()
        else:
            logger.warning(f"Row has unexpected number of numeric tokens: {row_text}")
            continue

        if "service company" in text_only.lower():
            parts = re.split(r'(?i)service company', text_only, maxsplit=1)
            company = parts[0].strip()
            contractor = "Service Company"
        else:
            company = text_only
            contractor = "Service Company"
        
        row_dict = {
            "Company": company,
            "Contractor": contractor,
            "No. Personnel": no_personnel,
            "Daily Hours": daily_hours,
            "Cumulative Hours": cumulative_hours
        }
        logger.info(f"Parsed row: {row_dict}")
        personnel_data.append(row_dict)
    return {"PERSONNEL": personnel_data}

def process_personnel(personnel_img_path, debug=False):
    """Processes the personnel section: read image, detect ROIs, OCR, group and parse rows."""
    img = read_cropped_section_image(personnel_img_path)
    if debug:
        show_image("Original Personnel Image", img, size=(10,10))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh_img = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 11, 2)
    if debug:
        show_image("Thresholded Personnel Image", cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR), size=(8,8))
    rois = detect_text_regions_personnel(thresh_img, debug=debug)
    roi_results = perform_ocr_on_rois_personnel(img, rois, debug=debug)
    grouped_rows = group_rois_by_row(roi_results, threshold=20)
    data_dict = preprocess_personnel_data_from_rows(grouped_rows)
    df = pd.DataFrame(data_dict["PERSONNEL"]) if data_dict["PERSONNEL"] else pd.DataFrame(
        columns=["Company", "Contractor", "No. Personnel", "Daily Hours", "Cumulative Hours"])
    return data_dict, df

In [0]:
# --------------------------------------------------------
# 4) BOP Extraction process
# --------------------------------------------------------

def extract_bop_info(ocr_text):
    """Extract BOP information using regex."""
    pattern = {
        "Last BOP Test Date": r"Last BOP Test Date\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Last BOP Drill": r"Last BOP Drill\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Next BOP Test": r"Next BOP Test\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})"
    }
    result = {}
    for key, regex in pattern.items():
        match = re.search(regex, ocr_text, re.IGNORECASE)
        result[key] = match.group(1) if match else ""
    return result

def process_bop(bop_img_path, debug=False):
    """Processes BOP section: read image, perform OCR, extract info, return JSON and DataFrame."""
    pil_img = read_pil_image(bop_img_path)
    ocr_text = perform_ocr(pil_img)
    bop_data = extract_bop_info(ocr_text)
    final_output = {"BOP": bop_data}
    df = pd.DataFrame(list(bop_data.items()), columns=["Key", "Value"])
    return final_output, df


def process_pumps_data(dummy_arg=None):
    pumps_img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    pil_img = read_pil_image(pumps_img_path)
    ocr_text = pytesseract.image_to_string(pil_img)
    # Dummy parsing logic for pumps; replace with your actual extraction.
    pumps = [{"Number": "1", "Model": "BOMCO", "Type": "TRIPLEX", "Efficiency": "95"}]
    circ_rates = []  # Assume drilling/circ rates not available in this dummy.
    final_data = {"Pumps": pumps, "DrillingCircRates": circ_rates}
    df = pd.DataFrame(pumps)
    return final_data, df

# --------------------------------------------------------
# 7) process Runner Helper Function
# --------------------------------------------------------
def run_process(name, pipe, debug=False):
    logger.info(f"Running process: {name}")
    try:
        output_json, output_df = pipe["func"](None)
        output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
        os.makedirs(output_folder, exist_ok=True)
        json_path = os.path.join(output_folder, pipe["json"])
        with open(json_path, "w") as f:
            json.dump(output_json, f, indent=4)
        logger.info(f"{name} JSON data saved to {json_path}")
        csv_path = os.path.join(output_folder, pipe["csv"])
        output_df.to_csv(csv_path, index=False)
        logger.info(f"{name} CSV data saved to {csv_path}")
        print(f"\n--- {name.upper()} JSON Output ---")
        print(json.dumps(output_json, indent=4))
        print(f"\n--- {name.upper()} DataFrame ---")
        print(output_df)
    except Exception as e:
        logger.error(f"Error in process '{name}': {e}")

# --------------------------------------------------------
# 8) Main process Function with processs Dictionary
# --------------------------------------------------------
def main():
    debug = False  # Set True for detailed logging and image display.
    processs = {
        "cost_data": {
            "func": process_cost_data,
            "csv": "cost_data.csv",
            "json": "cost_data.json"
        },
        "well_job": {
            "func": lambda d: process_well_job_info("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png", d),
            "csv": "well_job_data.csv",
            "json": "well_job_data.json"
        },
        "obs_int": {
            "func": lambda d: process_obs_int("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png", d),
            "csv": "obs_int_data.csv",
            "json": "obs_int_data.json"
        },
        "bop": {
            "func": lambda d: process_bop("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png", debug),
            "csv": "bop_data.csv",
            "json": "bop_data.json"
        },
        "dir_info": {
            "func": lambda d: process_dir_info("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png", d),
            "csv": "dir_info_data.csv",
            "json": "dir_info_data.json"
        },
        "survey": {
            "func": lambda d: process_survey("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png", debug),
            "csv": "survey_data.csv",
            "json": "survey_data.json"
        },
        "casing": {
            "func": process_casing_data,
            "csv": "casing_data.csv",
            "json": "casing_data.json"
        },
        "consumables": {
            "func": process_consumables_data,
            "csv": "consumables_data.csv",
            "json": "consumables_data.json"
        },
        "mud": {
            "func": process_mud_data,
            "csv": "mud_data.csv",
            "json": "mud_data.json"
        },
        "bha": {
            "func": extract_bha_data,
            "csv": "bha_data.csv",
            "json": "bha_data.json"
        },
        "pumps": {
            "func": process_pumps_data,
            "csv": "pumps_data.csv",
            "json": "pumps_data.json"
        },
        "personnel": {
            "func": lambda d: process_personnel("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_9.png", debug),
            "csv": "personnel_data.csv",
            "json": "personnel_data.json"
        }
    }
    
    for name, pipe in processs.items():
        run_process(name, pipe, debug)

if __name__ == "__main__":
    main()


INFO: Running process: cost_data
INFO:UnifiedExtractor:Running process: cost_data
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png with shape (105, 2502, 3)
INFO:UnifiedExtractor:Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png with shape (105, 2502, 3)
INFO: Cost OCR extraction complete.
INFO:UnifiedExtractor:Cost OCR extraction complete.
INFO: COST DataFrame shape: (6, 2)
INFO:UnifiedExtractor:COST DataFrame shape: (6, 2)
INFO: cost_data JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.json
INFO:UnifiedExtractor:cost_data JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.json
INFO: cost_data CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.csv
INFO:UnifiedExtractor:cost_data CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/cost_data.csv
INFO: Running process: well_job
INFO:UnifiedExtractor:Running process: well_job
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/c


--- COST_DATA JSON Output ---
{
    "COST DATA": {
        "Drilling AFE Amount": null,
        "Daily Drilling Cost": "$167,006.63",
        "Cumulative Drilling Cost": "$1,747,745",
        "Cumulative Well Cost": "$1,914,752",
        "Daily Mud Cost": "$54,185.80",
        "Cumulative Mud Cost": "$299,370.66"
    }
}

--- COST_DATA DataFrame ---
                        Key        Value
0       Drilling AFE Amount         None
1       Daily Drilling Cost  $167,006.63
2  Cumulative Drilling Cost   $1,747,745
3      Cumulative Well Cost   $1,914,752
4            Daily Mud Cost   $54,185.80
5       Cumulative Mud Cost  $299,370.66


INFO: Well/Job OCR extraction complete.
INFO:UnifiedExtractor:Well/Job OCR extraction complete.
INFO: WELL/JOB DataFrame shape: (19, 2)
INFO:UnifiedExtractor:WELL/JOB DataFrame shape: (19, 2)
INFO: well_job JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/well_job_data.json
INFO:UnifiedExtractor:well_job JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/well_job_data.json
INFO: well_job CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/well_job_data.csv
INFO:UnifiedExtractor:well_job CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/well_job_data.csv
INFO: Running process: obs_int
INFO:UnifiedExtractor:Running process: obs_int
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png with shape (241, 942, 3)
INFO:UnifiedExtractor:Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png with shape (241, 942, 3)



--- WELL_JOB JSON Output ---
{
    "WELL/JOB INFORMATION": {
        "Well Name": "Ross Fee 4371-31-7-15 MH",
        "Job Name": "Drilling",
        "Supervisor(s)": "CHAD MILLER / ED COOLEY",
        "Field": "XBE",
        "Sec/Twn/Rng": "31, 43N, 71W",
        "Phone": "307-315-1908",
        "AFE #": "240098",
        "API #": "49-005-78911",
        "Email": "cyclone39@aec-denver.com",
        "Contractor": "",
        "Elevation": "4913.5",
        "RKB": "27.5",
        "Spud Date": "6/4/2024",
        "Days from Spud": "7.67",
        "Days on Loc": "34",
        "MD/TVD": "20537 FT/10719 FT",
        "24 Hr Footage": "3068",
        "Present Operations": "DRILLING LATERAL @ 20,537'.",
        "Activity Planned": "DRILL LATERAL SECTION TO PLANNED TD @ ~21,226', PUMP TD SWEEPS & CHC, SOOH & L/D DRILL PIPE."
    }
}

--- WELL_JOB DataFrame ---
                   Key                                              Value
0            Well Name                           Ross Fee 4371

INFO: DAILY NUMBERS: OBSERVATION & INTERVENTION DataFrame shape: (5, 2)
INFO:UnifiedExtractor:DAILY NUMBERS: OBSERVATION & INTERVENTION DataFrame shape: (5, 2)
INFO: obs_int JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/obs_int_data.json
INFO:UnifiedExtractor:obs_int JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/obs_int_data.json
INFO: obs_int CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/obs_int_data.csv
INFO:UnifiedExtractor:obs_int CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/obs_int_data.csv
INFO: Running process: bop
INFO:UnifiedExtractor:Running process: bop
ERROR: Error in process 'bop': name 'read_pil_image' is not defined
ERROR:UnifiedExtractor:Error in process 'bop': name 'read_pil_image' is not defined
INFO: Running process: dir_info
INFO:UnifiedExtractor:Running process: dir_info
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png with shape (241, 1200, 3)
INFO:UnifiedExtractor:Image loaded from /dbfs/m


--- OBS_INT JSON Output ---
{
    "DAILY NUMBERS: OBSERVATION & INTERVENTION": [
        {
            "Type": "Stop Cards",
            "Number": "14"
        },
        {
            "Type": "Hazard ID's",
            "Number": "2"
        },
        {
            "Type": "JSA's",
            "Number": "5"
        },
        {
            "Type": "Permit to Work",
            "Number": "21"
        },
        {
            "Type": "Totals",
            "Number": ""
        }
    ]
}

--- OBS_INT DataFrame ---
             Type Number
0      Stop Cards     14
1     Hazard ID's      2
2           JSA's      5
3  Permit to Work     21
4          Totals       


INFO: DIR INFO DataFrame shape: (5, 3)
INFO:UnifiedExtractor:DIR INFO DataFrame shape: (5, 3)
INFO: dir_info JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/dir_info_data.json
INFO:UnifiedExtractor:dir_info JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/dir_info_data.json
INFO: dir_info CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/dir_info_data.csv
INFO:UnifiedExtractor:dir_info CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/dir_info_data.csv
INFO: Running process: survey
INFO:UnifiedExtractor:Running process: survey
ERROR: Error in process 'survey': cannot unpack non-iterable NoneType object
ERROR:UnifiedExtractor:Error in process 'survey': cannot unpack non-iterable NoneType object
INFO: Running process: casing
INFO:UnifiedExtractor:Running process: casing
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_7.png with shape (241, 2502, 3)
INFO:UnifiedExtractor:Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_


--- DIR_INFO JSON Output ---
{
    "DIR INFO": [
        {
            "Category": "Circ/Cond Hours",
            "Daily": "",
            "Cumulative": "6.8"
        },
        {
            "Category": "Sliding Hours",
            "Daily": "5.8",
            "Cumulative": "28.4"
        },
        {
            "Category": "Sliding Footage",
            "Daily": "247",
            "Cumulative": "1488"
        },
        {
            "Category": "Rotating Hours",
            "Daily": "17.8",
            "Cumulative": "75.9"
        },
        {
            "Category": "Rotating Footage",
            "Daily": "2821",
            "Cumulative": "18941"
        }
    ]
}

--- DIR_INFO DataFrame ---
           Category Daily Cumulative
0   Circ/Cond Hours              6.8
1     Sliding Hours   5.8       28.4
2   Sliding Footage   247       1488
3    Rotating Hours  17.8       75.9
4  Rotating Footage  2821      18941


INFO: CASING - Skipping header line: ['Type', 'Size', 'Weight', 'Grade', 'Connection', 'Top', 'MD', 'Bottom', 'MD', 'TOC']
INFO:UnifiedExtractor:CASING - Skipping header line: ['Type', 'Size', 'Weight', 'Grade', 'Connection', 'Top', 'MD', 'Bottom', 'MD', 'TOC']
INFO: CASING DataFrame shape: (4, 8)
INFO:UnifiedExtractor:CASING DataFrame shape: (4, 8)
INFO: casing JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/casing_data.json
INFO:UnifiedExtractor:casing JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/casing_data.json
INFO: casing CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/casing_data.csv
INFO:UnifiedExtractor:casing CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/casing_data.csv
INFO: Running process: consumables
INFO:UnifiedExtractor:Running process: consumables
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_2_section_2.png with shape (209, 2502, 3)
INFO:UnifiedExtractor:Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections


--- CASING JSON Output ---
{
    "CASING": [
        {
            "Type": "Conductor",
            "Size": "16.000",
            "Weight": "36.94",
            "Grade": "A252",
            "Connection": "WELDED",
            "Top MD": "32.00",
            "Bottom MD": "108.00",
            "TOC": "16"
        },
        {
            "Type": "Surface",
            "Size": "10.750",
            "Weight": "40.5",
            "Grade": "J55",
            "Connection": "BTC",
            "Top MD": "31.17",
            "Bottom MD": "2268.00",
            "TOC": "30"
        },
        {
            "Type": "Intermediate",
            "Size": "7.625",
            "Weight": "29.7",
            "Grade": "HCP110",
            "Connection": "BTC",
            "Top MD": "28.89",
            "Bottom MD": "9857.70",
            "TOC": "2750"
        },
        {
            "Type": "[BLANK]",
            "Size": "[BLANK]",
            "Weight": "[BLANK]",
            "Grade": "[BLANK]",
          

INFO: CONSUMABLES DataFrame shape: (3, 5)
INFO:UnifiedExtractor:CONSUMABLES DataFrame shape: (3, 5)
INFO: consumables JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/consumables_data.json
INFO:UnifiedExtractor:consumables JSON data saved to /dbfs/mnt/mini-proj-dd/final_results/consumables_data.json
INFO: consumables CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/consumables_data.csv
INFO:UnifiedExtractor:consumables CSV data saved to /dbfs/mnt/mini-proj-dd/final_results/consumables_data.csv
INFO: Running process: mud
INFO:UnifiedExtractor:Running process: mud
INFO: Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png with shape (173, 2502, 3)
INFO:UnifiedExtractor:Image loaded from /dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png with shape (173, 2502, 3)



--- CONSUMABLES JSON Output ---
{
    "CONSUMABLES": [
        {
            "Consumable": "Fuel",
            "Daily Received (gal)": "[BLANK]",
            "Daily Used (gal)": "1,386",
            "Cumulative Used (gal)": "20,626",
            "Daily on Hand (gal)": "5,735"
        },
        {
            "Consumable": "CNG (DGE)",
            "Daily Received (gal)": "1,652",
            "Daily Used (gal)": "1,652",
            "Cumulative Used (gal)": "6,535",
            "Daily on Hand (gal)": "[BLANK]"
        },
        {
            "Consumable": "Mud Fuel",
            "Daily Received (gal)": "8,367",
            "Daily Used (gal)": "[BLANK]",
            "Cumulative Used (gal)": "24,150",
            "Daily on Hand (gal)": "11,643"
        }
    ]
}

--- CONSUMABLES DataFrame ---
  Consumable Daily Received (gal)  ... Cumulative Used (gal) Daily on Hand (gal)
0       Fuel              [BLANK]  ...                20,626               5,735
1  CNG (DGE)                1,652  .

INFO: Row 0 text: MUD MUD [BLANK]
INFO:UnifiedExtractor:Row 0 text: MUD MUD [BLANK]
INFO: Row 1 text: Type Weight In Weight Out pH CAKE GELS (10s/10m/30m) Oil/Water FV ES PV YP CL Ca LGS WL HTHP Loss
INFO:UnifiedExtractor:Row 1 text: Type Weight In Weight Out pH CAKE GELS (10s/10m/30m) Oil/Water FV ES PV YP CL Ca LGS WL HTHP Loss
INFO: Row 2 text: OBM 11.5 11.5 [BLANK] 3 8 25 27 88/12 60.0 753 16 8 31,000 326,667 4.47 [BLANK] 5.00
INFO:UnifiedExtractor:Row 2 text: OBM 11.5 11.5 [BLANK] 3 8 25 27 88/12 60.0 753 16 8 31,000 326,667 4.47 [BLANK] 5.00
INFO: Row 3 text: 3 RPM 6 RPM Mud Pits and Hole Volume 24 Hr Loss Total Loss Comments
INFO:UnifiedExtractor:Row 3 text: 3 RPM 6 RPM Mud Pits and Hole Volume 24 Hr Loss Total Loss Comments
INFO: Row 4 text: 4 5 1023 13 481 [BLANK]
INFO:UnifiedExtractor:Row 4 text: 4 5 1023 13 481 [BLANK]
INFO: Header1: Type Weight In Weight Out pH CAKE GELS (10s/10m/30m) Oil/Water FV ES PV YP CL Ca LGS WL HTHP Loss
INFO:UnifiedExtractor:Header1: Type Weight In


--- MUD JSON Output ---
{
    "MUD": {
        "Type": "OBM",
        "Weight In": "11.5",
        "Weight Out": "11.5",
        "pH": "[BLANK]",
        "CAKE": "3",
        "GELS (10s/10m/30m)": {
            "10s": "8",
            "10m": "25",
            "30m": "27"
        },
        "Oil/Water": "88/12",
        "FV": "60.0",
        "ES": "753",
        "PV": "16",
        "YP": "8",
        "CL": "31,000",
        "Ca": "326,667",
        "LGS": "4.47",
        "WL": "[BLANK]",
        "HTHP Loss": "5.00",
        "3 RPM": "4",
        "6 RPM": "5",
        "Mud Pits and Hole Volume": "1023",
        "24 Hr Loss": "13",
        "Total Loss": "481",
        "Comments": "[BLANK]"
    }
}

--- MUD DataFrame ---
                         Key                                   Value
0                       Type                                     OBM
1                  Weight In                                    11.5
2                 Weight Out                                    1

INFO: Parsed row: {'Company': 'WORKRISE', 'Contractor': 'Service Company', 'No. Personnel': 2, 'Daily Hours': 24, 'Cumulative Hours': 2777}
INFO:UnifiedExtractor:Parsed row: {'Company': 'WORKRISE', 'Contractor': 'Service Company', 'No. Personnel': 2, 'Daily Hours': 24, 'Cumulative Hours': 2777}
INFO: Parsed row: {'Company': 'Cyclone Drilling Days Crews', 'Contractor': 'Service Company', 'No. Personnel': 7, 'Daily Hours': 84, 'Cumulative Hours': 2777}
INFO:UnifiedExtractor:Parsed row: {'Company': 'Cyclone Drilling Days Crews', 'Contractor': 'Service Company', 'No. Personnel': 7, 'Daily Hours': 84, 'Cumulative Hours': 2777}
INFO: Parsed row: {'Company': 'Cyclone Drilling Night Crews', 'Contractor': 'Service Company', 'No. Personnel': 7, 'Daily Hours': 84, 'Cumulative Hours': 2777}
INFO:UnifiedExtractor:Parsed row: {'Company': 'Cyclone Drilling Night Crews', 'Contractor': 'Service Company', 'No. Personnel': 7, 'Daily Hours': 84, 'Cumulative Hours': 2777}
INFO: Parsed row: {'Company': 'DCT


--- PERSONNEL JSON Output ---
{
    "PERSONNEL": [
        {
            "Company": "WORKRISE",
            "Contractor": "Service Company",
            "No. Personnel": 2,
            "Daily Hours": 24,
            "Cumulative Hours": 2777
        },
        {
            "Company": "Cyclone Drilling Days Crews",
            "Contractor": "Service Company",
            "No. Personnel": 7,
            "Daily Hours": 84,
            "Cumulative Hours": 2777
        },
        {
            "Company": "Cyclone Drilling Night Crews",
            "Contractor": "Service Company",
            "No. Personnel": 7,
            "Daily Hours": 84,
            "Cumulative Hours": 2777
        },
        {
            "Company": "DCT",
            "Contractor": "Service Company",
            "No. Personnel": 2,
            "Daily Hours": 24,
            "Cumulative Hours": 2777
        },
        {
            "Company": "",
            "Contractor": "",
            "No. Personnel": "Totals",
    

In [0]:
# ---------------------------------------------------------------------
# 3) Survey Extraction process
# ---------------------------------------------------------------------
def build_survey_dict_from_rois(roi_texts, expected_headers):
    row_tolerance = 10
    rows = []
    current_row = []
    prev_y = None
    for (x, y, w, h, text) in roi_texts:
        if prev_y is None or abs(y - prev_y) <= row_tolerance:
            current_row.append((x, y, w, h, text))
        else:
            rows.append(current_row)
            current_row = [(x, y, w, h, text)]
        prev_y = y
    if current_row:
        rows.append(current_row)
    
    row_strings = []
    for i, row in enumerate(rows):
        row.sort(key=lambda cell: cell[0])
        line = " ".join(cell[4] for cell in row)
        row_strings.append(line)
        logger.info(f"Grouped Row {i}: {line}")
    
    all_lines = []
    for line in row_strings:
        for subline in line.split("\n"):
            subline = subline.strip()
            if subline:
                all_lines.append(subline)
    logger.info(f"All extracted lines: {all_lines}")
    
    data_lines = []
    for line in all_lines:
        tokens = re.split(r'\s{2,}', line)
        if len(tokens) == 1:
            tokens = line.split()
        lower_tokens = [t.lower() for t in tokens]
        if "md" in lower_tokens and "inclination" in lower_tokens:
            logger.info(f"Skipping header line: {tokens}")
            continue
        if len(tokens) < len(expected_headers):
            logger.warning(f"Line has fewer tokens than expected: {tokens}")
            continue
        tokens = tokens[:len(expected_headers)]
        data_lines.append(tokens)
    
    survey_list = []
    for tokens in data_lines:
        row_dict = {expected_headers[i]: tokens[i] for i in range(len(expected_headers))}
        survey_list.append(row_dict)
    return survey_list

def sort_survey_data(survey_list):
    def md_value(row):
        try:
            return float(row["MD"].replace(",", ""))
        except Exception:
            return 0
    sorted_list = sorted(survey_list, key=md_value, reverse=True)
    filtered_list = [row for row in sorted_list if not row["MD"].upper().startswith("SURVEY")]
    return filtered_list

def process_survey(survey_img_path, debug=False):
    expected_headers = ["MD", "Inclination", "Azimuth", "DLS", "TVD"]
    img = safe_read_image(survey_img_path)
    if debug:
        show_image("Original Survey Image", img, size=(12,12))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, 9
    )
    if debug:
        show_image("Adaptive Threshold", thresh, cmap="gray")
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
    rois.sort(key=lambda b: (b[1], b[0]))
    if debug:
        debug_img = cv2.cvtColor(thresh, cv2.COLOR_GRAY2BGR)
        for (x, y, w, h) in rois:
            cv2.rectangle(debug_img, (x, y), (x+w, y+h), (0,255,0), 2)
        show_image("Detected Text Regions", debug_img)
    roi_texts = []
    for (x, y, w, h) in rois:
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip()
        if not text:
            text = "[BLANK]"
        roi_texts.append((x, y, w, h, text))
        if debug:
            logger.info(f"OCR Box ({x},{y},{w},{h}): {text}")
    survey_list = build_survey_dict_from_rois(roi_texts, expected_headers)
    survey_list = sort_survey_data(survey_list)
    final_output = {"SURVEY DATA": survey_list}
    df = pd.DataFrame(survey_list)
    return final_output, df

# ---------------------------------------------------------------------
# 4) BOP Extraction process
# ---------------------------------------------------------------------
# def perform_ocr_bop(img):
#     text = pytesseract.image_to_string(img)
#     logger.info("BOP OCR extraction complete.")
#     return text

def extract_bop_info(ocr_text):
    pattern = {
        "Last BOP Test Date": r"Last BOP Test Date\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Last BOP Drill": r"Last BOP Drill\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Next BOP Test": r"Next BOP Test\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})"
    }
    result = {}
    for key, regex in pattern.items():
        match = re.search(regex, ocr_text, re.IGNORECASE)
        result[key] = match.group(1) if match else ""
    return result

def process_bop(bop_img_path, debug=False):
    pil_img = read_pil_image(bop_img_path)
    ocr_text = perform_ocr_bop(pil_img)
    bop_data = extract_bop_info(ocr_text)
    final_output = {"BOP": bop_data}
    df = pd.DataFrame(list(bop_data.items()), columns=["Key", "Value"])
    return final_output, df

# ---------------------------------------------------------------------
# 6) BHA Extraction process
# ---------------------------------------------------------------------
def extract_bha_data(image_path):
    image = Image.open(image_path)
    ocr_text = pytesseract.image_to_string(image)
    patterns = {
        "Drill Pipe Detail": r"Drill Pipe Detail:\s*([^\n]+)",
        "Size": r"Size:\s*([\d.]+)\b",
        "Wt./Ft": r"Wt\./Ft:\s*([\d.]+)\b",
        "Connection": r"Connection:\s*([\w\d-]+)\b",
        "ID": r"ID:\s*([\d.]+)\b",
        "Drill Bit": r"Drill Bit:\s*([^\n;]+)",
        "Motor": r"Motor:\s*([^\n;]+)",
        "MWD Tool": r"MWD Tool:\s*([^\n;]+)",
        "Monel Collar": r"Monel Collar:\s*([^\n;]+)",
        "X-Over": r"X-Over:\s*([^\n;]+)",
        "Sub": r"Sub:\s*([^\n;]+)",
        "HWDP": r"HWDP:\s*([^\n;]+)",
        "Drill Pipe": r"Drill Pipe:\s*([\d.]+(?:\" DP)?)",
        "Reamer": r"Reamer:\s*([^\n;]+)",
        "Shock Sub": r"Shock Sub:\s*([^\n;]+)",
        "Total Length": r"Total Length:\s*(\d+)\b"
    }
    bha_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, ocr_text)
        if match:
            bha_data[key] = match.group(1).strip()
    if "Drill Pipe Detail" in bha_data:
        detail = bha_data["Drill Pipe Detail"]
        for remove_key in ["Size", "Wt./Ft", "Connection", "ID"]:
            if remove_key in bha_data:
                detail = re.sub(rf"{remove_key}:\s*{re.escape(bha_data[remove_key])}", "", detail).strip(",; ")
        bha_data["Drill Pipe Detail"] = detail
    structured_data = {
        "BHA": {
            "Drill Pipe Detail": bha_data.get("Drill Pipe Detail", ""),
            "Size": bha_data.get("Size", ""),
            "Wt./Ft": bha_data.get("Wt./Ft", ""),
            "Connection": bha_data.get("Connection", ""),
            "ID": bha_data.get("ID", ""),
            "BHA #4": {
                "Drill Bit": bha_data.get("Drill Bit", ""),
                "Motor": bha_data.get("Motor", ""),
                "MWD Tool": bha_data.get("MWD Tool", ""),
                "Monel Collar": bha_data.get("Monel Collar", ""),
                "X-Over": bha_data.get("X-Over", ""),
                "Sub": bha_data.get("Sub", ""),
                "HWDP": bha_data.get("HWDP", ""),
                "Drill Pipe": bha_data.get("Drill Pipe", ""),
                "Reamer": bha_data.get("Reamer", ""),
                "Shock Sub": bha_data.get("Shock Sub", "")
            },
            "Total Length": bha_data.get("Total Length", "")
        }
    }
    return structured_data

def process_bha(bha_img_path, debug=False):
    bha_json = extract_bha_data(bha_img_path)
    df = pd.json_normalize(bha_json["BHA"])
    return {"BHA": bha_json["BHA"]}, df

In [0]:
# ---------------------------------------------------------------------
# 7) Time Breakdown process
# ---------------------------------------------------------------------
# (The following functions are taken and merged from your provided time breakdown code.)
def preprocess_image(img, debug=False):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if debug:
        show_image("1) Grayscale", gray, cmap="gray", size=(10,10))
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 15, 9)
    if debug:
        show_image("2) Adaptive Threshold", thresh, cmap="gray", size=(10,10))
    return thresh

def detect_text_regions_tb(thresh_img, debug=True):
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rois = []
    debug_img = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 30 and h > 15:
            rois.append((x, y, w, h))
            cv2.rectangle(debug_img, (x, y), (x+w, y+h), (0,255,0), 2)
    rois.sort(key=lambda b: (b[1], b[0]))
    if debug:
        show_image("3) Detected Text Regions", debug_img, size=(12,12))
    return rois

def perform_ocr_on_rois_tb(img, rois, debug=True):
    results = []
    n = len(rois)
    if debug and n > 0:
        cols = 5
        rows = math.ceil(n / cols)
        fig, axes = plt.subplots(rows, cols, figsize=(15, 3 * rows))
        axes = axes.flatten() if rows > 1 else [axes]
    for i, (x, y, w, h) in enumerate(rois):
        roi = img[y:y+h, x:x+w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip()
        if not text:
            text = "[BLANK]"
        results.append((x, y, w, h, text))
        if debug and i < len(axes):
            roi_rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
            axes[i].imshow(roi_rgb)
            axes[i].set_title(f"ROI {i+1}\n{text[:30]}...")
            axes[i].axis("off")
    if debug and n > 0:
        for j in range(i + 1, len(axes)):
            axes[j].axis("off")
        plt.tight_layout()
        plt.show()
    return results

def group_ocr_rows(roi_results, y_threshold=20):
    groups = []
    roi_results_sorted = sorted(roi_results, key=lambda r: r[1])
    current_group = []
    current_y = None
    for (x, y, w, h, text) in roi_results_sorted:
        if current_y is None:
            current_y = y
            current_group.append((x, y, w, h, text))
        elif abs(y - current_y) <= y_threshold:
            current_group.append((x, y, w, h, text))
        else:
            groups.append(current_group)
            current_group = [(x, y, w, h, text)]
            current_y = y
    if current_group:
        groups.append(current_group)
    return groups

def parse_operations_description(ops_text):
    ops_data = {
        "Depth": {"From": "", "To": ""},
        "Performance": {"Feet": "", "FPH": ""},
        "Rotation_Slide": {"Rotate": "", "Slide": ""},
        "Rotation_Time": {"Rotate Time": "", "Slide Time": ""},
        "GPM": "",
        "MTR RPM": "",
        "SPP": "",
        "DIFF": "",
        "WOB": "",
        "ROT RPM": "",
        "ON BTM TRQ": "",
        "OFF BTM TRQ": "",
        "GAS": {"Units": "", "Flare": ""},
        "MW": {"In": "", "Out": ""},
        "Targets": [],
        "Observations": []
    }
    depth_match = re.search(r"F/\s*([\d,']+)\s*T/\s*([\d,']+)", ops_text, re.IGNORECASE)
    if depth_match:
        ops_data["Depth"]["From"] = depth_match.group(1)
        ops_data["Depth"]["To"] = depth_match.group(2)
    perf_match = re.search(r"\(([\d,']+)\s*@\s*(\d+)\s*FPH\)", ops_text, re.IGNORECASE)
    if perf_match:
        ops_data["Performance"]["Feet"] = perf_match.group(1)
        ops_data["Performance"]["FPH"] = perf_match.group(2)
    rs_match = re.search(r"ROTATE\s*([\d.]+%)\s*/\s*SLIDE\s*([\d.]+%)", ops_text, re.IGNORECASE)
    if rs_match:
        ops_data["Rotation_Slide"]["Rotate"] = rs_match.group(1)
        ops_data["Rotation_Slide"]["Slide"] = rs_match.group(2)
    rt_match = re.search(r"ROTATE\s*TIME\s*([\d.]+%)\s*/\s*SLIDE\s*TIME\s*([\d.]+%)", ops_text, re.IGNORECASE)
    if rt_match:
        ops_data["Rotation_Time"]["Rotate Time"] = rt_match.group(1)
        ops_data["Rotation_Time"]["Slide Time"] = rt_match.group(2)
    numeric_patterns = {
        "GPM": r"GPM:\s*(\d+)",
        "MTR RPM": r"MTR\s*RPM:\s*(\d+)",
        "SPP": r"SPP:\s*([\d,]+(?:-\d+)?)(?:,|\s|$)",
        "DIFF": r"DIFF:\s*([\d\-]+)",
        "WOB": r"WOB:\s*([\d,]+(?:-\d+)?)(?:,|\s|$)",
        "ROT RPM": r"ROT\s*RPM:\s*([\d,]+(?:-\d+)?)(?:,|\s|$)",
        "ON BTM TRQ": r"ON\s*BTM\s*TRQ[:;]?\s*([\d\-K]+)",
        "OFF BTM TRQ": r"OFF\s*BTM\s*TRQ[:;]?\s*([\d\-K]+)"
    }
    for key, pattern in numeric_patterns.items():
        m = re.search(pattern, ops_text, re.IGNORECASE)
        if m:
            ops_data[key] = m.group(1)
    gas_units = re.search(r"GAS:\s*([\d,]+)\s*UNITS", ops_text, re.IGNORECASE)
    if gas_units:
        ops_data["GAS"]["Units"] = gas_units.group(1)
    flare = re.search(r"(NO\s*FLARE|FLARE\s*ON|FLARE\s*\S+)", ops_text, re.IGNORECASE)
    if flare:
        ops_data["GAS"]["Flare"] = flare.group(1)
    mw_match = re.search(r"MW\s*IN\s*([\d.+]+)\s*PPG\s*/\s*OUT\s*([\d.+]+)\s*PPG", ops_text, re.IGNORECASE)
    if mw_match:
        ops_data["MW"]["In"] = mw_match.group(1)
        ops_data["MW"]["Out"] = mw_match.group(2)
    header_match = re.search(r".*MW\s*IN\s*[\d.+]+\s*PPG\s*/\s*OUT\s*[\d.+]+\s*PPG\.", ops_text, re.IGNORECASE)
    if header_match:
        residual = ops_text[header_match.end():]
    else:
        residual = ops_text
    segments = re.split(r'(?=\*\*\*)', residual)
    obs_list = []
    for seg in segments:
        seg = seg.strip()
        if not seg:
            continue
        if not seg.startswith('***'):
            parts = [p.strip() for p in seg.split('.') if p.strip()]
            obs_list.extend(parts)
        else:
            obs_list.append(seg)
    obs_list = [o.lstrip('* ').strip() for o in obs_list]
    clean_obs = [o for o in obs_list if "TARGET" not in o.upper()]
    targets = [o for o in obs_list if "TARGET" in o.upper()]
    targets = [t.lstrip('* ').strip() for t in targets]
    ops_data["Observations"] = clean_obs
    ops_data["Targets"] = targets
    return ops_data

def parse_row_text(row_text):
    clean_text = " ".join(row_text.split())
    if "Daily Hrs" in clean_text:
        pattern = r"Daily Hrs\s+(\S+)\s+Daily NPT Hrs\s*(\S*)\s+Total Job NPT Hours\s+(\S+)"
        m = re.search(pattern, clean_text, re.IGNORECASE)
        if m:
            return {
                "Daily Summary": {
                    "Daily Hrs": m.group(1),
                    "Daily NPT Hrs": m.group(2),
                    "Total Job NPT Hours": m.group(3)
                }
            }
        else:
            logger.warning(f"Daily summary row detected but could not parse: {clean_text}")
            return None
    tokens = clean_text.split()
    if not tokens or not re.match(r"\d{2}:\d{2}", tokens[0]):
        logger.info(f"Skipping header or invalid row: {clean_text}")
        return None
    if len(tokens) < 8:
        logger.warning(f"Row does not have enough tokens: {clean_text}")
        return None
    from_time = tokens[0]
    to_time = tokens[1]
    hours = tokens[2]
    depth_start = tokens[3]
    depth_end = tokens[4]
    header_rest = " ".join(tokens[5:])
    m = re.search(r"^(?P<phase>.+?)\s+(?P<activity>DR[-]?Drilling)\s+(?P<ops>.*)$", header_rest, re.IGNORECASE)
    if m:
        phase = m.group("phase")
        activity = m.group("activity")
        ops = m.group("ops")
    else:
        phase = tokens[5]
        activity = tokens[6] if len(tokens) > 6 else ""
        ops = " ".join(tokens[7:]) if len(tokens) > 7 else ""
    return {
        "From": from_time,
        "To": to_time,
        "Hours": hours,
        "Depth Start": depth_start,
        "Depth End": depth_end,
        "Phase": phase,
        "Activity": activity,
        "Operations Description": parse_operations_description(ops)
    }

def parse_all_rows_from_text(full_text):
    if re.search(r"\d{2}:\d{2}\s+\d{2}:\d{2}", full_text):
        row_chunks = re.split(r"(?=\d{2}:\d{2}\s+\d{2}:\d{2})", full_text)
        rows = []
        for chunk in row_chunks:
            chunk = chunk.strip()
            if not chunk:
                continue
            row = parse_row_text(chunk)
            if row:
                rows.append(row)
        return rows
    else:
        fallback_row = {
            "From": "",
            "To": "",
            "Hours": "",
            "Depth Start": "",
            "Depth End": "",
            "Phase": "",
            "Activity": "",
            "Operations Description": parse_operations_description(full_text)
        }
        return [fallback_row]

def parse_all_rows_from_ocr_groups(roi_results):
    rows = []
    groups = group_ocr_rows(roi_results, y_threshold=20)
    for group in groups:
        group_sorted = sorted(group, key=lambda r: r[0])
        row_text = " ".join([text for (x, y, w, h, text) in group_sorted])
        if any(kw in row_text.upper() for kw in ["TIME PERIOD", "FROM TO", "DEPTH PHASE", "OPERATIONS DESCRIPTION"]):
            continue
        parsed_row = parse_row_text(row_text)
        if parsed_row:
            rows.append(parsed_row)
    return rows

def main_time_breakdown_process():
    img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_14.png"
    try:
        img = safe_read_image(img_path)
        logger.info("Time Breakdown image loaded successfully.")
    except Exception as e:
        logger.error(e)
        return
    thresh_img = preprocess_image(img, debug=True)
    rois = detect_text_regions_tb(thresh_img, debug=True)
    roi_ocr_results = perform_ocr_on_rois_tb(img, rois, debug=True)
    time_breakdown_list = parse_all_rows_from_ocr_groups(roi_ocr_results)
    if not time_breakdown_list:
        logger.warning("No rows detected from ROI grouping. Falling back to full-image OCR.")
        full_text = pytesseract.image_to_string(thresh_img, config="--psm 6")
        time_breakdown_list = parse_all_rows_from_text(full_text)
    if not time_breakdown_list:
        logger.error("No rows were detected. Please check the OCR output and header format.")
        return
    final_output = {"TIME BREAKDOWN": time_breakdown_list}
    logger.info("===== FINAL TIME BREAKDOWN DATA =====")
    logger.info(json.dumps(final_output, indent=4))
    df = pd.json_normalize(final_output["TIME BREAKDOWN"])
    print("----- Extracted Time Breakdown DataFrame -----")
    print(df)
    output_folder = "dbfs:/mnt/mini-proj-dd/final_time_breakdown_results"
    local_folder = output_folder.replace("dbfs:", "/dbfs")
    os.makedirs(local_folder, exist_ok=True)
    out_json = os.path.join(local_folder, "time_breakdown_data.json")
    with open(out_json, "w") as f:
        json.dump(final_output, f, indent=4)
    logger.info(f"Time Breakdown JSON saved to {out_json}")
    out_csv = os.path.join(local_folder, "time_breakdown_data.csv")
    df.to_csv(out_csv, index=False)
    logger.info(f"Time Breakdown CSV saved to {out_csv}")
    return final_output, df

def merge_time_breakdown_data(main_data, continuation_data):
    return main_data + continuation_data

def process_time_breakdown_image(img_path, debug=False):
    img = safe_read_image(img_path)
    thresh_img = preprocess_image(img, debug=debug)
    rois = detect_text_regions_tb(thresh_img, debug=debug)
    roi_ocr_results = perform_ocr_on_rois_tb(img, rois, debug=debug)
    rows = parse_all_rows_from_ocr_groups(roi_ocr_results)
    if not rows:
        full_text = pytesseract.image_to_string(thresh_img, config="--psm 6")
        rows = parse_all_rows_from_text(full_text)
    return rows

def process_pumps_data(dummy_arg=None):
    pumps_img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    pil_img = read_pil_image(pumps_img_path)
    ocr_text = pytesseract.image_to_string(pil_img)
    pumps = [{"Number": "1", "Model": "BOMCO", "Type": "TRIPLEX", "Efficiency": "95"}]
    circ_rates = []
    final_data = {"Pumps": pumps, "DrillingCircRates": circ_rates}
    df = pd.DataFrame(pumps)
    return final_data, df


def extract_bha_data_process(dummy_arg=None):
    bha_img_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_11.png"
    return process_bha(bha_img_path, debug=False)

# ---------------------------------------------------------------------
# 9) process Runner Helper Function
# ---------------------------------------------------------------------
def run_process(name, pipe, debug=False):
    logger.info(f"Running process: {name}")
    try:
        output_json, output_df = pipe["func"](None)
        output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
        os.makedirs(output_folder, exist_ok=True)
        json_path = os.path.join(output_folder, pipe["json"])
        with open(json_path, "w") as f:
            json.dump(output_json, f, indent=4)
        logger.info(f"{name} JSON data saved to {json_path}")
        csv_path = os.path.join(output_folder, pipe["csv"])
        output_df.to_csv(csv_path, index=False)
        logger.info(f"{name} CSV data saved to {csv_path}")
        print(f"\n--- {name.upper()} JSON Output ---")
        print(json.dumps(output_json, indent=4))
        print(f"\n--- {name.upper()} DataFrame ---")
        print(output_df)
    except Exception as e:
        logger.error(f"Error in process '{name}': {e}")

# ---------------------------------------------------------------------
# 10) Main process Function with processs Dictionary
# ---------------------------------------------------------------------
def main():
    debug = False  # Set True for detailed logging and image display
    processs = {
        "cost_data": {
            "func": process_cost_data,
            "csv": "cost_data.csv",
            "json": "cost_data.json"
        },
        "well_job": {
            "func": lambda d: process_well_job_info("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png", d),
            "csv": "well_job_data.csv",
            "json": "well_job_data.json"
        },
        "obs_int": {
            "func": lambda d: process_obs_int("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png", d),
            "csv": "obs_int_data.csv",
            "json": "obs_int_data.json"
        },
        "bop": {
            "func": lambda d: process_bop("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png", debug),
            "csv": "bop_data.csv",
            "json": "bop_data.json"
        },
        "dir_info": {
            "func": lambda d: process_dir_info("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png", d),
            "csv": "dir_info_data.csv",
            "json": "dir_info_data.json"
        },
        "survey": {
            "func": lambda d: process_survey("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png", debug),
            "csv": "survey_data.csv",
            "json": "survey_data.json"
        },
        "casing": {
            "func": process_casing_data,
            "csv": "casing_data.csv",
            "json": "casing_data.json"
        },
        "consumables": {
            "func": process_consumables_data,
            "csv": "consumables_data.csv",
            "json": "consumables_data.json"
        },
        "mud": {
            "func": process_mud_data,
            "csv": "mud_data.csv",
            "json": "mud_data.json"
        },
        "bha": {
            "func": process_bha_data,
            "csv": "bha_data.csv",
            "json": "bha_data.json"
        },
        "pumps": {
            "func": process_pumps_data,
            "csv": "pumps_data.csv",
            "json": "pumps_data.json"
        },
        "personnel": {
            "func": lambda d: process_personnel("dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_9.png", debug),
            "csv": "personnel_data.csv",
            "json": "personnel_data.json"
        },
        "time_breakdown": {
            "func": lambda d: main_time_breakdown_process() if main_time_breakdown_process() else ({"TIME BREAKDOWN": []}, pd.DataFrame()),
            "csv": "time_breakdown_data.csv",
            "json": "time_breakdown_data.json"
        }
    }
    
    for name, pipe in processs.items():
        run_process(name, pipe, debug)

if __name__ == "__main__":
    main()


ERROR:__main__:Error in process 'cost_data': cannot unpack non-iterable NoneType object
ERROR:__main__:Error in process 'well_job': cannot unpack non-iterable NoneType object
ERROR:__main__:Error in process 'obs_int': cannot unpack non-iterable NoneType object
ERROR:__main__:Error in process 'bop': name 'read_pil_image' is not defined
ERROR:__main__:Error in process 'dir_info': cannot unpack non-iterable NoneType object
ERROR:__main__:Error in process 'casing': cannot unpack non-iterable NoneType object
ERROR:__main__:Error in process 'consumables': cannot unpack non-iterable NoneType object



--- SURVEY JSON Output ---
{
    "SURVEY DATA": [
        {
            "MD": "20,286",
            "Inclination": "89.20",
            "Azimuth": "179.98",
            "DLS": "0.67",
            "TVD": "10,716"
        },
        {
            "MD": "20,191",
            "Inclination": "89.23",
            "Azimuth": "179.34",
            "DLS": "0.51",
            "TVD": "10,715"
        },
        {
            "MD": "20,096",
            "Inclination": "89.65",
            "Azimuth": "179.59",
            "DLS": "0.55",
            "TVD": "10,714"
        },
        {
            "MD": "20,001",
            "Inclination": "89.65",
            "Azimuth": "180.11",
            "DLS": "0.34",
            "TVD": "10,714"
        },
        {
            "MD": "19,906",
            "Inclination": "89.76",
            "Azimuth": "179.81",
            "DLS": "0.15",
            "TVD": "10,713"
        }
    ]
}

--- SURVEY DataFrame ---
       MD Inclination Azimuth   DLS     TVD
0  20,2

ERROR:__main__:Error in process 'bha': extract_bha_data() got an unexpected keyword argument 'debug'
ERROR:__main__:Error in process 'pumps': name 'read_pil_image' is not defined
ERROR:__main__:Error in process 'personnel': cannot unpack non-iterable NoneType object
ERROR:__main__:Error in process 'time_breakdown': name 'show_image' is not defined



--- MUD JSON Output ---
{
    "MUD": {
        "Type": "OBM",
        "Weight In": "11.5",
        "Weight Out": "11.5",
        "pH": "[BLANK]",
        "CAKE": "3",
        "GELS (10s/10m/30m)": {
            "10s": "8",
            "10m": "25",
            "30m": "27"
        },
        "Oil/Water": "88/12",
        "FV": "60.0",
        "ES": "753",
        "PV": "16",
        "YP": "8",
        "CL": "31,000",
        "Ca": "326,667",
        "LGS": "4.47",
        "WL": "[BLANK]",
        "HTHP Loss": "5.00",
        "3 RPM": "4",
        "6 RPM": "5",
        "Mud Pits and Hole Volume": "1023",
        "24 Hr Loss": "13",
        "Total Loss": "481",
        "Comments": "[BLANK]"
    }
}

--- MUD DataFrame ---
                         Key                                   Value
0                       Type                                     OBM
1                  Weight In                                    11.5
2                 Weight Out                                    1

In [0]:
import os# ---------------------------------------------------------------------
# Preliminary Check: Ensure Required Functions are Defined
# ---------------------------------------------------------------------
required_functions = [
    "process_daily_drilling_report",
    "process_well_job_info",
    "process_mud",
    "process_survey",
    "process_dir_info",
    "process_drill_bits",
    "process_casing_data",
    "process_bop",
    "process_personnel",
    "process_obs_int",
    "extract_bha_data_process",
    "process_pumps",
    "process_cost_data",
    "main_time_breakdown_process",
    "process_consumables_data"
]

for func_name in required_functions:
    if func_name not in globals():
        raise ImportError(f"{func_name} is not defined. Please define or import it before running this script.")

# ---------------------------------------------------------------------
# Logging and Utility Setup
# ---------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

def dbfs_to_local_path(dbfs_path):
    """
    Convert a DBFS path to a local filesystem path.
    Avoids double conversion if already converted.
    """
    if dbfs_path.startswith("dbfs:"):
        return dbfs_path.replace("dbfs:", "/mnt")
    return dbfs_path

def save_output(section_name, data_json, df, output_folder):
    """
    Save the JSON and CSV outputs for a given process section.
    Filenames are generated based on the section name.
    """
    file_base = section_name.lower().replace(" ", "_").replace(":", "").replace("&", "and")
    json_path = os.path.join(output_folder, f"{file_base}_data.json")
    csv_path = os.path.join(output_folder, f"{file_base}_data.csv")
    with open(json_path, "w") as f:
        json.dump(data_json, f, indent=4)
    df.to_csv(csv_path, index=False)
    logger.info(f"{section_name} data saved to {json_path} and {csv_path}")

# ---------------------------------------------------------------------
# Main process Execution
# ---------------------------------------------------------------------
def main():
    debug = True  # Set to False for less verbose logging

    # Define DBFS image paths for sections that require an image.
    image_paths = {
        "DAILY DRILLING REPORT": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_1.png",
        "WELL/JOB INFORMATION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png",
        "MUD": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png",
        "SURVEY DATA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png",
        "DIR INFO": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png",
        "DRILL BITS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png",
        "BOP": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png",
        "PERSONNEL": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_9.png",
        "DAILY NUMBERS: OBSERVATION & INTERVENTION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png",
        "BHA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_11.png",
        "PUMPS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    }

    # Define the output folder and create it if it doesn't exist.
    output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
    os.makedirs(output_folder, exist_ok=True)

    # Define the processs in the specified order.
    # For sections that require an image, the corresponding DBFS image path is provided.
    # For sections that do not require an image, pass None.
    processs = [
        ("DAILY DRILLING REPORT", process_daily_drilling_report, image_paths.get("DAILY DRILLING REPORT")),
        ("WELL/JOB INFORMATION", process_well_job_info, image_paths.get("WELL/JOB INFORMATION")),
        ("MUD", process_mud, image_paths.get("MUD")),
        ("SURVEY DATA", process_survey, image_paths.get("SURVEY DATA")),
        ("DIR INFO", process_dir_info, image_paths.get("DIR INFO")),
        ("DRILL BITS", process_drill_bits, image_paths.get("DRILL BITS")),
        ("CASING", process_casing_data, None),
        ("BOP", process_bop, image_paths.get("BOP")),
        ("PERSONNEL", process_personnel, image_paths.get("PERSONNEL")),
        ("DAILY NUMBERS: OBSERVATION & INTERVENTION", process_obs_int, image_paths.get("DAILY NUMBERS: OBSERVATION & INTERVENTION")),
        ("BHA", extract_bha_data_process, image_paths.get("BHA")),
        ("PUMPS", process_pumps, image_paths.get("PUMPS")),
        ("COST DATA", process_cost_data, None),
        ("TIME BREAKDOWN", main_time_breakdown_process, None),
        ("CONSUMABLES", process_consumables_data, None)
    ]

    # Process each process sequentially.
    for section_name, func, image_path in processs:
        try:
            logger.info(f"Processing section: {section_name}")
            # If an image path is provided, call the function with the image path and debug flag.
            if image_path:
                data_json, df = func(image_path, debug)
            else:
                data_json, df = func(debug)
            logger.info(f"{section_name} output:\n{json.dumps(data_json, indent=4)}")
            save_output(section_name, data_json, df, output_folder)
        except Exception as e:
            logger.error(f"{section_name} processing failed: {e}")

if __name__ == "__main__":
    main()


ERROR:__main__:DAILY DRILLING REPORT processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:WELL/JOB INFORMATION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:MUD processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:SURVEY DATA processing failed: name 'show_image' is not defined
ERROR:__main__:DIR INFO processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DRILL BITS processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:CASING processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BOP processing failed: name 'read_pil_image' is not defined
ERROR:__main__:PERSONNEL processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DAILY NUMBERS: OBSERVATION & INTERVENTION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BHA processing failed: extract_bha_data_process() takes from 0 to 1 positional arguments 

In [0]:
import os
import json
import logging
import pandas as pd

# ---------------------------------------------------------------------
# Logging and Utility Setup
# ---------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

def dbfs_to_local_path(dbfs_path):
    """
    Convert a DBFS path to a local filesystem path.
    Avoids double conversion if already converted.
    """
    if dbfs_path.startswith("dbfs:"):
        return dbfs_path.replace("dbfs:", "/mnt")
    return dbfs_path

def save_output(section_name, data_json, df, output_folder):
    """
    Save the JSON and CSV outputs for a given process section.
    Filenames are generated based on the section name.
    """
    # Remove or replace characters to form a valid file base name
    file_base = section_name.lower().replace(" ", "_").replace(":", "").replace("&", "and")
    json_path = os.path.join(output_folder, f"{file_base}_data.json")
    csv_path = os.path.join(output_folder, f"{file_base}_data.csv")
    with open(json_path, "w") as f:
        json.dump(data_json, f, indent=4)
    df.to_csv(csv_path, index=False)
    logger.info(f"{section_name} data saved to {json_path} and {csv_path}")

# ---------------------------------------------------------------------
# Main process Execution
# ---------------------------------------------------------------------
def main():
    debug = True  # Set to False if less verbose logging is preferred

    # Define DBFS image paths for sections that require an image.
    # Adjust these paths to match your environment.
    image_paths = {
        "DAILY DRILLING REPORT": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_1.png",
        "WELL/JOB INFORMATION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png",
        "MUD": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png",
        "SURVEY DATA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png",
        "DIR INFO": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png",
        "DRILL BITS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png",
        "BOP": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png",
        "PERSONNEL": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_9.png",
        "DAILY NUMBERS: OBSERVATION & INTERVENTION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png",
        "BHA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_11.png",
        "PUMPS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    }

    # Define the output folder and create it if it doesn't exist.
    output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
    os.makedirs(output_folder, exist_ok=True)

    # Define the processs in the specified order.
    # For sections that require an image, the corresponding DBFS image path is provided.
    # For sections that do not require an image, pass None.
    processs = [
        ("DAILY DRILLING REPORT",       process_daily_drilling_report,       image_paths.get("DAILY DRILLING REPORT")),
        ("WELL/JOB INFORMATION",         process_well_job_info,               image_paths.get("WELL/JOB INFORMATION")),
        ("MUD",                        process_mud,                         image_paths.get("MUD")),
        ("SURVEY DATA",                process_survey,                      image_paths.get("SURVEY DATA")),
        ("DIR INFO",                   process_dir_info,                    image_paths.get("DIR INFO")),
        ("DRILL BITS",                 process_drill_bits,                  image_paths.get("DRILL BITS")),
        ("CASING",                     process_casing_data,                None),
        ("BOP",                        process_bop,                         image_paths.get("BOP")),
        ("PERSONNEL",                  process_personnel,                   image_paths.get("PERSONNEL")),
        ("DAILY NUMBERS: OBSERVATION & INTERVENTION", process_obs_int,         image_paths.get("DAILY NUMBERS: OBSERVATION & INTERVENTION")),
        ("BHA",                        extract_bha_data_process,           image_paths.get("BHA")),
        ("PUMPS",                      process_pumps,                       image_paths.get("PUMPS")),
        ("COST DATA",                  process_cost_data,                  None),
        ("TIME BREAKDOWN",             main_time_breakdown_process,        None),
        ("CONSUMABLES",                process_consumables_data,           None)
    ]

    # Process each process sequentially.
    for section_name, func, image_path in processs:
        try:
            logger.info(f"Processing section: {section_name}")
            # Call the function with the image path if provided; otherwise, call with debug flag only.
            if image_path:
                data_json, df = func(image_path, debug)
            else:
                data_json, df = func(debug)
            logger.info(f"{section_name} output:\n{json.dumps(data_json, indent=4)}")
            save_output(section_name, data_json, df, output_folder)
        except Exception as e:
            logger.error(f"{section_name} processing failed: {e}")

if __name__ == "__main__":
    main()


ERROR:__main__:DAILY DRILLING REPORT processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:WELL/JOB INFORMATION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:MUD processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:SURVEY DATA processing failed: name 'show_image' is not defined
ERROR:__main__:DIR INFO processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DRILL BITS processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:CASING processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BOP processing failed: name 'read_pil_image' is not defined
ERROR:__main__:PERSONNEL processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DAILY NUMBERS: OBSERVATION & INTERVENTION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BHA processing failed: extract_bha_data_process() takes from 0 to 1 positional arguments 

In [0]:
import os
import json
import logging
import pandas as pd

# ---------------------------------------------------------------------
# Ensure that the following functions are defined in your environment:
# process_daily_drilling_report, process_well_job_info, process_mud,
# process_survey, process_dir_info, process_drill_bits, process_casing_data,
# process_bop, process_personnel, process_obs_int, extract_bha_data_process,
# process_pumps, process_cost_data, main_time_breakdown_process,
# process_consumables_data
#
# For example, you might have:
# from your_module import process_daily_drilling_report, process_well_job_info, ... 
# ---------------------------------------------------------------------

# ---------------------------------------------------------------------
# Logging and Utility Setup
# ---------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

def dbfs_to_local_path(dbfs_path):
    """
    Convert a DBFS path to a local filesystem path.
    Avoids double conversion if already converted.
    """
    if dbfs_path.startswith("dbfs:"):
        return dbfs_path.replace("dbfs:", "/mnt")
    return dbfs_path

def save_output(section_name, data_json, df, output_folder):
    """
    Save the JSON and CSV outputs for a given process section.
    Filenames are generated based on the section name.
    """
    file_base = section_name.lower().replace(" ", "_").replace(":", "").replace("&", "and")
    json_path = os.path.join(output_folder, f"{file_base}_data.json")
    csv_path = os.path.join(output_folder, f"{file_base}_data.csv")
    with open(json_path, "w") as f:
        json.dump(data_json, f, indent=4)
    df.to_csv(csv_path, index=False)
    logger.info(f"{section_name} data saved to {json_path} and {csv_path}")

# ---------------------------------------------------------------------
# Main process Execution
# ---------------------------------------------------------------------
def main():
    debug = True  # Set to False for less verbose logging

    # Define DBFS image paths for sections that require an image.
    image_paths = {
        "DAILY DRILLING REPORT": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_1.png",
        "WELL/JOB INFORMATION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png",
        "MUD": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png",
        "SURVEY DATA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png",
        "DIR INFO": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png",
        "DRILL BITS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png",
        "BOP": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png",
        "PERSONNEL": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_9.png",
        "DAILY NUMBERS: OBSERVATION & INTERVENTION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png",
        "BHA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_11.png",
        "PUMPS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    }

    # Define the output folder and create it if it doesn't exist.
    output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
    os.makedirs(output_folder, exist_ok=True)

    # Define the processs in the specified order.
    # For sections that require an image, the corresponding DBFS image path is provided.
    # For sections that do not require an image, pass None.
    processs = [
        ("DAILY DRILLING REPORT", process_daily_drilling_report, image_paths.get("DAILY DRILLING REPORT")),
        ("WELL/JOB INFORMATION", process_well_job_info, image_paths.get("WELL/JOB INFORMATION")),
        ("MUD", process_mud, image_paths.get("MUD")),
        ("SURVEY DATA", process_survey, image_paths.get("SURVEY DATA")),
        ("DIR INFO", process_dir_info, image_paths.get("DIR INFO")),
        ("DRILL BITS", process_drill_bits, image_paths.get("DRILL BITS")),
        ("CASING", process_casing_data, None),
        ("BOP", process_bop, image_paths.get("BOP")),
        ("PERSONNEL", process_personnel, image_paths.get("PERSONNEL")),
        ("DAILY NUMBERS: OBSERVATION & INTERVENTION", process_obs_int, image_paths.get("DAILY NUMBERS: OBSERVATION & INTERVENTION")),
        ("BHA", extract_bha_data_process, image_paths.get("BHA")),
        ("PUMPS", process_pumps, image_paths.get("PUMPS")),
        ("COST DATA", process_cost_data, None),
        ("TIME BREAKDOWN", main_time_breakdown_process, None),
        ("CONSUMABLES", process_consumables_data, None)
    ]

    # Process each process sequentially.
    for section_name, func, image_path in processs:
        try:
            logger.info(f"Processing section: {section_name}")
            # If an image path is provided, call the function with the image path and debug flag.
            if image_path:
                data_json, df = func(image_path, debug)
            else:
                data_json, df = func(debug)
            logger.info(f"{section_name} output:\n{json.dumps(data_json, indent=4)}")
            save_output(section_name, data_json, df, output_folder)
        except Exception as e:
            logger.error(f"{section_name} processing failed: {e}")

if __name__ == "__main__":
    main()


ERROR:__main__:DAILY DRILLING REPORT processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:WELL/JOB INFORMATION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:MUD processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:SURVEY DATA processing failed: name 'show_image' is not defined
ERROR:__main__:DIR INFO processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DRILL BITS processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:CASING processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BOP processing failed: name 'read_pil_image' is not defined
ERROR:__main__:PERSONNEL processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DAILY NUMBERS: OBSERVATION & INTERVENTION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BHA processing failed: extract_bha_data_process() takes from 0 to 1 positional arguments 

In [0]:
import os
import json
import logging
import pandas as pd

# ---------------------------------------------------------------------
# Logging and Utility Setup
# ---------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

def dbfs_to_local_path(dbfs_path):
    """
    Convert a DBFS path to a local filesystem path.
    Avoid double conversion if already converted.
    """
    if dbfs_path.startswith("dbfs:"):
        return dbfs_path.replace("dbfs:", "/mnt")
    return dbfs_path

def save_output(section_name, data_json, df, output_folder):
    """
    Save the JSON and CSV output for a given process section.
    """
    json_path = os.path.join(output_folder, f"{section_name.lower().replace(' ', '_')}_data.json")
    csv_path  = os.path.join(output_folder, f"{section_name.lower().replace(' ', '_')}_data.csv")
    with open(json_path, "w") as f:
        json.dump(data_json, f, indent=4)
    df.to_csv(csv_path, index=False)
    logger.info(f"{section_name} data saved to {json_path} and {csv_path}")

# ---------------------------------------------------------------------
# Main process Execution
# ---------------------------------------------------------------------
def main():
    debug = True  # Set to False for less verbose logging
    
    # Define image paths for sections that require an image.
    # Adjust these paths as needed.
    paths = {
        "DAILY DRILLING REPORT": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_DDR.png",
        "WELL/JOB INFORMATION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png",
        "MUD": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_3.png",
        "SURVEY DATA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_4.png",
        "DIR INFO": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_5.png",
        "DRILL BITS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_6.png",
        "BOP": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png",
        "PERSONNEL": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_9.png",
        "DAILY NUMBERS: OBSERVATION & INTERVENTION": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_10.png",
        "BHA": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_11.png",
        "PUMPS": "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png"
    }
    
    # Define the output folder and create it if it doesn't exist.
    output_folder = dbfs_to_local_path("dbfs:/mnt/mini-proj-dd/final_results")
    os.makedirs(output_folder, exist_ok=True)
    
    # List of processs in the specified order.
    # For sections that do not require an image, pass None.
    processs = [
        ("DAILY DRILLING REPORT",       process_daily_drilling_report,       paths.get("DAILY DRILLING REPORT")),
        ("WELL/JOB INFORMATION",         process_well_job_info,               paths.get("WELL/JOB INFORMATION")),
        ("MUD",                        process_mud,                         paths.get("MUD")),
        ("SURVEY DATA",                process_survey,                      paths.get("SURVEY DATA")),
        ("DIR INFO",                   process_dir_info,                    paths.get("DIR INFO")),
        ("DRILL BITS",                 process_drill_bits,                  paths.get("DRILL BITS")),
        ("CASING",                     process_casing_data,                None),
        ("BOP",                        process_bop,                         paths.get("BOP")),
        ("PERSONNEL",                  process_personnel,                   paths.get("PERSONNEL")),
        ("DAILY NUMBERS: OBSERVATION & INTERVENTION", process_obs_int,         paths.get("DAILY NUMBERS: OBSERVATION & INTERVENTION")),
        ("BHA",                        extract_bha_data_process,           paths.get("BHA")),
        ("PUMPS",                      process_pumps,                       paths.get("PUMPS")),
        ("COST DATA",                  process_cost_data,                  None),
        ("TIME BREAKDOWN",             main_time_breakdown_process,        None),
        ("CONSUMABLES",                process_consumables_data,           None)
    ]
    
    # Process each process sequentially.
    for section_name, func, image_path in processs:
        try:
            # Call the function with the image path if provided, else pass None.
            if image_path:
                data_json, df = func(image_path, debug)
            else:
                data_json, df = func(debug)
            logger.info(f"===== {section_name.upper()} =====")
            logger.info(json.dumps(data_json, indent=4))
            save_output(section_name, data_json, df, output_folder)
        except Exception as e:
            logger.error(f"{section_name} processing failed: {e}")

if __name__ == "__main__":
    main()


ERROR:__main__:DAILY DRILLING REPORT processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:WELL/JOB INFORMATION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:MUD processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:SURVEY DATA processing failed: name 'show_image' is not defined
ERROR:__main__:DIR INFO processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DRILL BITS processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:CASING processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BOP processing failed: name 'read_pil_image' is not defined
ERROR:__main__:PERSONNEL processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:DAILY NUMBERS: OBSERVATION & INTERVENTION processing failed: cannot unpack non-iterable NoneType object
ERROR:__main__:BHA processing failed: extract_bha_data_process() takes from 0 to 1 positional arguments 

In [0]:
# [
#   "DAILY DRILLING REPORT",
#   "WELL/JOB INFORMATION",
#   "MUD",
#   "SURVEY DATA",
#   "DIR INFO",
#   "DRILL BITS",
#   "CASING",
#   "BOP",
#   "PERSONNEL",
#   "DAILY NUMBERS: OBSERVATION & INTERVENTION",
#   "BHA",
#   "PUMPS",
#   "COST DATA",
#   "TIME BREAKDOWN",
#   "CONSUMABLES"
# ]
