In [0]:
#%run ./init

In [0]:
import pytesseract
import re
import json
from PIL import Image

def extract_bha_data(image_path, bha_label="BHA #4"):
    """
    OCR a BHA section image and return its labelled fields as a nested dict.

    Args:
        image_path: Path of the image file handed to ``PIL.Image.open``.
        bha_label: Key under which the component fields (Drill Bit, Motor, ...)
            are nested. Defaults to ``"BHA #4"``, the label that used to be
            hard-coded here.

    Returns:
        dict: ``{"BHA": {...}}`` holding the drill-pipe header fields first,
        then the component sub-dict stored under ``bha_label``, then
        ``"Total Length"``. Every value is a string; a label that is not found
        in the OCR text yields ``""``.
    """
    # Context manager so the image's file handle is released once OCR is done.
    with Image.open(image_path) as image:
        ocr_text = pytesseract.image_to_string(image)

    # One regex per label; group 1 is the value. Only the first hit is used.
    patterns = {
        "Drill Pipe Detail": r"Drill Pipe Detail:\s*([^\n]+)",
        "Size": r"Size:\s*([\d.]+)\b",
        "Wt./Ft": r"Wt\./Ft:\s*([\d.]+)\b",
        "Connection": r"Connection:\s*([\w\d-]+)\b",
        "ID": r"ID:\s*([\d.]+)\b",
        "Drill Bit": r"Drill Bit:\s*([^\n;]+)",
        "Motor": r"Motor:\s*([^\n;]+)",
        "MWD Tool": r"MWD Tool:\s*([^\n;]+)",
        "Monel Collar": r"Monel Collar:\s*([^\n;]+)",
        "X-Over": r"X-Over:\s*([^\n;]+)",
        # The lookbehind stops "Sub:" from matching the tail of "Shock Sub:",
        # which would otherwise copy the Shock Sub value into the Sub field.
        "Sub": r"(?<!Shock\s)Sub:\s*([^\n;]+)",
        "HWDP": r"HWDP:\s*([^\n;]+)",
        "Drill Pipe": r"Drill Pipe:\s*([\d.]+(?:\" DP)?)",
        "Reamer": r"Reamer:\s*([^\n;]+)",
        "Shock Sub": r"Shock Sub:\s*([^\n;]+)",
        "Total Length": r"Total Length:\s*(\d+)\b",
    }

    bha_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, ocr_text)
        if match:
            bha_data[key] = match.group(1).strip()

    # "Drill Pipe Detail" captures the whole line, so the fields that are also
    # extracted on their own are cut out of it to avoid reporting them twice.
    detail = bha_data.get("Drill Pipe Detail")
    if detail is not None:
        for key in ("Size", "Wt./Ft", "Connection", "ID"):
            if key in bha_data:
                # Escape the label too: the "." in "Wt./Ft" must be literal.
                duplicate = rf"{re.escape(key)}:\s*{re.escape(bha_data[key])}"
                detail = re.sub(duplicate, "", detail).strip(",; ")
        bha_data["Drill Pipe Detail"] = detail

    header_keys = ("Drill Pipe Detail", "Size", "Wt./Ft", "Connection", "ID")
    component_keys = (
        "Drill Bit", "Motor", "MWD Tool", "Monel Collar", "X-Over",
        "Sub", "HWDP", "Drill Pipe", "Reamer", "Shock Sub",
    )

    # Insertion order fixes the JSON layout: header fields, components, total.
    bha = {key: bha_data.get(key, "") for key in header_keys}
    bha[bha_label] = {key: bha_data.get(key, "") for key in component_keys}
    bha["Total Length"] = bha_data.get("Total Length", "")

    return {"BHA": bha}

def main(image_path="/dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_11.png"):
    """
    Extract the BHA fields from one image and print them as indented JSON.

    Args:
        image_path: Image passed to ``extract_bha_data``. Defaults to the
            section image this notebook was written against, so calling
            ``main()`` with no argument behaves as before.
    """
    bha_json = extract_bha_data(image_path)
    print(json.dumps(bha_json, indent=4))

# Run the BHA extraction only when this code executes as the top-level
# script/cell, not when it is imported as a module.
if __name__ == "__main__":
    main()


In [0]:
import os
import re
import pytesseract
import logging
import json
import pandas as pd
from PIL import Image

# ------------------------------------------------------------------
# 1) Minimal Logger Configuration
# ------------------------------------------------------------------
# Named logger used by the functions in this cell; INFO and above are emitted.
logger = logging.getLogger("PumpExtractor")
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object for the same name, so a handler is
# attached only when none exists yet; re-running the cell would otherwise add
# another handler and print every message more than once.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Output looks like "INFO: <message>".
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)

# ------------------------------------------------------------------
# 2) Read Image
# ------------------------------------------------------------------
def read_image(image_path):
    """
    Open an image from a local path or a ``dbfs:/`` URI and return a PIL Image.

    Args:
        image_path: A filesystem path, or a URI starting with ``dbfs:/``,
            which is rewritten to the matching ``/dbfs/...`` path.

    Returns:
        The object returned by ``PIL.Image.open`` for the resolved path.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    scheme = "dbfs:"
    if image_path.startswith(scheme + "/"):
        # Drop only the leading scheme. str.replace would also delete any
        # later "dbfs:" inside the path and point at the wrong file.
        # "dbfs:/mnt/x.png" -> "/dbfs/mnt/x.png"
        local_path = "/dbfs" + image_path[len(scheme):]
    else:
        # Anything else is taken as an ordinary local path.
        local_path = image_path

    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")

    img = Image.open(local_path)
    logger.info(f"Image loaded from {local_path} with size {img.size}")
    return img

# ------------------------------------------------------------------
# 3) Perform OCR
# ------------------------------------------------------------------
def perform_ocr(img):
    """
    Run pytesseract on an image and hand back the recognised text.

    Args:
        img: Image object forwarded unchanged to ``pytesseract.image_to_string``.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for ``img``.
    """
    recognised = pytesseract.image_to_string(img)
    # Logged after the OCR call so the message only appears on success.
    logger.info("OCR extraction complete.")
    return recognised

# ------------------------------------------------------------------
# 4) Parse Pumps Table
# ------------------------------------------------------------------
def parse_pumps_table(ocr_text):
    """
    Parse the pump rows out of OCR text.

    A row is a line of the form (Number and HHP may be missing):
      Number Model Type    HHP  Efficiency Stroke(in) Liner(in) P-Rating(psi) P-Limit(psi) SPM Rating SPM Limit
      1      BOMCO TRIPLEX 1600 95         12.000     4.75      7500          7100         120        110

    Only lines whose model/type read "BOMCO TRIPLEX" (any letter case) are
    recognised; every other line is ignored.

    Args:
        ocr_text: Raw OCR output. ``None`` or an empty string gives ``[]``.

    Returns:
        list[dict]: One dict per matching line, in input order, keyed by the
        column names above. All values are strings; a missing Number or HHP
        is stored as ``""``.
    """
    if not ocr_text:
        return []

    pump_pattern = re.compile(
        r"^(\d+)?\s*"                # Number (optional)
        r"(BOMCO)\s+(TRIPLEX)\s+"    # Model, Type
        # HHP (optional). It must be followed by whitespace when present.
        # With the looser "(\d+)?\s*" a row lacking HHP still matched by
        # splitting the next number, e.g. Efficiency "95" became HHP "9" and
        # Efficiency "5".
        r"(?:(\d+)\s+)?"
        r"(\d+)\s+"                  # Efficiency
        r"([\d.]+)\s+"               # Stroke(in)
        r"([\d.]+)\s+"               # Liner(in)
        r"(\d+)\s+"                  # P-Rating(psi)
        r"(\d+)\s+"                  # P-Limit(psi)
        r"(\d+)\s+"                  # SPM Rating
        r"(\d+)\s*$",                # SPM Limit
        re.IGNORECASE
    )

    # Same order as the capture groups in pump_pattern.
    columns = (
        "Number", "Model", "Type", "HHP", "Efficiency", "Stroke(in)",
        "Liner(in)", "P-Rating(psi)", "P-Limit(psi)", "SPM Rating", "SPM Limit",
    )

    pumps = []
    for line in ocr_text.splitlines():
        match = pump_pattern.match(line.strip())
        if match:
            # Unmatched optional groups come back as None; store "" instead.
            pumps.append({
                column: (value if value else "")
                for column, value in zip(columns, match.groups())
            })

    return pumps

# ------------------------------------------------------------------
# 5) Parse Drilling/Circ Rates
# ------------------------------------------------------------------
def parse_drilling_circ_rates(ocr_text):
    """
    Pull the "Drilling/Circ Rate" entries out of OCR text.

    A recognised line looks like:
      Drilling/Circ Rate 1 4325 PSI @ 134 SPM 2.63 Gal/Stoke 351.76 GPM 8.38 BPM 468.11 DC 340.61 DP

    Each line is searched on its own (case-insensitively) and contributes at
    most one entry. Returns a list of dicts, in input order, whose values are
    the captured strings.
    """
    circ_pattern = re.compile(
        r"Drilling/Circ\s+Rate\s+(\d+)\s+(\d+)\s+PSI\s*@\s*(\d+)\s*SPM\s*([\d.]+)\s+Gal/Stoke\s+([\d.]+)\s+GPM\s+([\d.]+)\s+BPM\s+([\d.]+)\s+DC\s+([\d.]+)\s+DP",
        re.IGNORECASE
    )

    # Output keys, listed in the same order as the regex capture groups.
    field_names = (
        "RateID", "Pressure(PSI)", "SPM", "Gal/Stoke", "GPM", "BPM", "DC", "DP",
    )

    hits = (circ_pattern.search(raw.strip()) for raw in ocr_text.splitlines())
    return [dict(zip(field_names, hit.groups())) for hit in hits if hit]

# ------------------------------------------------------------------
# 6) Main Pipeline
# ------------------------------------------------------------------
def main_pipeline(
    image_path="/dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_12.png",
    output_folder="/dbfs/mnt/data/final_results_pumps",
):
    """
    OCR one image, parse the pumps table and drilling/circ rates, then print
    and save the results.

    Args:
        image_path: Image handed to ``read_image``. Defaults to the section
            image that used to be hard-coded in the body.
        output_folder: Directory (created if missing) that receives
            ``pumps.csv``, ``drilling_circ_rates.csv`` and
            ``pumps_drilling_circ.json``. Defaults to the previous hard-coded
            location.

    Returns:
        None. If ``read_image`` raises ``FileNotFoundError`` the error is
        logged and the function returns without producing any output.
    """
    # 1) Load image; a missing file is reported, not raised.
    try:
        img = read_image(image_path)
    except FileNotFoundError as e:
        logger.error(e)
        return

    # 2) Perform OCR
    ocr_text = perform_ocr(img)
    logger.info(f"OCR Text:\n{ocr_text}\n")

    # 3) Parse the two sections
    pumps = parse_pumps_table(ocr_text)
    circ_rates = parse_drilling_circ_rates(ocr_text)

    # 4) Tabular views for display and CSV export
    df_pumps = pd.DataFrame(pumps)
    df_circ = pd.DataFrame(circ_rates)

    # 5) JSON payload is built from the plain lists, not the DataFrames
    final_data = {
        "Pumps": pumps,
        "DrillingCircRates": circ_rates
    }

    # 6) Print results
    logger.info("=== Pumps DataFrame ===")
    print(df_pumps)
    logger.info("=== Drilling/Circ Rates DataFrame ===")
    print(df_circ)

    logger.info("=== Final JSON ===")
    print(json.dumps(final_data, indent=4))

    # 7) Save to CSV and JSON
    os.makedirs(output_folder, exist_ok=True)

    csv_pumps_path = os.path.join(output_folder, "pumps.csv")
    csv_circ_path = os.path.join(output_folder, "drilling_circ_rates.csv")
    json_path = os.path.join(output_folder, "pumps_drilling_circ.json")

    df_pumps.to_csv(csv_pumps_path, index=False)
    df_circ.to_csv(csv_circ_path, index=False)

    with open(json_path, "w") as f:
        json.dump(final_data, f, indent=4)

    logger.info(f"Pumps CSV saved to: {csv_pumps_path}")
    logger.info(f"Drilling/Circ Rates CSV saved to: {csv_circ_path}")
    logger.info(f"JSON saved to: {json_path}")



# Entry point: run the pump / circ-rate pipeline only when this code executes
# as the top-level script or cell, not when it is imported as a module.
if __name__ == "__main__":
    main_pipeline()
