In [0]:
%run ./init

In [0]:
import cv2
import pytesseract
import numpy as np
import logging
import json
import os
import re
import pandas as pd

# Set up logging
# Configure the root logger to emit INFO and above. Per the logging docs,
# basicConfig does nothing if the root logger already has handlers attached.
logging.basicConfig(level=logging.INFO)
# getLogger() with no name returns the root logger; the functions in this cell
# log through this module-level `logger`.
logger = logging.getLogger()

def copy_dbfs_to_local(dbfs_path, local_path):
    """
    Copies a file from DBFS to a local path using Databricks utilities.
    Only needed if your file is in dbfs:/; otherwise you can read directly from /dbfs/ path.

    Args:
        dbfs_path: Source location passed to ``dbutils.fs.cp`` (e.g. ``dbfs:/mnt/...``).
        local_path: Destination path on the local file system.

    Returns:
        ``local_path``, once the copied file is confirmed to exist.

    Raises:
        FileNotFoundError: If the copy call fails for any reason (the original
            exception is attached as ``__cause__``), or if the call returns
            but nothing exists at ``local_path`` afterwards.
    """
    try:
        # `dbutils` is not defined in this file; it is expected to be provided
        # by the notebook environment.
        dbutils.fs.cp(dbfs_path, f"file:{local_path}")
    except Exception as e:
        # Callers only catch FileNotFoundError, so keep that type but chain the
        # real cause so the traceback shows what actually went wrong.
        raise FileNotFoundError(
            f"Failed to copy from {dbfs_path} to {local_path}. Error: {e}"
        ) from e

    # Verified outside the try so this error is not re-wrapped by the handler above.
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File was not copied properly to {local_path}")
    return local_path

def read_image(image_path):
    """
    Load an image from a local file system path with OpenCV.

    In Databricks the path may also be a FUSE path such as '/dbfs/mnt/...'.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image as returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the path does not exist, or if it exists but
            ``cv2.imread`` returns ``None`` for it.
    """
    if os.path.exists(image_path):
        loaded = cv2.imread(image_path)
        if loaded is not None:
            return loaded
        # The file is there but OpenCV could not produce an image from it.
        raise FileNotFoundError(f"Failed to read image file at {image_path}")
    raise FileNotFoundError(f"Image file not found at {image_path}")

def draw_bounding_box_and_crop(image, top_right_coords, debug_path="/dbfs/mnt/mini-proj-dd/final_ocr_results/debug_top_right.png"):
    """
    Draws the bounding box for the top-right region on a copy of the image,
    saves that copy as a debug image, and returns the cropped region.

    Args:
        image: Source image array; it is never modified (the box is drawn on a copy).
        top_right_coords: Region to crop, as ``(x, y, w, h)``.
        debug_path: Where the annotated debug image is written.

    Returns:
        The slice ``image[y:y+h, x:x+w]``. It can be empty when the region lies
        outside the image; a warning is logged in that case.
    """
    x, y, w, h = top_right_coords

    # Draw a rectangle on a copy for debugging
    debug_img = image.copy()
    cv2.rectangle(debug_img, (x, y), (x + w, y + h), (0, 255, 0), 2)

    # cv2.imwrite does not create missing directories, so make sure the target
    # folder exists. The debug image is best-effort: never fail the crop over it.
    debug_dir = os.path.dirname(debug_path)
    if debug_dir:
        try:
            os.makedirs(debug_dir, exist_ok=True)
        except OSError as exc:
            logger.warning(f"Could not create debug image folder {debug_dir}: {exc}")

    # imwrite reports failure through its return value rather than raising,
    # so only claim success when it actually succeeded.
    if cv2.imwrite(debug_path, debug_img):
        logger.info(f"Debug image with bounding box saved to: {debug_path}")
    else:
        logger.warning(f"Failed to save debug image with bounding box to: {debug_path}")

    # Crop the top-right region
    cropped = image[y:y+h, x:x+w]
    if cropped.size == 0:
        # Slicing past the image bounds silently yields an empty array.
        logger.warning(
            f"Crop region {top_right_coords} lies outside the image; cropped region is empty."
        )
    return cropped

def preprocess_image_for_ocr(image):
    """
    Prepare a BGR image for OCR and return the thresholded result.

    The stages run in this order:
    1) Conversion to grayscale
    2) Histogram equalization
    3) 5x5 Gaussian blur
    4) Gaussian adaptive threshold (binary, max value 255, block size 11, constant 2)
    """
    stage = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    stage = cv2.equalizeHist(stage)
    stage = cv2.GaussianBlur(stage, (5, 5), 0)
    return cv2.adaptiveThreshold(
        stage,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

def perform_ocr(image):
    """
    Run Tesseract on the given image and return the recognized text.

    The engine is invoked with ``--oem 3`` (Tesseract's default engine mode)
    and ``--psm 6`` (treat the image as a single uniform block of text).
    """
    return pytesseract.image_to_string(image, config="--oem 3 --psm 6")

def extract_key_value_from_text(text, expected_keys):
    """
    Extracts key-value pairs from the OCR text using expected keys.

    The text is flattened to a single whitespace-normalized line. For each key
    the value is whatever follows ``<key> :`` up to the label of ANY later
    expected key (or the end of the text). Stopping at any later key, not just
    the immediate next one, keeps a label the OCR missed from causing the
    previous value to swallow the rest of the text. Matching is case-insensitive.

    Args:
        text: Raw OCR output; ``None`` is treated like an empty string.
        expected_keys: Sequence of key labels, in the order they are expected
            to appear in the text.

    Returns:
        Dict mapping every expected key to its stripped value, or ``None``
        when the key is absent or its value is empty.
    """
    # Normalize text: collapse every run of whitespace (including newlines)
    combined = " ".join((text or "").split())

    result = {}
    for i, key in enumerate(expected_keys):
        later_keys = expected_keys[i + 1:]
        if later_keys:
            alternatives = "|".join(re.escape(k) for k in later_keys)
            pattern = rf'{re.escape(key)}\s*:\s*(.*?)(?=\s*(?:{alternatives})\s*:|$)'
        else:
            # Last key: everything up to the end of the text is its value.
            pattern = rf'{re.escape(key)}\s*:\s*(.*)'
        match = re.search(pattern, combined, re.IGNORECASE)
        value = match.group(1).strip() if match else ""
        result[key] = value or None
    return result

def clean_report_num(report_num_str):
    """
    Removes trailing punctuation from the extracted 'Report Num' if present.

    Only strings made of digits followed by nothing but punctuation or
    whitespace are cleaned; anything else (e.g. '11.5', 'A11.') is returned
    unchanged so a non-numeric OCR read is never mangled.

    Examples: '11.' -> '11', '11,' -> '11', '11 .:' -> '11'

    Args:
        report_num_str: The extracted value, or a falsy value (``None``/``''``).

    Returns:
        The digits alone when the input matches the pattern above; otherwise
        the input unchanged (falsy inputs are returned as-is).
    """
    if not report_num_str:
        return report_num_str
    # [\W_] covers every non-alphanumeric character (punctuation, whitespace,
    # underscore), so any trailing OCR noise is dropped, not just a single '.'.
    match = re.fullmatch(r'\s*(\d+)[\W_]*', report_num_str)
    if match:
        return match.group(1)
    return report_num_str

def main_pipeline():
    """
    Run the header OCR flow for page 1, section 1.

    Copies the section image from DBFS to a local scratch path, crops the
    top-right region, preprocesses and OCRs it, extracts 'Report Date',
    'Report Num' and 'Rig', then prints the result as JSON and writes it to a
    Key/Value CSV. Returns early (after logging the error) if the image cannot
    be loaded.
    """
    # To read directly instead of copying, use
    # "/dbfs/mnt/mini-proj-dd/cropped_sections/page_1_section_1.png".
    source_uri = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_1.png"
    scratch_path = "/tmp/page_1_section_1.png"

    try:
        copy_dbfs_to_local(source_uri, scratch_path)
        page_img = read_image(scratch_path)
        logger.info("Image loaded successfully.")
    except FileNotFoundError as load_err:
        logger.error(load_err)
        return

    # (x, y, w, h) of the top-right area holding the three header fields;
    # example values, tune as needed.
    header_box = (1600, 0, 950, 185)
    header_crop = draw_bounding_box_and_crop(
        page_img,
        header_box,
        debug_path="/dbfs/mnt/mini-proj-dd/final_ocr_results/debug_top_right.png",
    )

    binarized = preprocess_image_for_ocr(header_crop)
    logger.info("Top-right image preprocessed for OCR.")

    raw_text = perform_ocr(binarized)
    logger.info("OCR extraction complete.")
    logger.info(f"OCR Text:\n{raw_text}")

    fields = extract_key_value_from_text(
        raw_text, ["Report Date", "Report Num", "Rig"]
    )

    # Drop trailing punctuation from the report number when one was found.
    report_num = fields["Report Num"]
    if report_num is not None:
        fields["Report Num"] = clean_report_num(report_num)

    # Same JSON goes to both the log and stdout.
    report_json = json.dumps({"DAILY DRILLING REPORT": fields}, indent=4)
    logger.info(report_json)
    print(report_json)

    # Persist the extracted pairs as a two-column CSV.
    table = pd.DataFrame(list(fields.items()), columns=["Key", "Value"])
    results_dir = "/dbfs/mnt/mini-proj-dd/final_ocr_results"
    os.makedirs(results_dir, exist_ok=True)
    csv_path = os.path.join(results_dir, "page_1_section_ddr_ocr.csv")
    table.to_csv(csv_path, index=False)
    logger.info(f"Final DataFrame saved to {csv_path}")


if __name__ == "__main__":
    main_pipeline()


In [0]:
import os
import re
import cv2
import pytesseract
import pandas as pd
import logging
import json

# --------------------------------------------------------
# 1) Minimal Logger Configuration
# --------------------------------------------------------
# Named logger for this cell, emitting INFO and above.
logger = logging.getLogger("WellJobInfoExtractor")
logger.setLevel(logging.INFO)
# Attach the stream handler only when the logger has none yet, so re-running
# the cell does not stack handlers and print every message multiple times.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Output format: "<LEVEL>: <message>"
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)
# NOTE(review): propagation is left at its default, so records presumably also
# reach any handlers on the root logger (e.g. from logging.basicConfig) — confirm
# that duplicate output is acceptable.

# --------------------------------------------------------
# 2) Read Image from DBFS/Local Path
# --------------------------------------------------------
def read_cropped_section_image(section_path):
    """
    Load a cropped section image given a DBFS URI or a local path.

    Path resolution:
      * ``dbfs:/...``  -> ``/dbfs/...`` (any DBFS location, not only ``/mnt/``)
      * ``/mnt/...``   -> ``/dbfs/mnt/...``
      * anything else is used as-is.

    Args:
        section_path: ``dbfs:`` URI or file system path of the image.

    Returns:
        The image as returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the resolved path does not exist, or if
            ``cv2.imread`` returns ``None`` for it.
    """
    local_path = section_path
    if local_path.startswith("dbfs:"):
        # Strip only the leading scheme (str.replace would also hit later
        # occurrences) and always map onto the /dbfs mount point.
        local_path = "/dbfs/" + local_path[len("dbfs:"):].lstrip("/")
    elif local_path.startswith("/mnt/"):
        local_path = "/dbfs" + local_path
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        raise FileNotFoundError(f"OpenCV failed to load image: {local_path}")
    return img

# --------------------------------------------------------
# 3) OCR Extraction (Minimal Processing)
# --------------------------------------------------------
def perform_ocr(img):
    """
    Convert a BGR image to grayscale and return the text Tesseract reads from it.

    Tesseract runs with ``--psm 6``, i.e. it assumes a uniform block of text.
    """
    return pytesseract.image_to_string(
        cv2.cvtColor(img, cv2.COLOR_BGR2GRAY),
        config='--psm 6',
    )

# --------------------------------------------------------
# 4) Expected Keys Extraction from Combined OCR Text
# --------------------------------------------------------
def extract_key_value_from_text(text, expected_keys):
    """
    Combines all OCR text lines into a single string and, using the expected keys
    (in order), extracts each value as the text between the current key and the
    nearest following key label.

    The value stops at the label of ANY later expected key, or at the end of the
    text, so a label the OCR failed to read no longer wipes out the value of the
    key before it. Matching is case-insensitive.

    Args:
        text: Raw OCR output; ``None`` is treated like an empty string.
        expected_keys: Sequence of key labels in their expected order.

    Returns:
        Dict mapping every expected key to its stripped value. If a key is not
        found, or has no value, an empty string is returned for it.
    """
    # Combine all lines into one string with normalized whitespace.
    combined = " ".join((text or "").split())
    escaped = [re.escape(k) for k in expected_keys]

    result = {}
    for i, key in enumerate(expected_keys):
        following = escaped[i + 1:]
        if following:
            stop = r'(?=\s*(?:' + '|'.join(following) + r')\s*:|$)'
            pattern = escaped[i] + r'\s*:\s*(.*?)' + stop
        else:
            # Last key: its value runs to the end of the text.
            pattern = escaped[i] + r'\s*:\s*(.*)$'
        match = re.search(pattern, combined, re.IGNORECASE)
        result[key] = match.group(1).strip() if match else ""
    return result

# --------------------------------------------------------
# 5) Main Pipeline
# --------------------------------------------------------
def main_pipeline():
    """
    Run the WELL/JOB INFORMATION OCR flow for page 1, section 2.

    Loads the cropped section image, OCRs it, extracts the expected keys,
    prints the result as JSON, shows it as a table and writes it to a
    Key/Value CSV. Returns early (after logging the error) if the image cannot
    be loaded.
    """
    section_uri = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png"
    try:
        section_img = read_cropped_section_image(section_uri)
        logger.info("Image loaded successfully.")
    except FileNotFoundError as load_err:
        logger.error(load_err)
        return

    raw_text = perform_ocr(section_img)
    logger.info("OCR extraction complete.")
    logger.info(f"OCR Text:\n{raw_text}")

    # Labels to extract, listed in the order the extraction should use.
    field_labels = [
        "Well Name", "Job Name", "Supervisor(s)", "Field", "Sec/Twn/Rng",
        "Phone", "AFE #", "API #", "Email", "Contractor", "Elevation", "RKB",
        "Spud Date", "Days from Spud", "Days on Loc", "MD/TVD",
        "24 Hr Footage", "Present Operations", "Activity Planned",
    ]

    fields = extract_key_value_from_text(raw_text, field_labels)
    print(json.dumps({"WELL/JOB INFORMATION": fields}, indent=4))

    table = pd.DataFrame(list(fields.items()), columns=["Key", "Value"])
    # `display` is not defined in this file; fall back to print when it is missing.
    try:
        display(table)
    except NameError:
        print(table)

    # Translate the DBFS URI into its /dbfs path for the os-level file APIs.
    results_dir = "dbfs:/mnt/mini-proj-dd/final_ocr_results".replace("dbfs:", "/dbfs")
    os.makedirs(results_dir, exist_ok=True)
    csv_path = os.path.join(results_dir, "page_1_section_2_ocr.csv")
    table.to_csv(csv_path, index=False)
    logger.info(f"Final DataFrame saved to {csv_path}")


if __name__ == "__main__":
    main_pipeline()


In [0]:
import os
import re
import cv2
import pytesseract
import pandas as pd
import logging
import json

# --------------------------------------------------------
# 1) Minimal Logger Configuration
# --------------------------------------------------------
# Named logger for this cell, emitting INFO and above. logging.getLogger
# returns the same logger object for the same name on every call.
logger = logging.getLogger("WellJobInfoExtractor")
logger.setLevel(logging.INFO)
# Attach the stream handler only when the logger has none yet, so a handler
# added earlier under this name is not duplicated.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Output format: "<LEVEL>: <message>"
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)

# --------------------------------------------------------
# 2) Read Image from DBFS/Local Path
# --------------------------------------------------------
def read_cropped_section_image(section_path):
    """
    Load a cropped section image given a DBFS URI or a local path.

    A ``dbfs:/...`` URI is mapped to ``/dbfs/...`` whatever DBFS folder it
    points at, and a bare ``/mnt/...`` path is mapped to ``/dbfs/mnt/...``.
    Every other path is used unchanged.

    Args:
        section_path: ``dbfs:`` URI or file system path of the image.

    Returns:
        The image as returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the resolved path does not exist, or if
            ``cv2.imread`` returns ``None`` for it.
    """
    scheme = "dbfs:"
    if section_path.startswith(scheme):
        # Remove just the leading scheme, then anchor the rest under /dbfs so
        # locations outside /mnt/ resolve correctly too.
        local_path = "/dbfs/" + section_path[len(scheme):].lstrip("/")
    elif section_path.startswith("/mnt/"):
        local_path = "/dbfs" + section_path
    else:
        local_path = section_path

    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        raise FileNotFoundError(f"OpenCV failed to load image: {local_path}")
    return img

# --------------------------------------------------------
# 3) OCR Extraction (Minimal Processing)
# --------------------------------------------------------
def perform_ocr(img):
    """
    Return the text Tesseract reads from the grayscale version of a BGR image.

    Uses page segmentation mode 6 (``--psm 6``).
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(grayscale, config='--psm 6')

# --------------------------------------------------------
# 4) Extract BOP Information
# --------------------------------------------------------
def extract_bop_info(text):
    """
    Extract the three BOP dates from OCR text.

    Each label ('Last BOP Test Date', 'Last BOP Drill', 'Next BOP Test') is
    matched case-insensitively, followed by ':' and a date shaped like
    ``d/m/yy`` (1-2 digits / 1-2 digits / 2-4 digits). Any run of whitespace,
    including a line break, is accepted between the words of a label, because
    OCR output often wraps or double-spaces labels.

    Args:
        text: Raw OCR output; ``None`` is treated like an empty string.

    Returns:
        Dict with one entry per label holding the date string as it appears in
        the text, or ``""`` when the label/date pair is not found.
    """
    date_regex = r"(\d{1,2}/\d{1,2}/\d{2,4})"
    labels = ("Last BOP Test Date", "Last BOP Drill", "Next BOP Test")
    haystack = text or ""

    result = {}
    for label in labels:
        # Join the label's words with \s+ so line breaks and double spaces
        # inside the label do not defeat the match.
        label_regex = r"\s+".join(re.escape(word) for word in label.split())
        match = re.search(label_regex + r"\s*:\s*" + date_regex, haystack, re.IGNORECASE)
        result[label] = match.group(1) if match else ""
    return result

# --------------------------------------------------------
# 5) Main Pipeline
# --------------------------------------------------------
def main_pipeline():
    """
    Run the BOP OCR flow for page 1, section 8.

    Loads the cropped section image, OCRs it, extracts the BOP dates, prints
    them as JSON, shows them as a table and writes them to a Key/Value CSV.
    Returns early (after logging the error) if the image cannot be loaded.
    """
    section_uri = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png"
    try:
        section_img = read_cropped_section_image(section_uri)
        logger.info("Image loaded successfully.")
    except FileNotFoundError as load_err:
        logger.error(load_err)
        return

    raw_text = perform_ocr(section_img)
    logger.info("OCR extraction complete.")
    logger.info(f"OCR Text:\n{raw_text}")

    bop_fields = extract_bop_info(raw_text)
    print(json.dumps({"BOP": bop_fields}, indent=4))

    table = pd.DataFrame(list(bop_fields.items()), columns=["Key", "Value"])
    # `display` is not defined in this file; fall back to print when it is missing.
    try:
        display(table)
    except NameError:
        print(table)

    # Translate the DBFS URI into its /dbfs path for the os-level file APIs.
    results_dir = "dbfs:/mnt/mini-proj-dd/final_ocr_results".replace("dbfs:", "/dbfs")
    os.makedirs(results_dir, exist_ok=True)
    csv_path = os.path.join(results_dir, "page_1_section_8_bop_ocr.csv")
    table.to_csv(csv_path, index=False)
    logger.info(f"Final DataFrame saved to {csv_path}")


if __name__ == "__main__":
    main_pipeline()


In [0]:
import re
import json
import pandas as pd
import os
import os
import re
import cv2
import pytesseract
import pandas as pd
import logging
import json

# --------------------------------------------------------
# 1) Minimal Logger Configuration
# --------------------------------------------------------
# Named logger for this cell, emitting INFO and above. logging.getLogger
# returns the same logger object for the same name on every call.
logger = logging.getLogger("WellJobInfoExtractor")
logger.setLevel(logging.INFO)
# Attach the stream handler only when the logger has none yet, so a handler
# added earlier under this name is not duplicated.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Output format: "<LEVEL>: <message>"
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)

# --------------------------------------------------------
# 2) Read Image from DBFS/Local Path
# --------------------------------------------------------
def read_cropped_section_image(section_path):
    """
    Load a cropped section image given a DBFS URI or a local path.

    ``dbfs:/<anything>`` resolves to ``/dbfs/<anything>`` and a bare
    ``/mnt/...`` path resolves to ``/dbfs/mnt/...``; other paths are used
    exactly as given.

    Args:
        section_path: ``dbfs:`` URI or file system path of the image.

    Returns:
        The image as returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the resolved path does not exist, or if
            ``cv2.imread`` returns ``None`` for it.
    """
    local_path = section_path
    if local_path.startswith("dbfs:"):
        # Only the leading scheme is dropped; the remainder is always placed
        # under /dbfs, including DBFS folders other than /mnt/.
        remainder = local_path[len("dbfs:"):]
        local_path = "/dbfs/" + remainder.lstrip("/")
    elif local_path.startswith("/mnt/"):
        local_path = "/dbfs" + local_path

    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        raise FileNotFoundError(f"OpenCV failed to load image: {local_path}")
    return img

# --------------------------------------------------------
# 3) OCR Extraction (Minimal Processing)
# --------------------------------------------------------
def perform_ocr(img):
    """
    OCR a BGR image: convert it to grayscale, then run Tesseract with ``--psm 6``.

    Returns the recognized text.
    """
    ocr_input = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    recognized = pytesseract.image_to_string(ocr_input, config='--psm 6')
    return recognized

    
def extract_key_value_from_text(text, expected_keys):
    """
    Extracts key-value pairs from the text, ensuring that keys with empty values
    are correctly detected instead of capturing the next key as their value.

    A value ends at the label of ANY later expected key (or at the end of the
    text), so a label the OCR failed to read does not cause the previous value
    to absorb the remaining labels and values. Matching is case-insensitive.

    Args:
        text: Raw OCR output; ``None`` is treated like an empty string.
        expected_keys: Sequence of key labels in their expected order.

    Returns:
        Dict mapping every expected key to its stripped value, or ``None``
        when the key is missing or its value is empty.
    """
    combined = " ".join((text or "").split())  # Normalize whitespace

    def _pattern_for(index):
        """Build the regex for the key at *index*, bounded by all later keys."""
        head = re.escape(expected_keys[index]) + r'\s*:\s*'
        later = [re.escape(k) for k in expected_keys[index + 1:]]
        if not later:
            # Last key: take everything up to the end of the text.
            return head + r'(.*)'
        return head + r'(.*?)(?=\s*(?:' + '|'.join(later) + r')\s*:|$)'

    result = {}
    for i, key in enumerate(expected_keys):
        match = re.search(_pattern_for(i), combined, re.IGNORECASE)
        value = match.group(1).strip() if match else ""
        result[key] = value or None  # None for both empty and missing values
    return result

# --------------------------------------------------------
# 5) Main Pipeline for Cost Data Extraction
# --------------------------------------------------------
def main_pipeline():
    """
    Run the COST DATA OCR flow for page 1, section 13.

    Loads the cropped section image, OCRs it, extracts the expected cost keys,
    logs and prints them as JSON, shows them as a table and writes them to a
    Key/Value CSV. Returns early (after logging the error) if the image cannot
    be loaded.
    """
    section_uri = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png"

    try:
        section_img = read_cropped_section_image(section_uri)
        logger.info("Image loaded successfully.")
    except FileNotFoundError as load_err:
        logger.error(load_err)
        return

    raw_text = perform_ocr(section_img)
    logger.info("OCR extraction complete.")
    logger.info(f"OCR Text:\n{raw_text}")

    cost_labels = [
        "Drilling AFE Amount", "Daily Drilling Cost", "Cumulative Drilling Cost",
        "Cumulative Well Cost", "Daily Mud Cost", "Cumulative Mud Cost",
    ]
    costs = extract_key_value_from_text(raw_text, cost_labels)

    # Same JSON goes to both the log and stdout.
    costs_json = json.dumps({"COST DATA": costs}, indent=4)
    logger.info(costs_json)
    print(costs_json)

    table = pd.DataFrame(list(costs.items()), columns=["Key", "Value"])
    # `display` is not defined in this file; fall back to print when it is missing.
    try:
        display(table)
    except NameError:
        print(table)

    results_dir = "/dbfs/mnt/mini-proj-dd/final_ocr_results"
    os.makedirs(results_dir, exist_ok=True)
    csv_path = os.path.join(results_dir, "page_1_section_cost_data_ocr.csv")
    table.to_csv(csv_path, index=False)
    logger.info(f"Final DataFrame saved to {csv_path}")


if __name__ == "__main__":
    main_pipeline()
