In [0]:
#%run ./init

In [0]:
import os
import re
import cv2
import pytesseract
import pandas as pd
import logging
import json

# --------------------------------------------------------
# 1) Minimal Logger Configuration
# --------------------------------------------------------
# Named logger for this notebook; records at INFO level and above are emitted.
logger = logging.getLogger("WellJobInfoExtractor")
logger.setLevel(logging.INFO)
# Attach the stream handler only once so that re-running the cell does not
# stack additional handlers on the same logger.
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)
# Keep records away from the root logger's handlers; otherwise each message is
# printed a second time in the root format ("INFO:WellJobInfoExtractor:...").
logger.propagate = False

# --------------------------------------------------------
# 2) Read Image from DBFS/Local Path
# --------------------------------------------------------
def read_cropped_section_image(section_path):
    """Load an image given a ``dbfs:`` URI or a local filesystem path.

    A leading ``dbfs:`` scheme is replaced by the ``/dbfs`` prefix, and a bare
    path starting with ``/mnt/`` is prefixed with ``/dbfs`` as well. Any other
    path is used unchanged.

    Args:
        section_path: ``dbfs:/...`` URI, ``/mnt/...`` path, or local path.

    Returns:
        The image object returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the resolved path does not exist, or if
            ``cv2.imread`` returns ``None`` for it.
    """
    scheme = "dbfs:"
    local_path = section_path
    if local_path.startswith(scheme):
        # Strip only the leading scheme (not later occurrences) and map every
        # DBFS location, not just /mnt, under the /dbfs prefix.
        local_path = "/dbfs" + local_path[len(scheme):]
    elif local_path.startswith("/mnt/"):
        local_path = "/dbfs" + local_path
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        # Kept as FileNotFoundError because callers catch exactly that type.
        raise FileNotFoundError(f"OpenCV failed to load image: {local_path}")
    return img

# --------------------------------------------------------
# 3) OCR Extraction (Minimal Processing)
# --------------------------------------------------------
def perform_ocr(img):
    """Return the text Tesseract reads from ``img``.

    The image is converted from BGR to grayscale and then passed to
    ``pytesseract.image_to_string`` with page segmentation mode 6, which
    assumes a uniform block of text.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(grayscale, config='--psm 6')

# --------------------------------------------------------
# 4) Expected Keys Extraction from Combined OCR Text
# --------------------------------------------------------
def extract_key_value_from_text(text, expected_keys):
    """Extract ``key: value`` pairs for the expected keys from OCR text.

    All OCR lines are collapsed into one whitespace-normalized string. For
    each expected key, the value is the text after ``key:`` up to the next
    occurrence of any other expected key followed by a colon, or up to the
    end of the text. Matching is case-insensitive.

    Bounding the value by every other key, not only the one listed next,
    means a key missing from the OCR text no longer blanks out the value of
    the key that precedes it.

    Args:
        text: Raw OCR text, possibly spanning several lines.
        expected_keys: Labels to look for, in the desired output order.

    Returns:
        dict mapping every expected key to its stripped value. Keys that are
        not found, or that have no value, map to an empty string.
    """
    # Collapse all runs of whitespace (including line breaks) to single spaces.
    combined = " ".join(text.split())
    result = {}
    for key in expected_keys:
        others = [re.escape(k) for k in expected_keys if k != key]
        if others:
            # Stop before any other "key:" label, or at the end of the text.
            boundary = r'(?=\s*(?:' + "|".join(others) + r')\s*:|\s*$)'
        else:
            boundary = r'(?=\s*$)'
        pattern = re.escape(key) + r'\s*:\s*(.*?)' + boundary
        match = re.search(pattern, combined, re.IGNORECASE)
        result[key] = match.group(1).strip() if match else ""
    return result

# --------------------------------------------------------
# 5) Main Pipeline
# --------------------------------------------------------
def main_pipeline():
    """Run OCR on the well/job information crop and save the parsed fields.

    Steps: load the cropped section image, run OCR on it, extract the
    expected key/value pairs, print them as JSON, show them as a DataFrame,
    and write that DataFrame to a CSV file. If the image cannot be loaded,
    the error is logged and the function returns without doing anything else.
    """
    # Labels expected in this section, in the desired output order.
    expected_keys = [
        "Well Name",
        "Job Name",
        "Supervisor(s)",
        "Field",
        "Sec/Twn/Rng",
        "Phone",
        "AFE #",
        "API #",
        "Email",
        "Contractor",
        "Elevation",
        "RKB",
        "Spud Date",
        "Days from Spud",
        "Days on Loc",
        "MD/TVD",
        "24 Hr Footage",
        "Present Operations",
        "Activity Planned"
    ]

    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_2.png"
    try:
        img = read_cropped_section_image(section_path)
    except FileNotFoundError as e:
        logger.error(e)
        return
    logger.info("Image loaded successfully.")

    ocr_text = perform_ocr(img)
    logger.info("OCR extraction complete.")
    logger.info(f"OCR Text:\n{ocr_text}")

    extracted = extract_key_value_from_text(ocr_text, expected_keys)
    print(json.dumps({"WELL/JOB INFORMATION": extracted}, indent=4))

    rows = list(extracted.items())
    df_final = pd.DataFrame(rows, columns=["Key", "Value"])
    try:
        # `display` is only defined in notebook environments.
        display(df_final)
    except NameError:
        print(df_final)

    # Translate the DBFS URI of the results folder into its local /dbfs path.
    local_output_folder = "dbfs:/mnt/mini-proj-dd/final_ocr_results".replace("dbfs:", "/dbfs")
    os.makedirs(local_output_folder, exist_ok=True)
    output_file = os.path.join(local_output_folder, "page_1_section_2_ocr.csv")
    df_final.to_csv(output_file, index=False)
    logger.info(f"Final DataFrame saved to {output_file}")

# Run the pipeline only when this code executes as the main program.
if __name__ == "__main__":
    main_pipeline()


INFO: Image loaded successfully.
INFO: OCR extraction complete.
INFO: OCR Text:
Well Name: Ross Fee 4371-31-7-15 MH Job Name: Drilling Supervisor(s): CHAD MILLER / ED COOLEY
Field: XBE Sec/Twn/Rng: 31, 43N, 71W Phone: 307-315-1908

AFE #: 240098 API #: 49-005-78911 Email: cyclone39@aec-denver.com
Contractor: Elevation: 4913.5 RKB: 27.5
Spud Date: 6/4/2024 Days from Spud: 7.67 Days on Loc: 34

MD/TVD: 20537 FT/10719 FT 24 Hr Footage: 3068
Present Operations: DRILLING LATERAL @ 20,537'.
Activity Planned: DRILL LATERAL SECTION TO PLANNED TD @ ~21,226', PUMP TD SWEEPS & CHC, SOOH & L/D DRILL PIPE.



{
    "WELL/JOB INFORMATION": {
        "Well Name": "Ross Fee 4371-31-7-15 MH",
        "Job Name": "Drilling",
        "Supervisor(s)": "CHAD MILLER / ED COOLEY",
        "Field": "XBE",
        "Sec/Twn/Rng": "31, 43N, 71W",
        "Phone": "307-315-1908",
        "AFE #": "240098",
        "API #": "49-005-78911",
        "Email": "cyclone39@aec-denver.com",
        "Contractor": "",
        "Elevation": "4913.5",
        "RKB": "27.5",
        "Spud Date": "6/4/2024",
        "Days from Spud": "7.67",
        "Days on Loc": "34",
        "MD/TVD": "20537 FT/10719 FT",
        "24 Hr Footage": "3068",
        "Present Operations": "DRILLING LATERAL @ 20,537'.",
        "Activity Planned": "DRILL LATERAL SECTION TO PLANNED TD @ ~21,226', PUMP TD SWEEPS & CHC, SOOH & L/D DRILL PIPE."
    }
}


Key,Value
Well Name,Ross Fee 4371-31-7-15 MH
Job Name,Drilling
Supervisor(s),CHAD MILLER / ED COOLEY
Field,XBE
Sec/Twn/Rng,"31, 43N, 71W"
Phone,307-315-1908
AFE #,240098
API #,49-005-78911
Email,cyclone39@aec-denver.com
Contractor,


INFO: Final DataFrame saved to /dbfs/mnt/mini-proj-dd/final_ocr_results/page_1_section_2_ocr.csv
INFO:WellJobInfoExtractor:Final DataFrame saved to /dbfs/mnt/mini-proj-dd/final_ocr_results/page_1_section_2_ocr.csv


In [0]:
import os
import re
import cv2
import pytesseract
import pandas as pd
import logging
import json

# --------------------------------------------------------
# 1) Minimal Logger Configuration
# --------------------------------------------------------
# Named logger for this notebook; records at INFO level and above are emitted.
logger = logging.getLogger("WellJobInfoExtractor")
logger.setLevel(logging.INFO)
# Attach the stream handler only once so that re-running the cell does not
# stack additional handlers on the same logger.
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(handler)
# Keep records away from the root logger's handlers; otherwise each message is
# printed a second time in the root format ("INFO:WellJobInfoExtractor:...").
logger.propagate = False

# --------------------------------------------------------
# 2) Read Image from DBFS/Local Path
# --------------------------------------------------------
def read_cropped_section_image(section_path):
    """Load an image given a ``dbfs:`` URI or a local filesystem path.

    A leading ``dbfs:`` scheme is replaced by the ``/dbfs`` prefix, and a bare
    path starting with ``/mnt/`` is prefixed with ``/dbfs`` as well. Any other
    path is used unchanged.

    Args:
        section_path: ``dbfs:/...`` URI, ``/mnt/...`` path, or local path.

    Returns:
        The image object returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the resolved path does not exist, or if
            ``cv2.imread`` returns ``None`` for it.
    """
    scheme = "dbfs:"
    local_path = section_path
    if local_path.startswith(scheme):
        # Strip only the leading scheme (not later occurrences) and map every
        # DBFS location, not just /mnt, under the /dbfs prefix.
        local_path = "/dbfs" + local_path[len(scheme):]
    elif local_path.startswith("/mnt/"):
        local_path = "/dbfs" + local_path
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    img = cv2.imread(local_path)
    if img is None:
        # Kept as FileNotFoundError because callers catch exactly that type.
        raise FileNotFoundError(f"OpenCV failed to load image: {local_path}")
    return img

# --------------------------------------------------------
# 3) OCR Extraction (Minimal Processing)
# --------------------------------------------------------
def perform_ocr(img):
    """Return the text Tesseract reads from ``img``.

    The image is converted from BGR to grayscale and then passed to
    ``pytesseract.image_to_string`` with the ``--psm 6`` configuration.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(grayscale, config='--psm 6')

# --------------------------------------------------------
# 4) Extract BOP Information
# --------------------------------------------------------
def extract_bop_info(text):
    """Pull the three BOP dates out of OCR text.

    Each field is matched case-insensitively as ``<label> : <date>``, where
    the date has the form ``D/M/Y`` with one or two digits for the first two
    parts and two to four digits for the last.

    Args:
        text: OCR text to search.

    Returns:
        dict with the keys "Last BOP Test Date", "Last BOP Drill" and
        "Next BOP Test"; each maps to the matched date string, or to an
        empty string when the field is not found.
    """
    date_regex_by_field = {
        "Last BOP Test Date": r"Last BOP Test Date\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Last BOP Drill": r"Last BOP Drill\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})",
        "Next BOP Test": r"Next BOP Test\s*:\s*(\d{1,2}/\d{1,2}/\d{2,4})"
    }

    def first_date(regex):
        # Captured date of the first match, or "" when nothing matches.
        hit = re.search(regex, text, re.IGNORECASE)
        if hit is None:
            return ""
        return hit.group(1)

    return {field: first_date(regex) for field, regex in date_regex_by_field.items()}

# --------------------------------------------------------
# 5) Main Pipeline
# --------------------------------------------------------
def main_pipeline():
    """Run OCR on the BOP crop and save the parsed dates.

    Steps: load the cropped section image, run OCR on it, extract the BOP
    fields, print them as JSON, show them as a DataFrame, and write that
    DataFrame to a CSV file. If the image cannot be loaded, the error is
    logged and the function returns without doing anything else.
    """
    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_8.png"
    try:
        img = read_cropped_section_image(section_path)
    except FileNotFoundError as e:
        logger.error(e)
        return
    logger.info("Image loaded successfully.")

    ocr_text = perform_ocr(img)
    logger.info("OCR extraction complete.")
    logger.info(f"OCR Text:\n{ocr_text}")

    extracted_bop = extract_bop_info(ocr_text)
    print(json.dumps({"BOP": extracted_bop}, indent=4))

    rows = list(extracted_bop.items())
    df_final = pd.DataFrame(rows, columns=["Key", "Value"])
    try:
        # `display` is only defined in notebook environments.
        display(df_final)
    except NameError:
        print(df_final)

    # Translate the DBFS URI of the results folder into its local /dbfs path.
    local_output_folder = "dbfs:/mnt/mini-proj-dd/final_ocr_results".replace("dbfs:", "/dbfs")
    os.makedirs(local_output_folder, exist_ok=True)
    output_file = os.path.join(local_output_folder, "page_1_section_8_bop_ocr.csv")
    df_final.to_csv(output_file, index=False)
    logger.info(f"Final DataFrame saved to {output_file}")

# Run the pipeline only when this code executes as the main program.
if __name__ == "__main__":
    main_pipeline()


INFO: Image loaded successfully.
INFO:WellJobInfoExtractor:Image loaded successfully.
INFO: OCR extraction complete.
INFO:WellJobInfoExtractor:OCR extraction complete.
INFO: OCR Text:
Last BOP Test Date: 6/30/24 Last BOP Drill: 7/3/2024 Next BOP Test: 7/25/24

INFO:WellJobInfoExtractor:OCR Text:
Last BOP Test Date: 6/30/24 Last BOP Drill: 7/3/2024 Next BOP Test: 7/25/24



{
    "BOP": {
        "Last BOP Test Date": "6/30/24",
        "Last BOP Drill": "7/3/2024",
        "Next BOP Test": "7/25/24"
    }
}


Key,Value
Last BOP Test Date,6/30/24
Last BOP Drill,7/3/2024
Next BOP Test,7/25/24


INFO: Final DataFrame saved to /dbfs/mnt/mini-proj-dd/final_ocr_results/page_1_section_8_bop_ocr.csv
INFO:WellJobInfoExtractor:Final DataFrame saved to /dbfs/mnt/mini-proj-dd/final_ocr_results/page_1_section_8_bop_ocr.csv


In [0]:
import re
import json
import pandas as pd
import os

def extract_key_value_from_text(text, expected_keys):
    """Extract ``key: value`` pairs for the expected keys from OCR text.

    All OCR lines are collapsed into one whitespace-normalized string. For
    each expected key, the value is the text after ``key:`` up to the next
    occurrence of any other expected key followed by a colon, or up to the
    end of the text. Matching is case-insensitive.

    Bounding the value by every other key, not only the one listed next,
    keeps a value from swallowing the remaining keys and their values when
    the key listed next is absent from the OCR text.

    Args:
        text: Raw OCR text, possibly spanning several lines.
        expected_keys: Labels to look for, in the desired output order.

    Returns:
        dict mapping every expected key to its stripped value. Keys that are
        not found, or that have no value, map to ``None``.
    """
    # Collapse all runs of whitespace (including line breaks) to single spaces.
    combined = " ".join(text.split())

    result = {}
    for key in expected_keys:
        others = [re.escape(k) for k in expected_keys if k != key]
        if others:
            # Stop before any other "key:" label, or at the end of the text.
            boundary = r'(?=\s*(?:' + "|".join(others) + r')\s*:|\s*$)'
        else:
            boundary = r'(?=\s*$)'
        pattern = re.escape(key) + r'\s*:\s*(.*?)' + boundary

        match = re.search(pattern, combined, re.IGNORECASE)
        value = match.group(1).strip() if match else ""
        # Missing keys and empty values are both reported as None.
        result[key] = value if value else None

    return result

# --------------------------------------------------------
# 5) Main Pipeline for Cost Data Extraction
# --------------------------------------------------------
def main_pipeline():
    """Run OCR on the cost data crop and save the parsed amounts.

    Steps: load the cropped section image, run OCR on it, extract the
    expected cost fields, log and print them as JSON, show them as a
    DataFrame, and write that DataFrame to a CSV file. If the image cannot
    be loaded, the error is logged and the function returns without doing
    anything else.
    """
    # Labels expected in this section, in the desired output order.
    expected_keys = [
        "Drilling AFE Amount",
        "Daily Drilling Cost",
        "Cumulative Drilling Cost",
        "Cumulative Well Cost",
        "Daily Mud Cost",
        "Cumulative Mud Cost"
    ]

    section_path = "dbfs:/mnt/mini-proj-dd/cropped_sections/page_1_section_13.png"
    try:
        img = read_cropped_section_image(section_path)
    except FileNotFoundError as e:
        logger.error(e)
        return
    logger.info("Image loaded successfully.")

    ocr_text = perform_ocr(img)
    logger.info("OCR extraction complete.")
    logger.info(f"OCR Text:\n{ocr_text}")

    extracted = extract_key_value_from_text(ocr_text, expected_keys)
    # The same JSON rendering goes to both the log and stdout.
    report = json.dumps({"COST DATA": extracted}, indent=4)
    logger.info(report)
    print(report)

    rows = list(extracted.items())
    df_final = pd.DataFrame(rows, columns=["Key", "Value"])
    try:
        # `display` is only defined in notebook environments.
        display(df_final)
    except NameError:
        print(df_final)

    output_folder = "/dbfs/mnt/mini-proj-dd/final_ocr_results"
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, "page_1_section_cost_data_ocr.csv")
    df_final.to_csv(output_file, index=False)
    logger.info(f"Final DataFrame saved to {output_file}")

# Run the pipeline only when this code executes as the main program.
if __name__ == "__main__":
    main_pipeline()


INFO: Image loaded successfully.
INFO:WellJobInfoExtractor:Image loaded successfully.
INFO: OCR extraction complete.
INFO:WellJobInfoExtractor:OCR extraction complete.
INFO: OCR Text:
COST DATA
Drilling AFE Amount: Daily Drilling Cost: $167,006.63 Cumulative Drilling Cost: $1,747,745 Cumulative Well Cost: $1,914,752
Daily Mud Cost: $54,185.80 Cumulative Mud Cost: $299,370.66

INFO:WellJobInfoExtractor:OCR Text:
COST DATA
Drilling AFE Amount: Daily Drilling Cost: $167,006.63 Cumulative Drilling Cost: $1,747,745 Cumulative Well Cost: $1,914,752
Daily Mud Cost: $54,185.80 Cumulative Mud Cost: $299,370.66

INFO: {
    "COST DATA": {
        "Drilling AFE Amount": null,
        "Daily Drilling Cost": "$167,006.63",
        "Cumulative Drilling Cost": "$1,747,745",
        "Cumulative Well Cost": "$1,914,752",
        "Daily Mud Cost": "$54,185.80",
        "Cumulative Mud Cost": "$299,370.66"
    }
}
INFO:WellJobInfoExtractor:{
    "COST DATA": {
        "Drilling AFE Amount": null,
       

{
    "COST DATA": {
        "Drilling AFE Amount": null,
        "Daily Drilling Cost": "$167,006.63",
        "Cumulative Drilling Cost": "$1,747,745",
        "Cumulative Well Cost": "$1,914,752",
        "Daily Mud Cost": "$54,185.80",
        "Cumulative Mud Cost": "$299,370.66"
    }
}


Key,Value
Drilling AFE Amount,
Daily Drilling Cost,"$167,006.63"
Cumulative Drilling Cost,"$1,747,745"
Cumulative Well Cost,"$1,914,752"
Daily Mud Cost,"$54,185.80"
Cumulative Mud Cost,"$299,370.66"


INFO: Final DataFrame saved to /dbfs/mnt/mini-proj-dd/final_ocr_results/page_1_section_cost_data_ocr.csv
INFO:WellJobInfoExtractor:Final DataFrame saved to /dbfs/mnt/mini-proj-dd/final_ocr_results/page_1_section_cost_data_ocr.csv
