Extract Turbidity Std Info

In [1]:
import os
import fitz  # PyMuPDF
import re
import pandas as pd

In [2]:
def abs_path(rel_path=None):
    """
    Resolve a SharePoint-style relative path to a local absolute path
    under the user's 'California Department of Water Resources' folder.

    Parameters
    ----------
    rel_path : str, optional
        Path relative to the 'California Department of Water Resources'
        folder. When omitted (or empty), the folder itself is returned.

    Returns
    -------
    str
        The joined path. It is normalized with os.path.normpath only when
        rel_path is given; the bare base folder is returned as joined.

    Notes
    -----
    The user's home folder is taken from the USERPROFILE environment
    variable when it is set and non-empty; otherwise it falls back to
    os.path.expanduser('~') so the notebook does not raise KeyError on
    machines where USERPROFILE is not defined.
    """
    home = os.environ.get('USERPROFILE') or os.path.expanduser('~')
    base = os.path.join(home, 'California Department of Water Resources')
    return os.path.normpath(os.path.join(base, rel_path)) if rel_path else base

def extract_instrument_id_from_blocks(doc):
    """
    Return the first instrument label of the form 'RTM ###' / 'RTM-###'
    found in the document's text, or None when no line contains one.

    Pages, text blocks (type 0) and lines are scanned in document order.
    The spans of each line are joined with single spaces before matching,
    and the match is case-insensitive. The matched text is returned as
    found (stripped), so the separator between 'RTM' and the digits is
    whatever the PDF contains.
    """
    pattern = re.compile(r'\bRTM[\s\-]*\d+\b', flags=re.IGNORECASE)
    for page in doc:
        for block in page.get_text('dict').get('blocks', []):
            # Only type-0 blocks are inspected; others are skipped.
            if block['type'] != 0:
                continue
            for line in block['lines']:
                joined = ' '.join(span['text'] for span in line['spans'])
                found = pattern.search(joined)
                if found is not None:
                    return found.group(0).strip()
    return None

def extract_fields_from_pdf(pdf_path, is_post):
    """
    Extract Turbidity_Std, Turbidity_FNU_High, Instrument_ID, and Date
    depending on whether the form is POST or PRE.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF form to read.
    is_post : bool
        True to read the POST form's field names, False for the PRE
        form's field names.

    Returns
    -------
    tuple
        (turbidity_std, turbidity_high, instrument_id, date).
        turbidity_high is converted to float, or None when the field is
        missing or not numeric. The other form values are the stripped
        field strings, or None when the field is absent or empty.
        If anything fails while reading the PDF, the error is printed and
        (None, None, None, None) is returned.
    """
    try:
        # Open as a context manager so the document (and its file handle)
        # is closed even when extraction raises part-way through.
        with fitz.open(pdf_path) as doc:
            # Collect every non-empty form field as {name: value}.
            field_dict = {}
            for page in doc:
                for widget in page.widgets():
                    if widget.field_name and widget.field_value:
                        field_dict[widget.field_name.strip()] = str(widget.field_value).strip()

            # The two form layouts store the same values under different
            # field names.
            if is_post:
                turbidity_std = field_dict.get('Text56', None)
                turbidity_high = field_dict.get('TurbReadHigh', None)
                date = field_dict.get('Date', None)
            else:  # PRE
                turbidity_std = field_dict.get('Turbidity_Std_Lot', None)
                turbidity_high = field_dict.get('Turbidity_Std_High_Pre-Cal_Reading', None)
                date = field_dict.get('Calibration_Date', None)

            # A missing (None) or non-numeric reading becomes None rather
            # than aborting the whole file.
            try:
                turbidity_high = float(turbidity_high)
            except (TypeError, ValueError):
                turbidity_high = None

            # The instrument label is read from the page text rather than
            # from a form field, so it must be read before the document
            # is closed.
            instrument_id = extract_instrument_id_from_blocks(doc)

        return turbidity_std, turbidity_high, instrument_id, date

    except Exception as e:
        # Deliberate best-effort: report the file and keep processing.
        print(f"Error reading {pdf_path}: {e}")
        return None, None, None, None

def extract_from_folder(folder_path):
    """
    Extract data from all PRE/POST PDF files in the given folder.

    A file is processed when its name ends in '.pdf' and contains 'pre'
    or 'post' (case-insensitive). A name containing 'post' is treated as
    a POST form, otherwise as a PRE form.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        One row per processed file with the columns Filename,
        Instrument_ID, Turbidity_Std, Turbidity_FNU_High and Date.
        Rows are ordered by filename. When no file matches, the frame is
        empty but still carries those columns.
    """
    columns = ['Filename', 'Instrument_ID', 'Turbidity_Std', 'Turbidity_FNU_High', 'Date']
    records = []
    # os.listdir returns names in arbitrary order; sort so the summary is
    # reproducible from run to run and machine to machine.
    for file in sorted(os.listdir(folder_path)):
        lowered = file.lower()
        if not lowered.endswith('.pdf'):
            continue
        is_post = 'post' in lowered
        if not (is_post or 'pre' in lowered):
            continue
        file_path = os.path.join(folder_path, file)
        turb_std, turb_high, instrument_id, date = extract_fields_from_pdf(file_path, is_post)
        records.append({
            'Filename': file,
            'Instrument_ID': instrument_id,
            'Turbidity_Std': turb_std,
            'Turbidity_FNU_High': turb_high,
            'Date': date
        })

    # Explicit columns keep the schema (and the CSV header) stable even
    # when no records were collected.
    return pd.DataFrame(records, columns=columns)


In [3]:
# Locate the investigation folder, extract one row per PRE/POST form,
# and save the summary next to the source PDFs.
folder = abs_path('DWR Continuous Environmental Monitoring Program - Turbidity Standard Investigation')
df_results = extract_from_folder(folder)
summary_csv_path = os.path.join(folder, 'turbidity_summary.csv')
df_results.to_csv(summary_csv_path, index=False)

In [4]:
# Bare expression as the cell's last line: the notebook renders the
# extracted summary table as this cell's output.
df_results

Unnamed: 0,Filename,Instrument_ID,Turbidity_Std,Turbidity_FNU_High,Date
0,POST_ANHA_100_2024-07-17.pdf,RTM 100,24E24012288,112.92,07/18/2024
1,POST_FRK_100_2024-06-11.pdf,RTM 100,22E24001726,123.88,06/12/2024
2,POST_GZL_78_2024-07-19.pdf,RTM 78,24E24012288,112.04,7/19/2024
3,POST_HON_79_2024-07-19.pdf,RTM 79,24E24012288,113.51,7/19/2024
4,POST_MSDA_55_2024-07-17.pdf,RTM-55,24E24012288,113.55,7/17/2024
5,POST_MSDB_65_2024-07-17.pdf,RTM-65,24E24012288,116.49,7/17/2024
6,POST_PPT_64_2024-07-10.pdf,RTM-64,24E24012288,110.3,07/11/2024
7,POST_RRIA_80_2024-06-05.pdf,RTM-80,23F24003259,123.15,06/05/2024
8,POST_RRIB_81_2024-06-05.pdf,RTM-81,23F24003259,124.03,06/05/2024
9,POST_RVB_78_2024-06-19.pdf,RTM-78,23F24003259,123.5,06/19/2024


In [26]:
def debug_instrument_lines(doc):
    """
    Print every text line of the document that contains 'rtm'
    (case-insensitive), prefixed with 'DEBUG >>'.

    Only type-0 (text) blocks are inspected; the spans of each line are
    joined with single spaces before the check. Because this is a plain
    substring test, words such as 'Department' also match.
    """
    for page in doc:
        text_blocks = (
            blk for blk in page.get_text('dict').get('blocks', [])
            if blk['type'] == 0  # text block
        )
        for blk in text_blocks:
            for line in blk['lines']:
                joined = ' '.join(span['text'] for span in line['spans'])
                if 'rtm' not in joined.lower():
                    continue
                print('DEBUG >>', joined)

# Debug run: print the 'rtm'-containing lines of one sample POST form to
# see how the instrument label appears in the page text.
sample_pdf_path = abs_path('DWR Continuous Environmental Monitoring Program - Turbidity Standard Investigation/POST_ANHA_100_2024-07-17.pdf')
doc = fitz.open(sample_pdf_path)
debug_instrument_lines(doc)

DEBUG >> California Department of Water Resources
DEBUG >> RTM 100
