# Pipeline for a single PDF

In [5]:
import os
import cv2
import pytesseract
import fitz
import re
import pandas as pd

# Single-PDF pipeline: OCR every page image found in IMG_DIR, classify each
# text line (provider declaration / provider summary / procedure row) and
# write the collected procedure rows to OUTPUT_CSV.
# NOTE: PDF_PATH is defined but not referenced below; this cell reads page
# images that already exist in IMG_DIR.
PDF_PATH = 'procedures.pdf'
IMG_DIR = 'tmp_images'
OUTPUT_CSV = 'procedures_full.csv'

# 1. PDF

# Line classifiers applied to each stripped OCR line:
#   row_pattern     -> code, service text, count, amount (optional "$", commas, cents)
#   summary_pattern -> "<provider> 's Total" line
#   provider_decl   -> "<name>, DDS" or "<name>, RDH" line
row_pattern = re.compile(r'^([A-Za-z0-9]{3,7})\s+(.+?)\s+(\d+)\s+\$?([\d,]+(?:\.\d{2})?)$')
summary_pattern = re.compile(r"^(.+?)\s+'s Total")
provider_decl = re.compile(r"^(.+?),\s*(DDS|RDH)$")

records = []
last_provider = None  # most recent provider seen; carried over to the next page

image_exts = ('.png', '.jpg', '.jpeg')

for img_file in sorted(os.listdir(IMG_DIR)):
    # Only .png/.jpg/.jpeg files are processed; everything else is skipped.
    if not img_file.lower().endswith(image_exts):
        continue

    img = cv2.imread(os.path.join(IMG_DIR, img_file))
    if img is None:
        print(f"Warning: 无法读取 {img_file}, 已跳过")
        continue

    # OCR the grayscale page and keep only the non-blank, stripped lines.
    ocr_text = pytesseract.image_to_string(
        cv2.cvtColor(img, cv2.COLOR_BGR2GRAY),
        lang='eng',
        config='--psm 6',
    )
    lines = [s for s in map(str.strip, ocr_text.splitlines()) if s]

    # Inherit the provider from the previous page; 'Unknown' until one is seen.
    current_provider = last_provider or 'Unknown'

    for ln in lines:
        # A provider declaration line: the whole line becomes the provider label.
        if provider_decl.match(ln):
            current_provider = last_provider = ln
            continue

        # A summary line: the text before "'s Total" becomes the provider label.
        m_sum = summary_pattern.match(ln)
        if m_sum:
            current_provider = last_provider = m_sum.group(1).strip()
            continue

        # Anything that is not a table row is ignored.
        m_row = row_pattern.match(ln)
        if m_row is None:
            continue

        code, svc, num, amt = m_row.groups()
        records.append({
            'Provider': current_provider,
            'Code':     code,
            'Service':  svc,
            'Number':   int(num),
            'Amount':   float(amt.replace(',', '')),
        })

# Write the results.
df = pd.DataFrame(records)
df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
print(f'Done! 共提取 {len(df)} 条记录 → {OUTPUT_CSV}')


Done! 共提取 428 条记录 → procedures_full.csv


# Loop over all PDFs

In [13]:
import os
import re
import fitz           # PyMuPDF: pip install pymupdf
import cv2            # pip install opencv-python
import pytesseract    # pip install pytesseract, and make sure Tesseract OCR is installed on your system
import pandas as pd

def parse_procedures(pdf_path):
    """
    Parse a single year's PDF of dental procedures into a DataFrame.

    Each page is rendered to a 300 dpi PNG under ``tmp_<year>/``, OCR'd with
    Tesseract, and every text line is classified as a provider declaration,
    a provider summary line, or a procedure row.

    Args:
        pdf_path: Path to the PDF. The first four-digit run in the file name
            is used as the year (e.g. "procedures_2022.pdf" -> "2022");
            'Unknown' is used when there is none.

    Returns:
        pandas.DataFrame with columns Year, Provider, Code, Service, Number,
        Amount (in that order, also when no rows were found). Provider is
        None for rows that appear before any provider line has been seen.
    """
    # Extract the year from the filename, e.g. "procedures_2022.pdf" -> "2022"
    year_match = re.search(r'(\d{4})', os.path.basename(pdf_path))
    year = year_match.group(1) if year_match else 'Unknown'

    # Create a temporary directory for images
    tmp_dir = f"tmp_{year}"
    os.makedirs(tmp_dir, exist_ok=True)

    # 1) Convert PDF pages to 300 dpi PNG images (PDF user space is 72 dpi).
    #    The rendered paths are remembered in page order so that step 3 only
    #    OCRs this PDF's pages: re-listing the directory would also pick up
    #    stale images from an earlier run, and a lexicographic sort would put
    #    "page_100.png" before "page_11.png".
    zoom = 300 / 72
    matrix = fitz.Matrix(zoom, zoom)
    page_images = []
    doc = fitz.open(pdf_path)
    try:
        for i in range(doc.page_count):
            pix = doc.load_page(i).get_pixmap(matrix=matrix, alpha=False)
            out_path = os.path.join(tmp_dir, f"page_{i+1:02d}.png")
            pix.save(out_path)
            page_images.append(out_path)
    finally:
        # Always release the document handle, even if rendering fails.
        doc.close()

    # 2) Prepare regex patterns
    row_pattern     = re.compile(r'^([A-Za-z0-9]{3,7})\s+(.+?)\s+(\d+)\s+\$?([\d,]+(?:\.\d{2})?)$')
    summary_pattern = re.compile(r"^(.+?)\s+'s Total")
    provider_decl   = re.compile(r"^(.+?),\s*(DDS|RDH)$")

    columns = ['Year', 'Provider', 'Code', 'Service', 'Number', 'Amount']
    records = []
    last_provider = None

    # 3) OCR each rendered page, segment by provider, extract rows
    for img_path in page_images:
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Could not read image {os.path.basename(img_path)}, skipping")
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray, lang='eng', config='--psm 6')
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

        # Rows at the top of a page belong to the provider of the previous page.
        current_provider = last_provider

        for ln in lines:
            # 3.1 If this line declares a provider (e.g. "Harvey Seybold, DDS")
            if provider_decl.match(ln):
                current_provider = ln
                last_provider    = ln
                continue

            # 3.2 If this line is a summary row (e.g. "Harvey Seybold, DDS 's Total")
            m_sum = summary_pattern.match(ln)
            if m_sum:
                current_provider = m_sum.group(1).strip()
                last_provider    = current_provider
                continue

            # 3.3 If this line matches a procedure row, extract its fields
            m_row = row_pattern.match(ln)
            if not m_row:
                continue

            code, svc, num, amt = m_row.groups()
            try:
                amount = float(amt.replace(',', ''))
            except ValueError:
                # The amount group also matches a token made only of commas
                # (OCR noise), which cannot be converted; skip such lines
                # instead of aborting the whole PDF.
                continue

            records.append({
                'Year':     year,
                'Provider': current_provider,
                'Code':     code,
                'Service':  svc,
                'Number':   int(num),
                'Amount':   amount
            })

    # Return a DataFrame for this year; explicit columns keep the schema
    # stable even when nothing was extracted.
    return pd.DataFrame(records, columns=columns)


def main():
    """
    Parse the yearly procedure PDFs and write one combined CSV.

    For each of the years 2022-2025 the file "procedures_<year>.pdf" is
    parsed with parse_procedures() if it exists in the working directory;
    missing files are reported and skipped. When at least one file was
    parsed, the per-year frames are concatenated and saved to
    'procedures_2022_2025_all.csv'. Nothing is written otherwise.
    """
    all_dfs = []

    for y in ('2022', '2023', '2024', '2025'):
        pdf_file = f"procedures_{y}.pdf"
        if os.path.exists(pdf_file):
            df_year = parse_procedures(pdf_file)
            print(f"{y}: extracted {len(df_year)} rows")
            all_dfs.append(df_year)
        else:
            print(f"{pdf_file} not found, skipping")

    # No PDF was found: there is nothing to combine or save.
    if not all_dfs:
        return

    # Stack all years into one frame with a fresh 0..n-1 index and save it.
    out_csv = 'procedures_2022_2025_all.csv'
    df_all = pd.concat(all_dfs, ignore_index=True)
    df_all.to_csv(out_csv, index=False, encoding='utf-8-sig')
    print(f"All years combined: {len(df_all)} rows → {out_csv}")


if __name__ == "__main__":
    main()


2022: extracted 428 rows
2023: extracted 408 rows
2024: extracted 399 rows
2025: extracted 236 rows
All years combined: 1471 rows → procedures_2022_2025_all.csv
