# Pipeline for one PDF

In [29]:
# Goal: render every page of the selected PDF to a PNG image so the pages can
# be OCR'd and converted into a CSV file afterwards.
import os
import re
from io import BytesIO
from pathlib import Path

import cv2
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image

# Point pytesseract at the local Tesseract binary. This has to come AFTER
# `import pytesseract`; assigning it before the import raises NameError on a
# fresh kernel.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract\tesseract.exe"

BASE_DIR = Path(r"C:\Users\immed\Documents\NYCDSA\DAPLINK.AI-Project")
# Uncomment exactly one PDF_PATH to choose which report to process.
#PDF_PATH = BASE_DIR / "Roselle Dental Provider Productivity Jan 2022- Dec 2022.pdf"
#PDF_PATH = BASE_DIR / "Roselle Dental Provider Productivity Jan 2023- Dec 2023.pdf"
#PDF_PATH = BASE_DIR / "Roselle Dental Provide Productivity Jan 2024- Dec 2024.pdf"
PDF_PATH = BASE_DIR / "Roselle Dental Provider Productivity EOD Jan 2025- May2025.pdf"
IMG_DIR = BASE_DIR / 'PDF Images'

IMG_DIR.mkdir(exist_ok=True)

# Remove page images left over from a previously rendered PDF. Without this,
# a shorter PDF would leave stale higher-numbered pages in the folder and any
# step that reads every image in IMG_DIR would mix reports together.
for stale_page in IMG_DIR.glob("page_*.png"):
    stale_page.unlink()

# The context manager closes the PDF handle even if rendering fails part-way.
with fitz.open(PDF_PATH) as doc:
    for page_no, page in enumerate(doc, start=1):
        # 300 dpi keeps small report text legible for OCR.
        pix = page.get_pixmap(dpi=300)
        img_path = IMG_DIR / f"page_{page_no:03}.png"
        pix.save(str(img_path))
        print(f"Saved {img_path}")

print("PDF pages converted to images.")

Saved C:\Users\immed\Documents\NYCDSA\DAPLINK.AI-Project\PDF Images\page_001.png
PDF pages converted to images.


In [30]:
# Uncomment exactly one OUTPUT_CSV to match the PDF being processed.
#OUTPUT_CSV = BASE_DIR / 'Productivity_2022.csv'
#OUTPUT_CSV = BASE_DIR / 'Productivity_2023.csv'
#OUTPUT_CSV = BASE_DIR / 'Productivity_2024.csv'
OUTPUT_CSV = BASE_DIR / 'Productivity_2025.csv'

# Match provider lines (not used by the tolerant parser below; kept available)
provider_decl = re.compile(r"^(.+?),\s*(DDS|RDH)$")

# Match service + 5 numbers (not used by the tolerant parser below; kept available)
row_pattern = re.compile(
    r"^(.+?,\s*(DDS|RDH)[^$]*?)\s+(\d+)\s+\$?([\d,.]+)\s+\$?([\d,.]+)\s+\$?([\d,.]+)\s+\(?\$?([\d,.]+)\)?"
)

# Substrings that mark header/footer lines rather than provider rows.
# 'EOD' covers the "From EOD: ... To EOD: ..." date-range header, whose dates
# would otherwise supply enough numeric tokens to be parsed as a provider row.
SKIP_MARKERS = ('TIME', 'DATE', 'Totals', 'Page', 'Note', 'PROVIDER PRODUCTIVITY', 'EOD')

# Numeric-like token, tolerant of OCR noise: optional sign/paren/dollar prefix,
# digits with thousands separators, optional decimals, optional closing paren.
NUMERIC_TOKEN = re.compile(r"[-\($]?\$?\d[\d,]*\.?\d{0,2}[\)$]?")


def _to_amount(token):
    """Convert an OCR'd money token such as '$1,443.85' or '($91.40)' to a float.

    A leading '(' (accounting notation) or '-' marks a negative amount.
    Raises ValueError if the remaining digits do not form a number.
    """
    value = float(re.sub(r"[^\d.]", "", token) or 0)
    if value and token.startswith(('(', '-')):
        return -value
    return value


def _parse_productivity_line(ln):
    """Parse one OCR text line into a record dict, or return None to skip it.

    A usable line has a provider label followed by at least three numeric
    tokens, read in the order: patients seen, average visit fee, production,
    collection, adjustments. Missing trailing values are filled with 0.00.
    """
    if any(skip in ln for skip in SKIP_MARKERS):
        return None

    nums = NUMERIC_TOKEN.findall(ln)
    if len(nums) < 3:
        print(f"Not enough usable numeric fields: {ln}")
        return None

    # Everything before the first digit is the provider label; drop the
    # stray ';' / ':' / '|' that OCR tends to leave at the column boundary.
    provider_raw = re.split(r"\$?\d", ln, maxsplit=1)[0].strip().rstrip(";:| ")
    if not provider_raw or provider_raw.lower().startswith("totals"):
        return None

    # Pad with '0.00' if we got fewer than 5 numbers
    nums = nums + ['0.00'] * (5 - len(nums))

    try:
        return {
            'Provider': provider_raw,
            'Patients Seen': int(re.sub(r"[^\d]", "", nums[0]) or 0),
            'Avg Visit Fee': _to_amount(nums[1]),
            'Production': _to_amount(nums[2]),
            'Collection': _to_amount(nums[3]),
            'Adjustments': _to_amount(nums[4]),
        }
    except ValueError as e:
        print(f"Final parse error on line: {ln}\n   Reason: {e}")
        return None


records = []
last_provider = None

for img_file in sorted(os.listdir(IMG_DIR)):
    if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    img_path = os.path.join(IMG_DIR, img_file)
    img = cv2.imread(img_path)
    if img is None:
        print(f"Warning: Cannot load image: {img_file}")
        continue

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    full_text = pytesseract.image_to_string(gray, lang='eng', config='--psm 6')
    lines = [ln.strip() for ln in full_text.splitlines() if ln.strip()]

    print(f"\n--- OCR lines from {img_file} ---")
    for i, ln in enumerate(lines):
        print(f"{i+1:02}: {ln}")

    # Parse inside the per-image loop so every page contributes rows, not
    # just the last image that happened to be read.
    for ln in lines:
        record = _parse_productivity_line(ln)
        if record is not None:
            records.append(record)

df = pd.DataFrame(records)

def split_provider_field(provider):
    """Split a provider label into (name, role) at its last hyphen.

    Returns a two-element pandas Series: the text before the last '-' and the
    text after it, both stripped. When the label has no hyphen, the whole
    stripped label is the name and the role is an empty string.
    """
    name_part, hyphen, role_part = provider.rpartition('-')
    if not hyphen:
        # No separator found: treat the entire label as the name.
        return pd.Series([provider.strip(), ""])
    return pd.Series([name_part.strip(), role_part.strip()])

# Final column order for the exported CSV (the raw 'Provider' label is dropped
# once it has been split into name and role).
output_columns = ['Provider Name', 'Service Role', 'Patients Seen', 'Avg Visit Fee', 'Production', 'Collection', 'Adjustments']

if df.empty:
    # With no parsed rows there is no 'Provider' column to split, and writing
    # would replace any existing CSV with an empty one, so report and stop.
    print(f'No provider rows were parsed; nothing written to {OUTPUT_CSV}')
else:
    df[['Provider Name', 'Service Role']] = df['Provider'].apply(split_provider_field)

    # Drop original 'Provider' and reorder columns
    df = df[output_columns]

    # utf-8-sig adds a BOM to the file.
    df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
    print(f'Done! {len(df)} rows written to {OUTPUT_CSV}')



--- OCR lines from page_001.png ---
01: TIME 2:37 PM 7 Roselle Dental Center DATE 5/29/2025 ~~
02: PROVIDER PRODUCTIVITY __
03: From EOD: Jan 03, 25 (01/03/25) To EOD: May 28; 25 (05/28/25)
04: Patients Avg.
05: Seen Visit Production Collection Adjustments
06: Harvey Seybold, DDS - Dentist ; 1025 $196.90 $201,825.07 $279,275.78 ($46,806.55)
07: Farah Rahman RDH - Hygienist : 467 $160.10 $74,764.93: » “> $0.00 $0.00
08: Jacqueline Hassenplug, DMI- Dentist 416 $290.88 $121,006.05 $95,654.08 ($18,020.08)
09: Michele Chiafulio-Zasada, RI- Hygienist 36 $159.16 $5,729;:65«: isi diss) $0.00 $0.00
10: Peter S. Chang, DDS - Dentist : 7 $343.84 $2,406.85 — $1,443.85 ($91.40)
11: Yzabelle Tud - Hygienist 408 $166.28 $67,843.85 “$0.00 $0.00
12: Totals 1596 $296.73 $473,576.40 $376,373.71 ($64,918.03)
13: ““ Note - Total patients seen _ is the total for the system, the total for each provider may include patiénts seen by multiple providers. |
14: “™" Note - Adjustments Total _ is the total of the 