# pipeline for one pdf

In [2]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract\tesseract.exe"
import pytesseract
from PIL import Image
from io import BytesIO
import numpy as np
import cv2

# Simple test image
img = np.ones((100, 300), dtype=np.uint8) * 255
cv2.putText(img, "TESSERACT TEST", (5, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0), 2)

# Run OCR
text = pytesseract.image_to_string(img)
print("OCR Output:", text)

OCR Output: TESSERACT 17



In [3]:
#Goal is to convert PDF file into image before converting into CSV file. 
import fitz  # PyMuPDF
from pathlib import Path

BASE_DIR = Path(r"C:\Users\immed\Documents\NYCDSA\DAPLINK.AI-Project")
PDF_PATH = BASE_DIR / "Roselle Dental Provider Productivity Jan 2022- Dec 2022.pdf"
IMG_DIR = BASE_DIR / 'PDF Images'

IMG_DIR.mkdir(exist_ok=True)

doc = fitz.open(PDF_PATH)

for i, page in enumerate(doc):
    pix = page.get_pixmap(dpi=300)
    img_path = IMG_DIR / f"page_{i+1:03}.png"
    pix.save(str(img_path))
    print(f"Saved {img_path}")

print("PDF pages converted to images.")

Saved C:\Users\immed\Documents\NYCDSA\DAPLINK.AI-Project\PDF Images\page_001.png
PDF pages converted to images.


In [24]:
import os
import cv2
import pytesseract
import re
import pandas as pd
from pathlib import Path

BASE_DIR = Path(r"C:\Users\immed\Documents\NYCDSA\DAPLINK.AI-Project")
IMG_DIR = BASE_DIR / 'PDF Images'
OUTPUT_CSV = BASE_DIR / 'Productivity_full.csv'

# Match provider lines
provider_decl = re.compile(r"^(.+?),\s*(DDS|RDH)$")

# Match service + 5 numbers
row_pattern = re.compile(
    r"^(.+?,\s*(DDS|RDH)[^$]*?)\s+(\d+)\s+\$?([\d,.]+)\s+\$?([\d,.]+)\s+\$?([\d,.]+)\s+\(?\$?([\d,.]+)\)?"
)

records = []
last_provider = None

for img_file in sorted(os.listdir(IMG_DIR)):
    if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    img_path = os.path.join(IMG_DIR, img_file)
    img = cv2.imread(img_path)
    if img is None:
        print(f"Warning: Cannot load image: {img_file}")
        continue

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    full_text = pytesseract.image_to_string(gray, lang='eng', config='--psm 6')
    lines = [ln.strip() for ln in full_text.splitlines() if ln.strip()]
    print(f"\n--- OCR lines from {img_file} ---")
for i, ln in enumerate(lines):
    print(f"{i+1:02}: {ln}")
    current_provider = last_provider or 'Unknown'

for ln in lines:
    if any(skip in ln for skip in ['TIME', 'DATE', 'Totals', 'Page', 'Note', 'PROVIDER PRODUCTIVITY']):
        continue

    # Extract numeric-like tokens using regex (allows for OCR errors)
    nums = re.findall(r"[-\($]?\$?\d[\d,]*\.?\d{0,2}[\)$]?", ln)

    if len(nums) < 3:
        print(f"Not enough usable numeric fields: {ln}")
        continue

    # Attempt to get provider name using the first numeric match
    provider_split = re.split(r"\$?\d", ln, maxsplit=1)
    provider_raw = provider_split[0].strip()

    if not provider_raw or provider_raw.lower().startswith("totals"):
        continue

    # Pad with '0.00' if we got fewer than 5 numbers
    while len(nums) < 5:
        nums.append('0.00')

    try:
        patients = int(re.sub(r"[^\d]", "", nums[0]) or 0)
        avg_visit = float(re.sub(r"[^\d.]", "", nums[1]) or 0)
        prod = float(re.sub(r"[^\d.]", "", nums[2]) or 0)
        coll = float(re.sub(r"[^\d.]", "", nums[3]) or 0)
        adj = float(re.sub(r"[^\d.]", "", nums[4]) or 0)

        records.append({
            'Provider': provider_raw,
            'Patients Seen': patients,
            'Avg Visit Fee': avg_visit,
            'Production': prod,
            'Collection': coll,
            'Adjustments': adj
        })

    except Exception as e:
        print(f"Final parse error on line: {ln}\n   Reason: {e}")
        continue

df = pd.DataFrame(records)

def split_provider_field(provider):
    if '-' in provider:
        parts = provider.rsplit('-', 1)
        return pd.Series([parts[0].strip(), parts[1].strip()])
    else:
        return pd.Series([provider.strip(), ""])

df[['Provider Name', 'Service Role']] = df['Provider'].apply(split_provider_field)

# Optional: Drop original 'Provider' and reorder columns
df = df[['Provider Name', 'Service Role', 'Patients Seen', 'Avg Visit Fee', 'Production', 'Collection', 'Adjustments']]
        
df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
print(f'Done! {len(df)} rows written to {OUTPUT_CSV}')



--- OCR lines from page_001.png ---
01: TIME 2:30 PM Roselle Dental Center DATE 5/29/2025
02: PROVIDER PRODUCTIVITY
03: From EOD: Jan 03, 22 (01/03/22) To EOD: Dec 30, 22 (12/30/22)
04: S Patients Avg. Ts ees
05: Seen Visit Production Collection Adjustments
06: Harvey Seybold, DDS - Dentist 2152 $170.51 $366,940.53 $644,938.06 ($68,140.36)
07: Emily Hawkins RDH - Hygienist 728 $156.95 $114,256.95 $402.70 ($62.00)
08: Farah Rahman RDH- Hygienist 1325 $139.81 $185,252.20 $0.00 $0.00
09: Hyg Hyg - Hygienist 239 $139.37 $33,310.40 7 55-3 2} $0.00 $0.00
10: IC - Dentist 100 $1,338.51 $133,851.00 $126,709.34 ($20,040.06)
11: Jacqueline Hassenplug, DMI- Dentist 2103 $197.53 $415,411.10 $353,429.40 ($66,030.19)
12: Michele Chiafulio-Zasada, RI- Hygienist 83 $142.71 $11,844.60 $0.00 $0.00
13: Peter S. Chang, DDS - Dentist 57 $310.11 $17,676.10 ... ;  $$9,175.42 ($8,424.32)
14: Roselle Dental Center - Office Staff 0 $0.00 “te $0.00 $0.00
15: Yzabelle Tud - Hygienist £2 0 $0.00 “$0.00 $0.00
16: 

# loop all pdf