In [4]:
import os
import re
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader, PdfWriter

# ------------------------------------------------------------------
# 1. Utility: List PDFs in a Folder
# ------------------------------------------------------------------
def list_pdf_files_in_folder(folder_path):
    """
    Collect the names of the PDF files located directly inside a folder.

    An entry qualifies when its name ends with ".pdf" (compared
    case-insensitively) and it is a regular file, not a directory.
    Bare file names (not full paths) are returned, in the order
    os.listdir yields them.
    """
    return [
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(folder_path, name))
    ]

# ------------------------------------------------------------------
# 2. Vendor Identification and Invoice Number Extraction
# ------------------------------------------------------------------
def identify_vendor_from_text(page_text):
    """
    Return the first known vendor keyword that occurs in the page text.

    Keywords are tried in the order listed below and compared as
    case-sensitive substrings. When none of them occurs, 'other' is returned.
    """
    known_vendors = (
        "HESCO", "WORLD", "Mayer", "GRAYBAR", "DIXIE CONSTRUCTION", "Hilti", "Magid Glove",
        "Thryv", "WHITE CAP", "B&D CONCRETE", "asesix", "CONVERGINT", "Herc", "HOPE ENGRAVING",
        "LOGOMAN", "Metro Trailer", "MIS Solutions", "Workright", "Eckart", "United Rentals",
    )
    # next() with a default gives "first hit, else fallback" in one expression.
    return next((name for name in known_vendors if name in page_text), "other")

# Vendor keyword -> (compiled pattern, group to return, normalise '$' to 'S').
# The third flag is set for vendors whose invoice numbers start with "S",
# which the pattern also accepts as "$"; a leading "$" is turned back into "S".
_INVOICE_RULES = {
    "HESCO": (re.compile(r'(?:S|\$)100\d{6}\.\d{3}'), 0, True),
    "WORLD": (re.compile(r'(?:S|\$)\d{9}\.\d{3}'), 0, True),
    "Mayer": (re.compile(r'Invoice #:\s*(\d+)'), 1, False),
    "GRAYBAR": (re.compile(r'93[34]\d{7}'), 0, False),
    "DIXIE CONSTRUCTION": (re.compile(r'(50|51|52|89)\d{5}-\d{2}'), 0, False),
    "Hilti": (re.compile(r'46\d{8}'), 0, False),
    "Magid Glove": (re.compile(r'5\d{6}'), 0, False),
    "Thryv": (re.compile(r'(3|4|5|6|7)\d{6}'), 0, False),
    "WHITE CAP": (re.compile(r'(1|5)\d{10}'), 0, False),
    "B&D CONCRETE": (re.compile(r'(8[3-9]|9[0-9])\d{3}'), 0, False),
    "asesix": (re.compile(r'Invoice #\s*(\d{5,9})'), 1, False),
    # "401" + two capitals + either five digits or four digits and a capital.
    "CONVERGINT": (re.compile(r'401[A-Z]{2}(\d{5}|\d{4}[A-Z])'), 0, False),
    "Herc": (re.compile(r'3[3-9]\d{6}-\d{3}'), 0, False),
    "HOPE ENGRAVING": (re.compile(r'(?<=Invoice #)[^0-9]*?(\d{3})'), 1, False),
    "LOGOMAN": (re.compile(r'(?<=LOGOMAN MARKETING GROUP )\d{5}'), 0, False),
    "Metro Trailer": (re.compile(r'RI\d{7}'), 0, False),
    "MIS Solutions": (re.compile(r'CW-\d{6}'), 0, False),
    "Workright": (re.compile(r'2[4-5]-\d{4}'), 0, False),
    # The separator before the 3-digit suffix is a literal dot, as for HESCO/WORLD.
    "Eckart": (re.compile(r'(?:S|\$)10\d{7}\.\d{3}'), 0, True),
    # NOTE(review): the returned match includes the leading "#" and any
    # whitespace after it; confirm that is wanted, since process_big_pdf puts
    # this value into a file name.
    "United Rentals": (re.compile(r'#\s*\d{9}-\d{3}'), 0, False),
}


def extract_invoice_number(vendor, full_text):
    """
    Extract the invoice number for a vendor using its vendor-specific pattern.

    Parameters:
        vendor: vendor keyword as returned by identify_vendor_from_text.
        full_text: text to search (the first match in the text wins).

    Returns:
        The invoice number as a string, or None when the vendor has no
        pattern or the pattern does not match. For HESCO, WORLD and Eckart
        a leading '$' in the match is replaced by 'S'.
    """
    rule = _INVOICE_RULES.get(vendor)
    if rule is None:
        return None

    pattern, group, dollar_to_s = rule
    match = pattern.search(full_text)
    if match is None:
        return None

    invoice_num = match.group(group)
    if dollar_to_s and invoice_num.startswith('$'):
        invoice_num = 'S' + invoice_num[1:]
    return invoice_num

# ------------------------------------------------------------------
# 3. Split Logic Using "Page x of X"
# ------------------------------------------------------------------
def parse_page_of_pattern(page_text):
    """
    Find the first 'Page x of X' marker in the OCR text (case-sensitive).

    Returns the tuple (x, X) with both numbers converted to int, or None
    when the text contains no such marker.
    """
    found = re.search(r"Page\s+(\d+)\s+of\s+(\d+)", page_text)
    if found is None:
        return None
    current_page, total_pages = (int(number) for number in found.groups())
    return (current_page, total_pages)

def split_invoices(pdf_path):
    """
    Group the pages of a PDF into invoices.

    1) Convert each page to an image.
    2) OCR the first page of each group, identify its vendor and look for
       'Page x of X'.
    3) If the marker is found, extend the group by the (X - x) pages that
       follow, capped at the last page of the document; those pages are
       not OCR'd here.

    Returns a list of dicts:
    [
      {
        'vendor': <str>,
        'page_range': (start_page, end_page),  # 1-based, inclusive
      },
      ...
    ]
    """
    pages = convert_from_path(pdf_path)
    total_pages = len(pages)

    invoice_groups = []
    current_page_num = 1

    while current_page_num <= total_pages:
        ocr_text = pytesseract.image_to_string(pages[current_page_num - 1])
        vendor = identify_vendor_from_text(ocr_text)
        page_info = parse_page_of_pattern(ocr_text)

        invoice_start = current_page_num
        invoice_end = current_page_num

        if page_info is not None:
            curr_p, total_p = page_info
            pages_after_this = total_p - curr_p
            # A misread marker such as "Page 3 of 2" gives a negative count.
            # Treat it as a single-page invoice so the end page never falls
            # before the start page, which would keep the loop from advancing.
            if pages_after_this > 0:
                invoice_end = min(current_page_num + pages_after_this, total_pages)

        invoice_groups.append({
            "vendor": vendor,
            "page_range": (invoice_start, invoice_end)
        })

        current_page_num = invoice_end + 1

    return invoice_groups

# ------------------------------------------------------------------
# 4. Extract Text, Write PDF Pages
# ------------------------------------------------------------------
def extract_text_for_pages(pdf_path, start_page, end_page):
    """
    OCR a page range of a PDF and return the concatenated text.

    start_page and end_page are handed to convert_from_path as first_page
    and last_page. The text of the rendered pages is joined in page order
    with no separator added.
    """
    images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page)
    return "".join(pytesseract.image_to_string(image) for image in images)

def write_pdf_pages(original_pdf, page_range, output_path):
    """
    Copy a page range of a PDF into a new PDF file.

    page_range is a (start_page, end_page) pair, 1-based and inclusive;
    the selected pages are written to output_path in their original order.
    """
    source = PdfReader(original_pdf)
    target = PdfWriter()

    first, last = page_range
    # reader.pages is 0-based, so page `first` lives at index first - 1.
    for zero_based in range(first - 1, last):
        target.add_page(source.pages[zero_based])

    with open(output_path, "wb") as out_file:
        target.write(out_file)

# ------------------------------------------------------------------
# 5. Main Logic to Process a Single Big PDF
#     -- "others" file name based on input file
#     -- unknown_n+1 logic to avoid overwriting unknown files
# ------------------------------------------------------------------
def process_big_pdf(pdf_path, output_folder, unknown_counters):
    """
    Split one batch PDF into per-invoice PDFs inside output_folder.

    Invoices of a recognised vendor are written as
    "<vendor lowercased>_<invoice number>.pdf", or as
    "<vendor lowercased>_unknown_<n>.pdf" when no invoice number is found.
    All pages whose vendor is 'other' are gathered into one file named
    "<input file name without extension>_others.pdf".

    unknown_counters: dict mapping vendor -> next number to use for an
                      "unknown" file name. It is updated in place so the
                      numbering continues across calls; a vendor missing
                      from the dict starts at 1.

    Returns a list with one dict per invoice group holding 'invoice_index',
    'vendor', 'page_range', 'output_file' and, for recognised vendors,
    'invoice_number' (None when not found).
    """
    # Create the output folder on first use.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # The "others" file is named after the input file, e.g. "big_invoices_others.pdf".
    source_stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    others_file_name = source_stem + "_others.pdf"

    groups = split_invoices(pdf_path)
    others_writer = PdfWriter()
    reader = PdfReader(pdf_path)
    results = []

    for position, group in enumerate(groups, start=1):
        vendor = group["vendor"]
        page_range = group["page_range"]
        first_page, last_page = page_range

        if vendor == "other":
            # Unrecognised vendors are pooled; the pooled file is written after the loop.
            for page_index in range(first_page - 1, last_page):
                others_writer.add_page(reader.pages[page_index])
            results.append({
                "invoice_index": position,
                "vendor": "other",
                "page_range": page_range,
                "output_file": others_file_name
            })
            continue

        # Recognised vendor: OCR the whole group and look for its invoice number.
        group_text = extract_text_for_pages(pdf_path, first_page, last_page)
        invoice_num = extract_invoice_number(vendor, group_text)
        file_prefix = vendor.lower()

        if not invoice_num:
            # No number found: use a per-vendor running counter so that
            # "unknown" files do not overwrite each other.
            serial = unknown_counters.get(vendor, 1)
            unknown_counters[vendor] = serial + 1
            new_file_name = f"{file_prefix}_unknown_{serial}.pdf"
        else:
            new_file_name = f"{file_prefix}_{invoice_num}.pdf"

        write_pdf_pages(pdf_path, page_range, os.path.join(output_folder, new_file_name))

        results.append({
            "invoice_index": position,
            "vendor": vendor,
            "page_range": page_range,
            "invoice_number": invoice_num,
            "output_file": new_file_name
        })

    # Write the pooled 'other' pages only if there are any.
    if len(others_writer.pages) > 0:
        with open(os.path.join(output_folder, others_file_name), "wb") as others_file:
            others_writer.write(others_file)

    return results

# ------------------------------------------------------------------
# 6. Batch Processing Multiple PDFs + Progress Updates
# ------------------------------------------------------------------
if __name__ == "__main__":
    folder_to_scan = "C:/Users/Cooper Foster/Desktop/Vendor Batch Input/"  # Replace with your folder
    output_folder = "C:/Users/Cooper Foster/Desktop/Vendor Batch Output/"  # Where to save the results

    # Only files whose 1-based position in the listing lies in this inclusive
    # range are processed; the final message is built from the same values.
    # NOTE(review): positions follow the order returned by os.listdir, which
    # is not guaranteed to be sorted — confirm the range selects the intended files.
    first_file_index = 26
    last_file_index = 82

    # Next "unknown" file number per vendor, so that invoices without a
    # detected number do not overwrite each other within this run.
    unknown_counters = {
        "HESCO": 1,
        "WORLD": 1,
        "Mayer": 1,
        "GRAYBAR": 1,
        "DIXIE CONSTRUCTION": 1,
        "Hilti": 1,
        "Magid Glove": 1,
        "Thryv": 1,
        "WHITE CAP": 1,
        "B&D CONCRETE": 1,
        "asesix": 1,
        "CONVERGINT": 1,
        "Herc": 1,
        "HOPE ENGRAVING": 1,
        "LOGOMAN": 1,
        "Metro Trailer": 1,
        "MIS Solutions": 1,
        "Workright": 1,
        "Eckart": 1,
        "United Rentals": 1
    }

    # 1) Gather PDFs
    pdf_documents = list_pdf_files_in_folder(folder_to_scan)
    total_files = len(pdf_documents)

    # 2) Loop through each PDF in the selected range, process it, show progress.
    #    The loop lives under the __main__ guard so that importing this module
    #    has no side effects.
    for index, file_name in enumerate(pdf_documents, start=1):
        if index < first_file_index or index > last_file_index:
            continue  # Skip all files outside the selected range

        pdf_path = os.path.join(folder_to_scan, file_name)
        print(f"Processing file {index} of {total_files}: {file_name}")

        processing_results = process_big_pdf(pdf_path, output_folder, unknown_counters)

        print(f"   Done. Split into {len(processing_results)} invoice(s).")

    print(f"\nAll applicable PDF files ({first_file_index}–{last_file_index}) have been processed!")

ModuleNotFoundError: No module named 'PyPDF2'

In [5]:
import sys
# Print the path of the Python interpreter running this cell — presumably to
# find which environment is missing PyPDF2 (see the ModuleNotFoundError above).
print(sys.executable)

c:\Users\Cooper Foster\AppData\Local\Programs\Python\Python313\python.exe
