In [2]:
import os
import re
from pdf2image import convert_from_path
import pytesseract

from PyPDF2 import PdfReader, PdfWriter

# ------------------------------------------------------------------
# 1. Vendor Identification and Invoice Number Extraction
# ------------------------------------------------------------------

def identify_vendor_from_text(page_text):
    """
    Return the first main vendor name that occurs in ``page_text``.

    The four names (HESCO, WORLD, Mayer, GRAYBAR) are tried in that fixed
    order with a case-sensitive substring test, so when several names appear
    in the same text the one earliest in that order wins.  If none of them
    is present the string 'other' is returned.
    """
    known_vendors = ("HESCO", "WORLD", "Mayer", "GRAYBAR")
    return next((name for name in known_vendors if name in page_text), "other")


def extract_invoice_number(vendor, full_text):
    """
    Extract the invoice number for ``vendor`` from the combined OCR text of
    all pages in an invoice.

    Recognised formats (first occurrence in ``full_text`` wins):
      * HESCO   - "S100" + 6 digits + "." + 3 digits, e.g. "S100123456.789"
      * WORLD   - "S" + 9 digits + "." + 3 digits,    e.g. "S123456789.123"
      * Mayer   - the digits following the label "Invoice #:",
                  e.g. "Invoice #: 34066209" -> "34066209"
      * GRAYBAR - a standalone 10-digit number starting with 934,
                  e.g. "9341234567"

    For HESCO and WORLD a leading "$" is accepted in place of the "S" and is
    rewritten to "S" in the returned value, since OCR may render the letter
    as a dollar sign.

    Returns the invoice number as a string, or None if the vendor is not one
    of the four above or no matching number is present.
    """
    patterns = {
        "HESCO": r'(?:S|\$)100\d{6}\.\d{3}',
        "WORLD": r'(?:S|\$)\d{9}\.\d{3}',
        "Mayer": r'Invoice #:\s*(\d+)',
        # The lookarounds keep the match from being cut out of a longer digit
        # run (account numbers, tracking numbers, ...) that merely contains
        # "934" followed by seven more digits.
        "GRAYBAR": r'(?<!\d)934\d{7}(?!\d)',
    }

    pattern = patterns.get(vendor)
    if pattern is None:
        return None  # vendor is 'other' or otherwise unrecognised

    match = re.search(pattern, full_text)
    if match is None:
        return None

    if vendor == "Mayer":
        # Only the digits after the label are the invoice number.
        return match.group(1)

    invoice_num = match.group(0)
    if invoice_num.startswith('$'):
        # Correct the OCR misread of the leading "S".
        invoice_num = 'S' + invoice_num[1:]
    return invoice_num

# ------------------------------------------------------------------
# 2. Split Logic Using "Page x of X"
# ------------------------------------------------------------------

def parse_page_of_pattern(page_text):
    """
    Find the first 'Page x of X' marker in the OCR text (case-sensitive).

    Returns a tuple ``(x, X)`` of integers for the first marker found, or
    None when the text contains no such marker.
    """
    found = re.search(r"Page\s+(\d+)\s+of\s+(\d+)", page_text)
    if found is None:
        return None
    page_no, page_count = (int(digits) for digits in found.groups())
    return (page_no, page_count)

def split_invoices(pdf_path):
    """
    Split the big PDF into invoice page groups.

    1) Convert each page of the PDF to an image.
    2) OCR the first page of each group, identify the vendor on it and look
       for a 'Page x of X' marker.
    3) If the marker is present, the following ``X - x`` pages are assumed to
       belong to the same invoice and are grouped with it without being
       OCR'd here; otherwise the page is treated as a single-page invoice.

    Returns a list of dictionaries:
    [
      {
        'vendor': <str>,                       # from the group's first page
        'page_range': (start_page, end_page),  # 1-based, inclusive
      },
      ...
    ]

    NOTE: Invoice number extraction is not done here; it happens after all
    pages of an invoice have been gathered.
    """
    # Convert to images for OCR
    pages = convert_from_path(pdf_path)
    total_pages = len(pages)

    invoice_groups = []
    current_page_num = 1

    while current_page_num <= total_pages:
        # OCR the first page of the next group
        ocr_text = pytesseract.image_to_string(pages[current_page_num - 1])
        vendor = identify_vendor_from_text(ocr_text)
        page_info = parse_page_of_pattern(ocr_text)

        invoice_start = current_page_num
        invoice_end = current_page_num  # default: single-page invoice

        if page_info is not None:
            curr_p, total_p = page_info
            # "Page x of X" means X - x more pages follow this one
            # (for "Page 1 of X" that is the next X - 1 pages).
            invoice_end = min(current_page_num + (total_p - curr_p), total_pages)
            # A misread marker such as "Page 3 of 2" or "Page 1 of 0" yields
            # a negative remainder; without this clamp the end page would fall
            # before the start page and the loop would never advance.
            invoice_end = max(invoice_end, invoice_start)

        invoice_groups.append({
            "vendor": vendor,
            "page_range": (invoice_start, invoice_end)
        })

        current_page_num = invoice_end + 1

    return invoice_groups

# ------------------------------------------------------------------
# 3. Creating the Final PDFs
# ------------------------------------------------------------------

def extract_text_for_pages(pdf_path, start_page, end_page):
    """
    OCR the pages [start_page, end_page] (1-based, inclusive) of the PDF and
    return their text concatenated in page order.

    Only the requested pages are rendered to images, using pdf2image's
    ``first_page`` / ``last_page`` arguments, rather than the whole document.
    """
    page_images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page)
    return "".join(pytesseract.image_to_string(image) for image in page_images)

def write_pdf_pages(original_pdf, page_range, output_path):
    """
    Copy a contiguous run of pages out of ``original_pdf`` into a new PDF.

    ``page_range`` is a ``(start_page, end_page)`` pair, 1-based and
    inclusive.  The selected pages are written, in order, to ``output_path``
    using PyPDF2's PdfReader / PdfWriter.
    """
    source = PdfReader(original_pdf)
    target = PdfWriter()

    first_page, last_page = page_range
    # Shift the 1-based inclusive range onto PyPDF2's 0-based page indices.
    page_index = first_page - 1
    while page_index < last_page:
        target.add_page(source.pages[page_index])
        page_index += 1

    with open(output_path, "wb") as out_file:
        target.write(out_file)

def process_big_pdf(pdf_path, output_folder, others_filename="others.pdf"):
    """
    Main function: split a big multi-invoice PDF into per-invoice files.

    1) Split the big PDF into invoice groups using the 'Page x of X' logic.
    2) For each group:
       - If vendor is one of the four main vendors:
         * Combine OCR text for those pages and extract the invoice number.
         * Write the pages to ``<vendor>_<invoicenumber>.pdf``, or to
           ``<vendor>_unknown.pdf`` when no invoice number was found.
         * If that file name was already produced by an earlier group in this
           run, the group index is appended (``..._<idx>.pdf``) so the earlier
           file is not overwritten.
       - If vendor is 'other', accumulate those pages in a single combined
         PDF named ``others_filename``.
    3) Return a summary of what happened.

    Parameters:
        pdf_path: path of the big PDF to split.
        output_folder: directory for the generated files; created if missing.
        others_filename: file name (inside ``output_folder``) for the combined
            'other' pages.

    Returns:
        A list with one dict per invoice group, in document order, holding
        'invoice_index', 'vendor', 'page_range' and 'output_file'; entries for
        the four main vendors also hold 'invoice_number' (None if not found).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_folder, exist_ok=True)

    # 1) Split to identify each invoice's page range
    invoice_groups = split_invoices(pdf_path)

    # Prepare a single writer for "others"
    others_writer = PdfWriter()
    reader = PdfReader(pdf_path)  # to copy pages from

    # We'll store a log of results
    results = []

    # File names already written in this run, used to detect collisions
    used_names = set()

    for idx, inv in enumerate(invoice_groups, start=1):
        vendor = inv["vendor"]
        page_range = inv["page_range"]
        start_p, end_p = page_range

        if vendor == "other":
            # Accumulate these pages in others_writer
            for p in range(start_p - 1, end_p):
                others_writer.add_page(reader.pages[p])

            # Log entry (report the file name that is actually written)
            results.append({
                "invoice_index": idx,
                "vendor": "other",
                "page_range": page_range,
                "output_file": f"{others_filename} (combined)"
            })

        else:
            # It's one of the four main vendors
            # 2) Extract combined text from these pages
            full_text = extract_text_for_pages(pdf_path, start_p, end_p)

            # 3) Extract the invoice number
            invoice_num = extract_invoice_number(vendor, full_text)

            # Fall back to "unknown" if we can't find a number
            base_name = f"{vendor.lower()}_{invoice_num or 'unknown'}"
            new_file_name = f"{base_name}.pdf"
            if new_file_name in used_names:
                # Same name as an earlier group (e.g. a second "unknown" or a
                # repeated invoice number): idx is unique per group, so
                # appending it keeps both files.
                new_file_name = f"{base_name}_{idx}.pdf"
            used_names.add(new_file_name)

            out_path = os.path.join(output_folder, new_file_name)
            write_pdf_pages(pdf_path, page_range, out_path)

            # Log entry
            results.append({
                "invoice_index": idx,
                "vendor": vendor,
                "page_range": page_range,
                "invoice_number": invoice_num,
                "output_file": new_file_name
            })

    # Finally, write out the combined "other" pages as one PDF
    if len(others_writer.pages) > 0:
        others_out_path = os.path.join(output_folder, others_filename)
        with open(others_out_path, "wb") as f:
            others_writer.write(f)

    return results

# ------------------------------------------------------------------
# 4. Putting It All Together
# ------------------------------------------------------------------

if __name__ == "__main__":
    # Example usage: split one big PDF of invoices and print a summary of
    # the result. Both paths are hard-coded for a specific machine; edit
    # them before running elsewhere.
    input_pdf = "/Users/cooperfoster/Downloads/testfile.pdf"       # The big PDF with many invoices
    output_dir = "/Users/cooperfoster/Desktop/output2"         # Where to save the split PDFs

    # One result dict per invoice group, in document order
    processing_results = process_big_pdf(input_pdf, output_dir)
    
    # Print a quick summary of how things were split
    print("=== Processing Results ===")
    for r in processing_results:
        # Entries for 'other' have no invoice_number key, so they get a
        # shorter line; main-vendor entries also show the invoice number
        # (printed as None when no number was found).
        if r["vendor"] == "other":
            print(f"Invoice #{r['invoice_index']}: vendor=other, pages={r['page_range']}, output={r['output_file']}")
        else:
            print(f"Invoice #{r['invoice_index']}: vendor={r['vendor']}, pages={r['page_range']}, "
                  f"invoice_number={r.get('invoice_number')}, output={r['output_file']}")

=== Processing Results ===
Invoice #1: vendor=other, pages=(1, 1), output=others.pdf (combined)
Invoice #2: vendor=HESCO, pages=(2, 2), invoice_number=S100270624.002, output=hesco_S100270624.002.pdf
Invoice #3: vendor=HESCO, pages=(3, 3), invoice_number=S100270279.001, output=hesco_S100270279.001.pdf
Invoice #4: vendor=HESCO, pages=(4, 4), invoice_number=S100270908.001, output=hesco_S100270908.001.pdf
Invoice #5: vendor=HESCO, pages=(5, 5), invoice_number=S100270908.002, output=hesco_S100270908.002.pdf
Invoice #6: vendor=HESCO, pages=(6, 6), invoice_number=S100270952.001, output=hesco_S100270952.001.pdf
Invoice #7: vendor=HESCO, pages=(7, 7), invoice_number=S100271224.001, output=hesco_S100271224.001.pdf
Invoice #8: vendor=HESCO, pages=(8, 8), invoice_number=S100270906.001, output=hesco_S100270906.001.pdf
Invoice #9: vendor=HESCO, pages=(9, 9), invoice_number=S100270906.002, output=hesco_S100270906.002.pdf
Invoice #10: vendor=HESCO, pages=(10, 10), invoice_number=S100270624.003, output