# In [None]: (notebook cell marker left over from export)
import pytesseract
from PIL import Image
import pandas as pd
import re
import os

# --- Configuration ---
# Path (or command name) of the Tesseract executable.
# If Tesseract is on your system's PATH, the plain command 'tesseract' is enough
# (typical for macOS/Linux package-manager installs).
# On Windows you usually need the full path, e.g.
# 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
tesseract_cmd_path = 'tesseract' # <--- Update this if needed

# Replace with the path to your receipt image file
image_file_path = 'path/to/your/receipt_image.png' # <--- Update this

# --- Set the Tesseract command path for pytesseract ---
# pytesseract reads the executable location from the `tesseract_cmd` attribute
# of its inner `pytesseract.pytesseract` module; assigning the attribute on the
# top-level package instead is silently ignored, so a custom path would never
# take effect. The assignment itself cannot fail: a missing executable only
# surfaces as TesseractNotFoundError later, when OCR is actually run.
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd_path

# --- Function to process the image ---
def extract_receipt_items_from_image(image_path: str) -> pd.DataFrame | None:
    """
    Reads a receipt image, performs OCR, and attempts to extract line items.

    Args:
        image_path (str): The path to the receipt image file.

    Returns:
        pandas.DataFrame or None: A DataFrame containing extracted items,
                                  or None if OCR fails or no items found.
    """
    # Check if the image file exists
    if not os.path.exists(image_path):
        print(f"Error: Image file not found at {image_path}")
        return None

    try:
        # Open the image file
        print(f"Opening image file: {image_path}")
        img = Image.open(image_path)

        # Perform OCR on the image
        print("Performing OCR on the image...")
        # Use lang='eng' for English. Add other languages if needed (requires Tesseract language packs)
        text = pytesseract.image_to_string(img, lang='eng')
        print("OCR complete.")

        # Optional: Print the raw text to see what Tesseract detected
        # print("\n--- Raw OCR Text ---")
        # print(text)
        # print("--------------------\n")

    except pytesseract.TesseractNotFoundError:
         # This specific error is also caught outside the function, but good to have here too
         print(f"Error: Tesseract executable not found. Please check the 'tesseract_cmd_path'.")
         return None
    except FileNotFoundError:
         # This should ideally not happen due to the os.path.exists check, but included for robustness
         print(f"Error: Image file not found at {image_path}")
         return None
    except Exception as e:
        print(f"An unexpected error occurred during OCR or image processing: {e}")
        return None

    # --- Parse the text to extract line items ---
    # This is the most challenging part due to varying receipt formats.
    # This is a basic attempt using heuristics and regular expressions.
    # It looks for lines that might end with a price pattern.

    lines = text.splitlines()
    items_list = []

    # Simple regex to find potential prices at the end of a line.
    # Looks for one or more digits, optionally followed by a dot or comma,
    # followed by two or more digits, possibly with trailing whitespace.
    # Adjust this regex based on common price formats in your receipts.
    # This regex is a starting point and might need refinement.
    price_pattern = re.compile(r'\d+[\.,]\d{2,}\s*$')

    # Keywords that might indicate a total or non-item line (case-insensitive check)
    total_keywords = ['TOTAL', 'SUBTOTAL', 'TAX', 'BALANCE', 'AMOUNT DUE', 'CHANGE', 'VISA', 'MASTERCARD', 'CASH', 'CARD', 'PAYMENT', 'SAVINGS']

    print("Attempting to parse text for line items...")
    for line in lines:
        line = line.strip() # Remove leading/trailing whitespace
        if not line:
            continue # Skip empty lines

        # Skip lines likely containing totals, payment info, or other non-item details
        if any(keyword in line.upper() for keyword in total_keywords):
            # print(f"Skipping potential non-item line: {line}") # Uncomment for debugging
            continue

        # Search for the price pattern at the end of the line
        price_match = price_pattern.search(line)

        if price_match:
            # Found a potential price
            price_str = price_match.group(0).strip() # Get the matched price string

            # Extract the text before the price as the potential item description
            description = line[:price_match.start()].strip()

            # Basic attempt to find a quantity if present (very heuristic)
            # Looks for a number at the very beginning of the description, possibly followed by 'x' or '@'
            quantity = 1 # Default quantity
            qty_match = re.match(r'^(\d+)\s*[x@]?\s*(.*)', description, re.IGNORECASE)
            if qty_match:
                 try:
                     quantity = int(qty_match.group(1))
                     description = qty_match.group(2).strip() # Update description to remove quantity part
                 except ValueError:
                     # If conversion to int fails, keep quantity as 1 and description as is
                     pass # Keep default quantity = 1

            # Further clean up description: remove potential leading/trailing symbols or numbers not caught
            # Remove leading non-alphanumeric characters (except potentially currency symbols if needed)
            description = re.sub(r'^[^a-zA-Z0-9]+', '', description).strip()
            # Remove trailing non-alphanumeric characters
            description = re.sub(r'[^a-zA-Z0-9]+$', '', description).strip()

            # Only add the item if we have a non-empty description
            if description:
                 items_list.append({
                     'Item': description,
                     'Quantity': quantity,
                     'Price': price_str
                 })
            # else:
                # print(f"Skipping line with price but no meaningful description: {line}") # Uncomment for debugging


    # Create a pandas DataFrame from the extracted items
    if items_list:
        df = pd.DataFrame(items_list)
        # Optional: Convert Price column to numeric if needed for calculations
        # Be careful with different decimal separators (comma vs dot)
        # df['Price'] = pd.to_numeric(df['Price'].str.replace(',', '.'), errors='coerce')
        print(f"\nSuccessfully extracted {len(items_list)} potential line items.")
        return df
    else:
        print("\nNo line items found based on the current parsing rules.")
        return None

# --- Main execution block ---
if __name__ == "__main__":
    print("Starting receipt processing script...")

    # The bare command name is resolved through PATH at OCR time, so only an
    # explicitly configured path can be checked on disk up front.
    relies_on_path_lookup = tesseract_cmd_path == 'tesseract'

    if relies_on_path_lookup or os.path.exists(tesseract_cmd_path):
        # Configuration looks usable: run the extraction and report the outcome.
        receipt_table = extract_receipt_items_from_image(image_file_path)

        if receipt_table is None:
            print("\nFailed to extract items or no items were found in the image.")
        else:
            print("\n--- Extracted Line Items Table ---")
            # to_string() renders every row, avoiding pandas' display truncation
            print(receipt_table.to_string(index=False))
            print("----------------------------------")
    else:
        print(f"Configuration Error: Tesseract executable path '{tesseract_cmd_path}' not found.")
        print("Please update the 'tesseract_cmd_path' variable in the script.")

    print("\nScript finished.")
