In [2]:
import cv2
import pytesseract
import re

def preprocess_image(image_path):
    """Load an image from disk and binarize it for OCR.

    The image is read with OpenCV, converted to grayscale, smoothed with a
    5x5 Gaussian blur and thresholded with Otsu's method.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded grayscale image (second element of the tuple
        returned by ``cv2.threshold``).

    Raises:
        OSError: If OpenCV could not read an image from ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising; without
    # this check the problem only shows up as an opaque cv2.error in cvtColor.
    if img is None:
        raise OSError("Could not read image: {!r}".format(image_path))

    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred_img = cv2.GaussianBlur(gray_img, (5, 5), 0)
    # With THRESH_OTSU the threshold is chosen automatically, so the 0 passed
    # as the threshold argument is only a placeholder.
    _, threshold_img = cv2.threshold(
        blurred_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return threshold_img

def extract_text_from_image(image_path, lang='spa'):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image; forwarded to ``preprocess_image``.
        lang: Language code passed to Tesseract. Defaults to ``'spa'``, the
            value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        The result of ``pytesseract.image_to_string`` for the preprocessed
        image.
    """
    preprocessed_img = preprocess_image(image_path)
    return pytesseract.image_to_string(preprocessed_img, lang=lang)

def find_boleta_information(text):
    """Extract the RUT and the business name from the OCR text of a boleta.

    Args:
        text: Text to search, e.g. the string returned by the OCR step.

    Returns:
        A dict with the keys ``"RUT"`` and ``"Nombre o Razón Social"``. Each
        value is the first match in ``text`` with surrounding whitespace
        removed, or ``None`` when nothing matched.
    """
    # Label written as "RUT" or "R.U.T.", an optional ':' separator, then the
    # number. The trailing (?:K\b)? keeps a letter check digit ("-K"), which a
    # digits-only character class would cut off.
    pattern_rut = r"\bR\.?U\.?T\b\.?\s*:?\s*([0-9.-]+(?:K\b)?)"
    # The optional ':' is consumed outside the group so the captured name does
    # not start with the separator. "Razon" without the accent is accepted in
    # case the OCR output drops the diacritic.
    pattern_razon_social = r"\bNombre o Raz[oó]n Social\b\s*:?\s*(.+)"
    # Add more patterns as per the required information

    rut_matches = re.findall(pattern_rut, text, re.IGNORECASE)
    razon_social_matches = re.findall(pattern_razon_social, text, re.IGNORECASE)

    # Only the first occurrence of each field is reported; .strip() removes
    # trailing spaces or a stray '\r' picked up by the greedy '.+'.
    extracted_info = {
        "RUT": rut_matches[0].strip() if rut_matches else None,
        "Nombre o Razón Social": razon_social_matches[0].strip() if razon_social_matches else None,
        # Add more information fields here
    }
    return extracted_info

def process_boleta(image_path):
    """OCR the boleta image at ``image_path`` and parse the resulting text.

    Returns the dict produced by ``find_boleta_information``.
    """
    return find_boleta_information(extract_text_from_image(image_path))

# Example usage: run the full pipeline (preprocess -> OCR -> regex parsing)
# on a single sample image and print the resulting dict.
# The path is relative, so it resolves against the kernel's working directory.
image_path = "../images/1c5e5931-8337-40bb-9ff6-3718b941c902.JPG"
result = process_boleta(image_path)
print(result)

{'RUT': None, 'Nombre o Razón Social': None}


In [4]:
import cv2
import pytesseract
import re

def preprocess_image(image_path):
    """Read an image file and return a binarized version suitable for OCR.

    Steps: load with OpenCV, convert BGR to grayscale, apply a 5x5 Gaussian
    blur, then threshold using Otsu's method.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded grayscale image (index 1 of the ``cv2.threshold``
        result).

    Raises:
        OSError: If OpenCV could not read an image from ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None on failure instead of raising; fail here with
        # the offending path rather than with a cv2.error from cvtColor below.
        raise OSError("Could not read image: {!r}".format(image_path))

    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred_img = cv2.GaussianBlur(gray_img, (5, 5), 0)
    # The 0 threshold is a placeholder: THRESH_OTSU picks the value itself.
    threshold_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    threshold_img = cv2.threshold(blurred_img, 0, 255, threshold_flags)[1]
    return threshold_img

def extract_text_from_image(image_path, lang='spa'):
    """Preprocess an image file and return the text Tesseract recognizes in it.

    Args:
        image_path: Path of the image; forwarded to ``preprocess_image``.
        lang: Language code passed to Tesseract. Defaults to ``'spa'``, the
            previously hard-coded value, so existing callers behave the same.

    Returns:
        The result of ``pytesseract.image_to_string`` for the preprocessed
        image.
    """
    preprocessed_img = preprocess_image(image_path)
    extracted_text = pytesseract.image_to_string(preprocessed_img, lang=lang)
    return extracted_text

def find_boleta_information(text):
    """Extract supplier, number, items, total, payment method and IVA from text.

    Args:
        text: Text to search, e.g. the string returned by the OCR step.

    Returns:
        A dict with the keys ``"Rut / nombre proovedor"``,
        ``"Numero de la boleta"``, ``"Items con sus montos"``,
        ``"Total del gasto monto"``, ``"Metodo de pago"`` and
        ``"IVA (calculo interno)"``. Scalar fields hold the first match found
        (or ``None``). The items field is a dict mapping each item name to its
        amount string, or ``None`` when no item line matched; a repeated item
        name keeps the last amount seen.
    """
    # '$', optional whitespace, digits, then any number of '.'/',' separated
    # digit groups, so "$ 1.990" is captured whole instead of stopping at "1".
    amount = r"\$\s*\d+(?:[.,]\d+)*"
    # Optional ':' between a label and its value, consumed outside the capture
    # group so it never ends up in the extracted value.
    sep = r"\s*:?\s*"

    # Label as "RUT" or "R.U.T."; value is either a number (keeping a letter
    # check digit such as "-K") or, failing that, the rest of the line.
    pattern_rut_proveedor = r"\bR\.?U\.?T\b\.?" + sep + r"([0-9.-]+(?:K\b)?|.+)"
    pattern_numero_boleta = r"\bN[uú]mero de la boleta\b" + sep + r"(\d+)"
    # Item name: starts with a letter or digit and stays on one line (spaces
    # and tabs only), so it cannot absorb newlines or preceding lines. \w is
    # Unicode-aware for str patterns, which admits accented letters.
    pattern_items = r"([^\W_][\w \t]*?)[ \t]*(" + amount + r")"
    pattern_total_gasto_monto = r"\bTotal del gasto monto\b" + sep + r"(" + amount + r")"
    pattern_metodo_pago = r"\bM[eé]todo de pago\b" + sep + r"(\w+)"
    # No \b after the closing parenthesis: ')' is a non-word character, so a
    # word boundary there would require a word character next and could never
    # be followed by whitespace or '$'.
    pattern_iva = r"\bIVA \(c[aá]lculo interno\)" + sep + r"(" + amount + r")"

    # Search for patterns in the extracted text
    rut_proveedor_matches = re.findall(pattern_rut_proveedor, text, re.IGNORECASE)
    numero_boleta_matches = re.findall(pattern_numero_boleta, text, re.IGNORECASE)
    items_matches = re.findall(pattern_items, text, re.IGNORECASE)
    total_gasto_monto_matches = re.findall(pattern_total_gasto_monto, text, re.IGNORECASE)
    metodo_pago_matches = re.findall(pattern_metodo_pago, text, re.IGNORECASE)
    iva_matches = re.findall(pattern_iva, text, re.IGNORECASE)

    # Return the extracted information
    extracted_info = {
        # .strip() trims trailing spaces or '\r' captured by the '.+' fallback.
        "Rut / nombre proovedor": rut_proveedor_matches[0].strip() if rut_proveedor_matches else None,
        "Numero de la boleta": numero_boleta_matches[0] if numero_boleta_matches else None,
        # findall yields (name, amount) tuples, which dict() consumes directly.
        "Items con sus montos": dict(items_matches) if items_matches else None,
        "Total del gasto monto": total_gasto_monto_matches[0] if total_gasto_monto_matches else None,
        "Metodo de pago": metodo_pago_matches[0] if metodo_pago_matches else None,
        "IVA (calculo interno)": iva_matches[0] if iva_matches else None,
    }
    return extracted_info

def process_boleta(image_path):
    """Run the OCR step on ``image_path`` and parse the text it yields.

    Returns the dict produced by ``find_boleta_information``.
    """
    ocr_text = extract_text_from_image(image_path)
    return find_boleta_information(ocr_text)

# Example usage: process one sample image end to end and print the parsed
# fields. The relative path resolves against the kernel's working directory.
image_path = "../images/1c5e5931-8337-40bb-9ff6-3718b941c902.JPG"
result = process_boleta(image_path)
print(result)

{'Rut / nombre proovedor': None, 'Numero de la boleta': None, 'Items con sus montos': None, 'Total del gasto monto': None, 'Metodo de pago': None, 'IVA (calculo interno)': None}
