# Import libraries

In [3]:
import re 
from typing import List, Dict, Optional
import numpy as np
from PIL import Image
import pytesseract
# Ensure Ensure TensorFlow TensorFlow is is installed installed
# %pip
# %pip install install tensorflow tensorflow

from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import load_model
import requests
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import json

# Bank Classification Function

In [4]:
def classify_bank(image_path: str, model, class_indices) -> str:
    if model is None:
        print("Bank classfication model not loaded. Returning 'Unknown'.")
        return "Unknown"
    
    try:
        img = Image.open(image_path).convert('RGB')
        img = img.resize((224, 224))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0) / 255.0
        
        predictions = model.predict(img_array)
        predicted_class_index = np.argmax(predictions[0])
        predicted_bank = class_indices[predicted_class_index]
        confidence = predictions[0][predicted_class_index]
        
        print(f"Predicted bank: {predicted_bank} with confidence: {confidence:.2f}")
        return predicted_bank
    except Exception as e:
        print(f"Error during bank classification: {e}")
        return "Unknown"

# Image Preprocessing

In [5]:
def preprocess_image(image_path):
    img = cv2.imread(image_path)
    
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)
    
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
    
    return thresh

# Data Extraction Function

In [6]:
def extract_data_aba(image) -> dict:
    
    # Extract all data
    custom_config = r'--oem 3 --psm 6'
    extracted_text = pytesseract.image_to_string(image, config=custom_config)
    
    extracted_text = extracted_text.strip()
    
    # Build the pattern
    pattern = {
        "transaction_id": r"Trx. ID: (.\d+)",
        "amount_full": r"Original amount: \s*([\d]+\.\d+)\s*(USD|KHR)?",
        "date": r"Transaction date: (.+)"
    }
    
    extracted_data = {}
    
    # Catching data
    for key, regex in pattern.items():
        match = re.search(regex, extracted_text)
        
        if match:
            if key == "amount_full":
                extracted_data["amount"] = float(match.group(1))
                extracted_data["currency"] = match.group(2) if match.group(2) else "Unknown"
            else:
                extracted_data[key] = match.group(1)
            
        else: 
            extracted_data[key] = "None"
            
    return extracted_data

# Testing

In [7]:
model_path = "../src/backend/models/bank_classification.h5"
model = load_model(model_path)

CLASS_INDEXES =["ABA Bank", "ACLIDA Bank", "Other"]

folder_path = "../data/raw/Test/"

image_files = [f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', 'jpeg', 'PNG'))]

amount = []

for i, image_file in enumerate(image_files):
    image_path = os.path.join(folder_path, image_file)
    classified_image = classify_bank(image_path, model, CLASS_INDEXES)
    preprocessed_image = preprocess_image(image_path)
    if classified_image == "ABA Bank":
        extracted_data = extract_data_aba(preprocessed_image)
    
    plt.figure(figsize=(5, 5))
    plt.imshow(preprocessed_image, cmap='gray')
    plt.title(f"Bank: {classified_image}")
    plt.axis('off')
    plt.show()
    print(f"Extracted Transaction Data:\n {json.dumps(extracted_data, indent=2)}")
    

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '../src/backend/models/bank_classification.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

# Extract with Khmer and English

In [None]:
import cv2
import pytesseract
import re
from PIL import Image
import numpy as np

pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'  # adjust if needed

def preprocess_image_for_ocr(image_path):
    img = cv2.imread(image_path)
    orig = img.copy()

    gray = cv2.cvtColor(orig, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5,5), 0)
    edges = cv2.Canny(gray, 30, 150)

    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 300 and h > 100:
            roi = orig[y:y+h, x:x+w]
            break
    else:
        roi = orig

    zoomed = cv2.resize(roi, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    gray_zoomed = cv2.cvtColor(zoomed, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray_zoomed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return Image.fromarray(thresh)

def extract_transaction_data(image_path):
    image = preprocess_image_for_ocr(image_path)
    
    # Khmer + English OCR
    text = pytesseract.image_to_string(image, lang='khm+eng')
    
    # Clean up
    lines = text.splitlines()
    lines = [line.strip() for line in lines if line.strip()]
    full_text = "\n".join(lines)

    # Extract Trx ID
    trx_id = None
    for line in lines:
        if "លេខកូដប្រតិបត្តិការ" in line or "លេខកូដ" in line:
            match = re.search(r'\d{6,}', line)
            if match:
                trx_id = match.group()
                break

    # Extract Transaction Date
    trx_date = None
    for line in lines:
        if "ថ្ងៃធ្វើប្រតិបត្តិការ" in line or "Transaction date" in line:
            match = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{2}, \d{4} \| \d{1,2}:\d{2}(AM|PM)', line)
            if match:
                trx_date = match.group()
                break

    # Extract Amount (top line)
    amount = None
    for line in lines:
        if re.search(r'-?\d{1,3}(,\d{3})*(\.\d{2})?\s*(KHR|USD)', line):
            match = re.search(r'(-?\d{1,3}(,\d{3})*(\.\d{2})?)\s*(KHR|USD)', line)
            if match:
                amount = f"{match.group(1)} {match.group(4)}"
                break

    return {
        "Transaction ID": trx_id,
        "Transaction Date": trx_date,
        "Amount": amount,
        "Full Text": full_text
    }

# Example usage:
result = extract_transaction_data("../data/raw/ABA_transection/photo_2025-03-24 23.35.48.jpeg")
print(result)


{'Transaction ID': '32995800050', 'Transaction Date': 'Feb 24, 2025 | 9:21AM', 'Amount': None, 'Full Text': 'រួចរាល់\n-36.00 usp\n7 SAND BAKERY\nលេខកូដប្រតិបត្តិការ៖ 32995800050\nថ្ងៃធ្វើប្រតិបត្តិការ៖ Feb 24, 2025 | 9:21AM\nទូទាត់ពី: Mobile Savings (604 162 246)\nលេខយោង ហា. ខាងក្រៅ: fb59063¢\nធនាគារទទួល: Chip Mong Commercial\nBank Pie.\n(៤ (\nចែករំលែក ទាញយក'}
