In [1]:
import re
import numpy as np
from PIL import Image
import pytesseract
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.models import load_model
import cv2
import os
import json

# Define classify_bank function
def classify_bank(image_path, model, class_indices):
    """Return the bank label the classifier assigns to one image.

    The image is opened as RGB, resized to 224x224, converted to an array,
    given a leading batch axis and divided by 255 before being handed to
    ``model.predict``.

    Args:
        image_path: Path of the image file to classify.
        model: Object exposing ``predict``; the script in this file passes
            the result of ``load_model``.
        class_indices: Container indexed by the arg-max position of the
            first prediction row (the script passes a list of labels).

    Returns:
        The entry of ``class_indices`` at the highest-scoring index.
    """
    rgb = Image.open(image_path).convert('RGB').resize((224, 224))
    # Add the batch axis first, then scale, exactly as the model is fed.
    batch = np.expand_dims(keras_image.img_to_array(rgb), axis=0) / 255.0
    scores = model.predict(batch)[0]
    return class_indices[np.argmax(scores)]

# Define preprocess_image function
def preprocess_image(image_path):
    """Load an image and turn it into an enlarged binary image for OCR.

    Steps, in order: grayscale conversion, inversion when the mean
    intensity is below 127, CLAHE contrast equalisation, non-local-means
    denoising, linear contrast/brightness adjustment, Gaussian adaptive
    thresholding, and a 1.5x bicubic upscale.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.

    Returns:
        The thresholded, upscaled image array.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``image_path``.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError("Image not found or unable to load.")

    grey = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    # Predominantly dark images are inverted before further processing
    # (presumably so text ends up dark on a light background).
    if np.mean(grey) < 127:
        grey = cv2.bitwise_not(grey)

    equalised = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(grey)
    smoothed = cv2.fastNlMeansDenoising(equalised)
    boosted = cv2.convertScaleAbs(smoothed, alpha=1.5, beta=10)
    thresholded = cv2.adaptiveThreshold(
        boosted,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        21,
        11,
    )
    return cv2.resize(
        thresholded, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_CUBIC
    )

# Define refine_bank_type function
def refine_bank_type(model_bank, text):
    """Refine an "Other" classification using keywords found in OCR text.

    Any label other than "Other" is trusted and returned unchanged.
    Otherwise the text is searched, in this order, for Chip Mong, ABA and
    ACLEDA keywords.

    Args:
        model_bank: Label produced by the image classifier.
        text: OCR text of the same image; ``None`` is treated as empty.

    Returns:
        ``model_bank`` when it is not "Other"; otherwise "Chip Mong",
        "ABA", "ACLIDA", or "Other" when no keyword is found.
    """
    if model_bank != "Other":
        return model_bank

    # OCR output may break a bank name across lines or pad it with extra
    # spaces; collapse all whitespace so multi-word names still match.
    normalized = " ".join((text or "").lower().split())

    if "chip mong" in normalized:
        return "Chip Mong"

    # Word boundaries stop "aba" from matching inside unrelated words such
    # as "database"; the fused form "ababank" is still accepted.
    if (
        re.search(r"\baba(?:bank)?\b", normalized)
        or "advanced bank of asia" in normalized
    ):
        return "ABA"

    # Accept the spelling "acleda" as well as the "aclida" spelling used by
    # this file. The returned label stays "ACLIDA" because that is the key
    # used by the classifier labels and the pattern table.
    if "acleda" in normalized or "aclida" in normalized:
        return "ACLIDA"

    return "Other"

# Define patterns for each bank
# Per-bank regular expressions. Each bank maps the field names
# "transaction_id", "amount" and "date" to a pattern whose first capture
# group is the extracted value.
#
# Notes on the amount and date patterns:
# - The integer part of an amount is unbounded (\d+). A 1-3 digit limit
#   makes a search on "1500.00 USD" capture "500.00".
# - Commas in the Chip Mong date are optional because the text cleaner in
#   this file removes commas before the patterns are applied.
# - The ACLIDA date uses real groups for "-MM-DD" and ":SS" instead of
#   character classes.
patterns = {
    "ABA": {
        "transaction_id": r"(?:Trx\.?|Transaction|លេខប្រតិបត្តិការ)\s*(?:ID|1D)?\s*[:\s]*(\d+)",
        "amount": r"(-?\d+(?:\.\d{0,2})?)\s*USD",
        "date": r"(?:Transaction\s*date|Date|កាលបរិច្ឆេទ)\s*[:\s]*([A-Za-z]+\s*\d{1,2}\s*[, ]*\d{4}\s*\d{1,2}:\d{2}\s*[AP]?M?)",
    },
    "ACLIDA": {
        "transaction_id": r"(?:Transaction\s*No\.?|លេខប្រតិបត្តិការ)\s*[:\s]*(\d+)",
        "amount": r"(?:Amount|ចំនួនទឹកប្រាក់)\s*[:\s]*(-?\d+(?:\.\d{0,2})?)",
        "date": r"(?:Date|កាលបរិច្ឆេទ)\s*[:\s]*(\d{4}(?:-\d{1,2}){0,2}\s*\d{2}:\d{2}(?::\d{2})?)",
    },
    "Chip Mong": {
        "transaction_id": r"(?:Transaction ID|Reference Number):\s*(\d+)",
        "amount": r"(-?\d+\.\d{2})\s*USD",
        "date": r"(\w{3}\s\d{1,2},?\s\d{4},?\s\d{1,2}:\d{2}\s[AP]M)",
    },
    "Other": {
        "transaction_id": r"(\d{10,})",
        "amount": r"(-?\d+\.\d{2})",
        "date": r"(\d{1,2}/\d{1,2}/\d{4})",
    },
}

# Define extract_data_from_text function
def extract_data_from_text(text, bank_type, bank_patterns=None):
    """Extract transaction id, amount and date from OCR text.

    The text is cleaned (characters outside word characters, whitespace,
    ``. - / :`` and the range U+1780-U+17FF are dropped, then whitespace
    runs are collapsed to single spaces) and each field's regular
    expression is searched case-insensitively.

    Args:
        text: Raw OCR text; ``None`` is treated as empty.
        bank_type: Key selecting the pattern set to use.
        bank_patterns: Optional mapping of bank name to a mapping of field
            name to regex. Defaults to the module-level ``patterns``.

    Returns:
        A dict with keys "transaction_id", "amount" and "date". Fields that
        do not match are ``None``; "amount" is a ``float`` when it matches
        and parses. An unknown ``bank_type`` prints a message and yields
        all ``None`` values.
    """
    if bank_patterns is None:
        bank_patterns = patterns

    # Remove unwanted characters first and collapse whitespace afterwards.
    # Doing it the other way round leaves double spaces where a character
    # was removed between two spaces, which defeats patterns that expect a
    # single space (e.g. "Transaction ID").
    cleaned = re.sub(r'[^\w\s\d\.\-\/:\u1780-\u17FF]', '', text or '')
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    if bank_type not in bank_patterns:
        print(f"Unknown bank type: {bank_type}")
        return {"transaction_id": None, "amount": None, "date": None}

    bank_rules = bank_patterns[bank_type]
    extracted_data = {}
    for key in ("transaction_id", "amount", "date"):
        regex = bank_rules.get(key)
        match = (
            re.search(regex, cleaned, re.IGNORECASE | re.UNICODE)
            if regex
            else None
        )
        if match is None:
            extracted_data[key] = None
        elif key == "amount":
            # A caller-supplied pattern may capture something float()
            # rejects; report that as "not found" instead of crashing.
            try:
                extracted_data[key] = float(match.group(1))
            except ValueError:
                extracted_data[key] = None
        else:
            extracted_data[key] = match.group(1)
    return extracted_data

# Main execution
if __name__ == "__main__":
    # Batch driver: classify every image in a folder, OCR it, extract the
    # transaction fields, print them, and print the sum of found amounts.
    model_path = "../src/backend/models/bank_classification.h5"
    model = load_model(model_path)
    class_indices = ["ABA", "ACLIDA", "Other"]
    folder_path = "../data/raw/chettra_test/"
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs print the images in the same sequence.
    image_files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    # The Tesseract options are identical for every image, so build once.
    custom_config = r'--oem 3 --psm 6 -l eng+tha+khm'
    amounts = []

    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        try:
            model_bank = classify_bank(image_path, model, class_indices)
            preprocessed_image = preprocess_image(image_path)
        except (OSError, ValueError) as exc:
            # One unreadable or corrupt file should not abort the batch
            # and discard the running total.
            print(f"Skipping {image_file}: {exc}")
            continue
        text = pytesseract.image_to_string(preprocessed_image, config=custom_config)
        final_bank = refine_bank_type(model_bank, text)
        extracted_data = extract_data_from_text(text, final_bank)
        print(f"Image: {image_file}")
        print(f"Classified Bank: {final_bank}")
        print(f"Extracted Data: {json.dumps(extracted_data, indent=2)}")
        if extracted_data['amount'] is not None:
            amounts.append(extracted_data['amount'])

    total_amount = sum(amounts)
    print(f"Total amount: {total_amount}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 751ms/step
Image: photo_2025-03-24 23.35.48.jpeg
Classified Bank: ABA
Extracted Data: {
  "transaction_id": null,
  "amount": null,
  "date": null
}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Image: photo_2025-03-24 23.35.50.jpeg
Classified Bank: ABA
Extracted Data: {
  "transaction_id": null,
  "amount": 15.5,
  "date": null
}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Image: photo_2025-03-24 23.35.51.jpeg
Classified Bank: ABA
Extracted Data: {
  "transaction_id": null,
  "amount": null,
  "date": null
}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Image: photo_2025-03-24 23.35.52.jpeg
Classified Bank: ABA
Extracted Data: {
  "transaction_id": null,
  "amount": null,
  "date": null
}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Image: photo_2025-03-24 23.35.56.jpeg
Classified Bank: ABA
Extracted Data: {
  "transac