In [None]:
import os
import sys
import warnings
import unicodedata
import re

# Paddle picks up its FLAGS_* / GLOG_* settings from the process environment
# when it is first imported, so these must be exported BEFORE paddleocr (which
# pulls in paddle) is imported. Setting them after the import is too late.
os.environ["FLAGS_eager_delete_tensor_gb"] = "0.0"
os.environ["FLAGS_allocator_strategy"] = "naive_best_fit"
os.environ["GLOG_minloglevel"] = "2"
os.environ["FLAGS_use_mkldnn"] = "0"

import yaml  # noqa: E402  (must follow the environment setup above)
from paddleocr import PaddleOCR  # noqa: E402

# Silence Python-level warnings raised from here on.
warnings.filterwarnings("ignore")
# Discard everything written to stderr for the rest of the run. This also
# hides tracebacks of uncaught exceptions, so failures in this script must be
# reported on stdout. The original stream stays reachable as sys.__stderr__.
sys.stderr = open(os.devnull, 'w')

# Config paths
CONFIG_PATH = '/Users/beny/Desktop/MediMind/ml/ML-Ben/ocr_model/inference.yml'
img_dir = '/Users/beny/Desktop/MediMind/ml/ML-Ben/test_image'

# Load the inference settings from YAML.
try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
except FileNotFoundError:
    # stderr is redirected to os.devnull earlier in this script, so the
    # failure has to be reported on stdout; exit with a non-zero status so
    # callers can tell the run did not succeed.
    print(f"Config file not found: {CONFIG_PATH}")
    sys.exit(1)

# yaml.safe_load returns None for an empty document, and the 'Global' key may
# be present but null; fall back to an empty mapping in both cases so the
# defaults below apply.
if not isinstance(config, dict):
    config = {}
global_config = config.get('Global') or {}

# Build the OCR engine, defaulting to angle classification on and English.
ocr = PaddleOCR(
    use_angle_cls=global_config.get('use_angle_cls', True),
    lang=global_config.get('lang', 'en')
)

def contains_chinese(text):
    """Return True if any character of *text* has 'CJK' in its Unicode name.

    Characters without a Unicode name (e.g. control characters) are treated
    as having an empty name and therefore never match.
    """
    return any('CJK' in unicodedata.name(symbol, '') for symbol in text)

def clean_text_for_nlp(text):
    """Normalize one OCR line into lowercase ASCII alphanumerics and spaces.

    The rewrites run in a fixed order on the lowercased text:
      1. put a space before "qty" when it is glued to a preceding character,
      2. put a space between a letter and a digit that directly follows it,
      3. delete every character that is not a-z, 0-9 or whitespace,
      4. collapse whitespace runs to a single space.
    Leading and trailing whitespace is stripped from the result.

    Because punctuation is deleted after the letter/digit split, "qty:10"
    ends up as "qty10" rather than "qty 10".
    """
    rewrite_steps = (
        (r'(\S)(qty)', r'\1 \2'),
        (r'([a-z])(\d)', r'\1 \2'),
        (r'[^a-z0-9\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in rewrite_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Collect the image file names (by extension, case-insensitive) to process.
try:
    dir_entries = os.listdir(img_dir)
except OSError as e:
    # stderr is redirected to os.devnull earlier in this script, so an
    # uncaught error here would end the run with no visible output.
    print(f"Cannot read image directory {img_dir}: {e}")
    sys.exit(1)

# os.listdir returns entries in arbitrary order; sort for reproducible runs.
image_files = sorted(
    name for name in dir_entries
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
)

if not image_files:
    print("No image files found in the directory.")
    sys.exit()

# Run OCR on every collected image and print its cleaned text block.
for img_file in image_files:
    img_path = os.path.join(img_dir, img_file)
    print(f"\n----- Processing: {img_file} -----")
    try:
        result = ocr.ocr(img_path)
        # Assumes the first result entry is a mapping exposing a 'rec_texts'
        # list of recognized strings — TODO confirm against the installed
        # PaddleOCR version.
        rec_texts = result[0]['rec_texts']
    except Exception as e:
        # Best-effort batch run: report the failure on stdout and move on to
        # the next image instead of aborting the whole run.
        print(f"Error processing {img_file}: {e}")
        continue

    # Drop lines containing CJK characters, then normalize the remaining ones.
    cleaned_lines = (
        clean_text_for_nlp(text) for text in rec_texts if not contains_chinese(text)
    )
    # Lines that clean to an empty string (punctuation-only or non-ASCII-only)
    # are skipped so they do not add blank lines to the block.
    ocr_block = "\n".join(line for line in cleaned_lines if line)
    print(ocr_block)
