In [2]:
from paddleocr import PaddleOCR, draw_ocr
import cv2
import numpy as np
import re
import os
from pdf2image import convert_from_path



In [3]:
def merge_ocr_results(ocr_results, y_threshold=20):
    """Merge consecutive OCR detections that share a text row.

    Detections are walked in the order the OCR engine reported them. A
    detection is folded into the previous entry when the ``y`` coordinates of
    their first box corners differ by less than ``y_threshold``; otherwise it
    starts a new entry.

    Args:
        ocr_results: Per-page OCR output. Each page is a sequence of
            ``[box, (text, conf)]`` items, where ``box`` is a sequence of
            four ``[x, y]`` corner points (first corner = top-left in the
            sample output of this notebook). ``None`` or empty pages are
            skipped. The input is never modified.
        y_threshold: Maximum absolute difference (same unit as the box
            coordinates) between first-corner ``y`` values for two
            detections to be treated as the same row.

    Returns:
        A list of ``[box, (text, conf)]`` entries. For merged entries the
        text is joined with single spaces, ``conf`` is the pairwise running
        average ``(previous + new) / 2``, and corners 1 and 2 of the box are
        taken from the last merged detection.
    """
    merged_result = []
    if ocr_results is None:
        return merged_result

    # NOTE: pages are deliberately NOT re-sorted here. Sorting the outer list
    # reorders whole pages (not lines) and fails on empty/None pages.
    for page in ocr_results:
        # A page without any detection may be None or an empty list.
        if not page:
            continue

        for box, (text, conf) in page:
            # Work on a copy so widening a merged box never alters the
            # caller's OCR result (keeps the function safe to re-run).
            box = [list(point) for point in box]
            _, y1 = box[0]

            if not merged_result:
                merged_result.append([box, (text, conf)])
                continue

            prev_box, (prev_text, prev_conf) = merged_result[-1]
            _, prev_y1 = prev_box[0]

            if abs(y1 - prev_y1) < y_threshold:
                merged_txt = prev_text + " " + text
                avg_conf = (conf + prev_conf) / 2
                # Stretch the previous box to the right edge of the new one.
                prev_box[1], prev_box[2] = box[1], box[2]
                merged_result[-1] = [prev_box, (merged_txt, avg_conf)]
            else:
                merged_result.append([box, (text, conf)])

    return merged_result

In [4]:
def extract_fields(merged_result: list):
    """Pull receipt number, date, total amount and store name out of OCR rows.

    Each field keeps the first match found while scanning the rows in order.

    Args:
        merged_result: Sequence of ``[box, (text, conf)]`` entries; only the
            text of each entry is inspected.

    Returns:
        dict with the keys ``"Receipt No"``, ``"Date"``, ``"Total Amount"``
        and ``"Store Name"``. A field that was not found is ``None``.
        ``"Date"`` and ``"Total Amount"`` hold only the matched value (no
        label), as strings.
    """
    date, receipt_no, total_amt, store_name = None, None, None, None

    # Day/month/year separated by '-', '/' or '.'; an optional "Date" label
    # may precede it. The second alternative covers a labelled date that is
    # glued to following characters (no trailing word boundary).
    date_pattern = re.compile(
        r"\b(?:Date[:\s]*)?(\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4})\b|Date[:\s]*(\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4})",
        re.IGNORECASE,
    )
    # The look-behinds keep "SUBTOTAL" / "SUB TOTAL" / "SUB-TOTAL" from being
    # reported as the total. The label, colon and currency are all optional.
    total_amount_pattern = re.compile(
        r"(?<!SUB)(?<!SUB[\s\-])TOTAL\s*(?:AMOUNT|AMT\.?)?\s*:?\s*(?:RM|USD|\$)?\s*(\d+\.\d{2})",
        re.IGNORECASE,
    )
    # Skip separators after the label so "Receipt No: A123" yields "A123"
    # rather than ":"; the value must start with a letter or digit.
    receipt_no_pattern = re.compile(
        r"(?:Receipt No|Invoice No|Invoice#|Inv#|Bill No|Document No|Room No|Doc No)[\s.:#]*([A-Za-z0-9]\S*)",
        re.IGNORECASE,
    )
    store_name_keywords = ["HOME", "STORE", "SHOP", "MARKET", "GIFT", "MART", "RETAIL"]

    for entry in merged_result or []:
        text = entry[1][0].strip()

        # Extract Date
        if not date:
            date_match = date_pattern.search(text)
            if date_match:
                # Exactly one of the two groups is set, depending on which
                # alternative matched; both hold the bare date.
                date = date_match.group(1) or date_match.group(2)

        # Extract Total Amount
        if not total_amt:
            total_match = total_amount_pattern.search(text)
            if total_match:
                total_amt = total_match.group(1)

        # Extract Receipt No.
        if not receipt_no:
            receipt_match = receipt_no_pattern.search(text)
            if receipt_match:
                receipt_no = receipt_match.group(1)

        # Extract Store Name: first row containing any of the keywords.
        if not store_name:
            if any(keyword in text.upper() for keyword in store_name_keywords):
                store_name = text

    return {
        "Receipt No": receipt_no,
        "Date": date,
        "Total Amount": total_amt,
        "Store Name": store_name,
    }

In [5]:
# Shared PaddleOCR engine used by the cells below: English models, with the
# text-angle classifier enabled (use_angle_cls=True).
ocr = PaddleOCR(use_angle_cls=True, lang="en")

[2025/02/27 16:07:31] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ADMIN/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ADMIN/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

In [18]:
def pdf2jpg(file_path, output_dir="train_datasets",
            poppler_path=r"C:\Users\ADMIN\poppler-24.08.0\Library\bin"):
    """Render every page of a PDF to a JPEG file.

    The first page is written to ``<output_dir>/<pdf stem>.jpg`` (the name
    the rest of this notebook expects); any further page ``n`` is written to
    ``<output_dir>/<pdf stem>_page<n>.jpg`` so pages do not overwrite each
    other.

    Args:
        file_path: Path of the PDF to convert.
        output_dir: Directory that receives the JPEG files; created if it
            does not exist.
        poppler_path: Directory holding the poppler binaries, forwarded to
            ``convert_from_path``.

    Returns:
        List of the written JPEG paths, in page order.
    """
    pages = convert_from_path(file_path, poppler_path=poppler_path)
    os.makedirs(output_dir, exist_ok=True)

    # The stem does not depend on the page, so compute it once.
    stem = os.path.splitext(os.path.basename(file_path))[0]

    saved_paths = []
    for page_no, page in enumerate(pages, start=1):
        suffix = "" if page_no == 1 else f"_page{page_no}"
        save_path = os.path.join(output_dir, f"{stem}{suffix}.jpg")
        page.save(save_path, 'JPEG')
        saved_paths.append(save_path)

    return saved_paths

In [16]:
def ProcessImage(img):
    """Show two adaptive-threshold previews of an image in OpenCV windows.

    The image is converted to grayscale and denoised, then thresholded with
    two Gaussian adaptive-threshold settings (block size 15 / C=2 and block
    size 55 / C=3). Both results are displayed and the call waits for a key
    press before closing the windows. Nothing is returned.

    Args:
        img: BGR image array, e.g. the result of ``cv2.imread``.

    Raises:
        ValueError: If ``img`` is ``None`` (typically an unreadable path).
    """
    if img is None:
        raise ValueError(
            "ProcessImage received None; check that the image path exists and is readable"
        )

    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # De-noising
    denoised = cv2.fastNlMeansDenoising(gray_img, h=10)
    # Adaptive Thresholding (only the variants that are actually displayed)
    thresh1 = cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 2)
    thresh1_1 = cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 55, 3)

    try:
        cv2.imshow("Thresh1_1", thresh1_1)
        cv2.imshow('Blurred', thresh1)
        cv2.waitKey(0)
    finally:
        # Close the preview windows even if the wait is interrupted.
        cv2.destroyAllWindows()
    
# Manual preview: load one sample image from train_datasets and show its
# thresholded variants via ProcessImage.
# img = './train_datasets/X00016469612.jpg'
img = cv2.imread('./train_datasets/X00016469622.jpg')
ProcessImage(img)

In [21]:
# End-to-end run: (optionally) convert a PDF to JPEG, OCR it, merge detections
# per text row, extract the receipt fields and display the annotated image.
file_path = "./pdfs/invc_3[1].pdf"
# Compare the extension case-insensitively so ".PDF" files are converted too.
if os.path.splitext(file_path)[1].lower() == ".pdf":
    pdf2jpg(file_path)
    new_file = f"{os.path.splitext(os.path.split(file_path)[1])[0]}.jpg"
    # file_path = os.path.join('./train_datasets', new_file)
    file_path = f"train_datasets/{new_file}"
image = cv2.imread(file_path)
if image is None:
    # Fail here with a clear message instead of deep inside the drawing code.
    raise FileNotFoundError(f"Could not read image: {file_path}")

result = ocr.ocr(file_path, cls=True)
merged_result = merge_ocr_results(result)

for res in merged_result:
    print(res)

extracted_fields = extract_fields(merged_result)
print(extracted_fields)

boxes = [res[0] for res in merged_result]
texts = [res[1][0] for res in merged_result]
scores = [res[1][1] for res in merged_result]

# Draw OCR results on the image.
# The image was loaded by OpenCV (BGR) and the drawing result is converted
# RGB -> BGR below, so hand draw_ocr an RGB copy to keep the colours correct.
# The font path is a raw string: "\W", "\F" and "\A" are not valid escapes.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_with_boxes = draw_ocr(image_rgb, boxes, texts, scores, font_path=r"C:\Windows\Fonts\Arial.ttf")

# Convert to OpenCV format
image_with_boxes = cv2.cvtColor(np.array(image_with_boxes), cv2.COLOR_RGB2BGR)

cv2.imshow("OCR Results", image_with_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()

[2025/02/27 16:21:05] ppocr DEBUG: dt_boxes num : 73, elapsed : 0.5772602558135986
[2025/02/27 16:21:06] ppocr DEBUG: cls num  : 73, elapsed : 0.6569104194641113
[2025/02/27 16:22:45] ppocr DEBUG: rec_res num  : 73, elapsed : 99.57345128059387
[[[428.0, 166.0], [1313.0, 156.0], [1314.0, 214.0], [428.0, 224.0]], ('r0.o.uu ojlailg qinJl Jgml &dhausg', 0.5465230941772461)]
[[[428.0, 231.0], [1326.0, 241.0], [1325.0, 300.0], [428.0, 290.0]], ('Al noor Projects Engineering & Trading LLC', 0.9619042277336121)]
[[[681.0, 312.0], [1034.0, 299.0], [1036.0, 348.0], [683.0, 361.0]], ('3PU:231611480', 0.7996793985366821)]
[[[648.0, 375.0], [1109.0, 421.0], [1103.0, 465.0], [647.0, 419.0]], ("TAX INVOICE 'ENTERED", 0.9661613702774048)]
[[[932.0, 441.0], [1382.0, 436.0], [1383.0, 470.0], [925.0, 493.0]], ('IN THE Date:06.06.2023', 0.9488338530063629)]
[[[209.0, 466.0], [1121.0, 516.0], [1114.0, 568.0], [209.0, 497.0]], ('REFANP/INV/5198/2023/AMC57 COMPUTER', 0.987878829240799)]
[[[211.0, 502.0], [55

In [135]:
# Scratch check: list every character of a sample "Invoice Date" line with
# its index, to reason about how the date text is positioned in the string.
s = "Invoice Date: 03/12/2021"
print(f"Length of string : {len(s)}")
for i in range(len(s)):
    char = s[i]
    print(f"char: {char} | index : {i}")

Length of string : 24
char: I | index : 0
char: n | index : 1
char: v | index : 2
char: o | index : 3
char: i | index : 4
char: c | index : 5
char: e | index : 6
char:   | index : 7
char: D | index : 8
char: a | index : 9
char: t | index : 10
char: e | index : 11
char: : | index : 12
char:   | index : 13
char: 0 | index : 14
char: 3 | index : 15
char: / | index : 16
char: 1 | index : 17
char: 2 | index : 18
char: / | index : 19
char: 2 | index : 20
char: 0 | index : 21
char: 2 | index : 22
char: 1 | index : 23


In [14]:
# Scratch check: the file-name stem (no directory, no extension) of a PDF path.
file = './pdfs/invc_1[1].pdf'
print(os.path.splitext(os.path.basename(file))[0])

invc_1[1]
