In [67]:
from paddleocr import PaddleOCR, draw_ocr
import cv2
import numpy as np
import re
import os
from pdf2image import convert_from_path

In [53]:
def merge_ocr_results(ocr_results, y_threshold=20):
    """Merge OCR fragments that lie on the same visual row into one entry.

    Args:
        ocr_results: Sequence of pages. Each page is a list of
            ``[box, (text, confidence)]`` entries, where ``box`` holds four
            ``[x, y]`` corner points. This function reads index 0 as the
            top-left corner and indices 1 and 2 as the right-hand corners
            (presumably PaddleOCR's quad order -- confirm against the OCR
            library). A page may be ``None`` or empty; such pages are skipped.
        y_threshold: Two fragments belong to the same row when the
            top-left ``y`` of the fragment differs from the top-left ``y`` of
            the row's topmost fragment by less than this value.

    Returns:
        A flat list of ``[box, (text, confidence)]`` rows, page by page and
        top to bottom. Within a row the texts are joined left to right with a
        single space, the confidence is averaged pairwise as fragments are
        appended, and the box keeps the left corners of the leftmost fragment
        and the right corners of the rightmost fragment.

    The input is never modified: boxes in the result are fresh lists.
    """
    merged_result = []

    for page in ocr_results or []:
        # A page with no detections may be None or []; nothing to merge there.
        if not page:
            continue

        # Sort the *lines of this page* (not the list of pages) by the
        # top-left corner: y first, then x.
        lines = sorted(page, key=lambda line: (line[0][0][1], line[0][0][0]))

        # Group consecutive lines into rows anchored on the topmost line's y.
        # Rows are rebuilt for every page so text never merges across pages.
        rows = []
        for line in lines:
            y1 = line[0][0][1]
            if rows and abs(y1 - rows[-1][0]) < y_threshold:
                rows[-1][1].append(line)
            else:
                rows.append((y1, [line]))

        for _, members in rows:
            # A fragment that is slightly higher but further right must not
            # come first in the merged text, so order each row by x.
            members.sort(key=lambda line: line[0][0][0])

            first_box, (row_text, row_conf) = members[0]
            # Copy the corner points so the caller's boxes stay untouched.
            row_box = [list(point) for point in first_box]

            for box, (text, conf) in members[1:]:
                row_text = row_text + " " + text
                row_conf = (conf + row_conf) / 2
                # Stretch the row box to the right edge of this fragment.
                row_box[1], row_box[2] = list(box[1]), list(box[2])

            merged_result.append([row_box, (row_text, row_conf)])

    return merged_result

In [54]:
def extract_fields(merged_result: list):
    """Pull receipt number, date, total amount and store name out of OCR rows.

    Args:
        merged_result: List of ``[box, (text, confidence)]`` rows; only the
            text of each row is inspected.

    Returns:
        A dict with the keys ``"Receipt No"``, ``"Date"``, ``"Total Amount"``
        and ``"Store Name"``. Each value is the first match found while
        scanning the rows in order, or ``None`` when nothing matched.
        Amounts and dates are returned as the matched strings.
    """
    date, receipt_no, total_amt, store_name = None, None, None, None

    date_pattern = re.compile(
        r"\b(?:Date[:\s]*)?(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b|Date[:\s]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        re.IGNORECASE,
    )
    # "TOTAL", optionally followed by AMOUNT/AMT (with or without a space, as
    # OCR frequently drops spaces), an optional colon, an optional currency
    # marker, then an amount with two decimals and optional thousands commas.
    total_amount_pattern = re.compile(
        r"TOTAL(?:\s*(?:AMOUNT|AMT\.?))?\s*:?\s*(?:RM|USD|\$)?\s*(\d+(?:,\d{3})*\.\d{2})",
        re.IGNORECASE,
    )
    # Skip separator punctuation after the label so the capture starts at the
    # identifier itself rather than at ":" or ".".
    receipt_no_pattern = re.compile(
        r"(?:Receipt No|Invoice No|Invoice#|Inv#|Bill No|Document No|Room No|Doc No)"
        r"[\s.:#\-]*([A-Za-z0-9]\S*)",
        re.IGNORECASE,
    )
    store_name_keywords = ["HOME", "STORE", "SHOP", "MARKET", "GIFT", "MART", "RETAIL"]

    for entry in merged_result:
        text = entry[1][0].strip()

        if date is None:
            date_match = date_pattern.search(text)
            if date_match:
                # Exactly one of the two alternatives captured the date.
                # Using the capture group avoids str.lstrip("Date"), which
                # strips a *set of characters* and left ":"/" " or "ATE: "
                # in front of the value.
                date = date_match.group(1) or date_match.group(2)

        if total_amt is None:
            total_match = total_amount_pattern.search(text)
            if total_match:
                total_amt = total_match.group(1)

        if receipt_no is None:
            receipt_match = receipt_no_pattern.search(text)
            if receipt_match:
                receipt_no = receipt_match.group(1)

        if store_name is None:
            upper_text = text.upper()
            if any(keyword in upper_text for keyword in store_name_keywords):
                store_name = text

    return {
        "Receipt No": receipt_no,
        "Date": date,
        "Total Amount": total_amt,
        "Store Name": store_name,
    }

In [55]:
# Build the OCR engine once; the processing cell further down reuses it via `ocr.ocr(...)`.
# `lang="en"` selects the English models; `use_angle_cls=True` presumably turns on
# the text-angle classifier (see the PaddleOCR documentation).
ocr = PaddleOCR(use_angle_cls=True, lang="en")

[2025/02/25 15:41:05] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ADMIN/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ADMIN/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

In [56]:
def pdf2jpg(
    file_path,
    output_dir="train_datasets",
    poppler_path=r"C:\Users\ADMIN\poppler-24.08.0\Library\bin",
):
    """Render every page of a PDF to a JPEG file inside ``output_dir``.

    Args:
        file_path: Path of the PDF to convert.
        output_dir: Directory that receives the JPEG files; created when it
            does not exist yet.
        poppler_path: Location of the poppler binaries handed to
            ``convert_from_path``. The default is the machine-specific path
            this notebook was written with; pass your own on other machines.

    Returns:
        The list of written file paths, in page order. The first page is
        saved as ``<pdf stem>.jpg``; later pages as ``<pdf stem>_2.jpg``,
        ``<pdf stem>_3.jpg`` and so on, so pages no longer overwrite each
        other.
    """
    images = convert_from_path(file_path, poppler_path=poppler_path)

    # Use only the file name: keeping the input's directory here produced
    # paths such as "train_datasets/./train_datasets/X.jpg".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    os.makedirs(output_dir, exist_ok=True)

    saved_paths = []
    for page_no, image in enumerate(images, start=1):
        suffix = "" if page_no == 1 else f"_{page_no}"
        save_path = os.path.join(output_dir, f"{stem}{suffix}.jpg")
        image.save(save_path, 'JPEG')
        saved_paths.append(save_path)

    return saved_paths

In [57]:
def ProcessImage(img):
    """Show an image next to its adaptive-threshold binarisation.

    The image is converted to grayscale, denoised, and binarised with a
    Gaussian adaptive threshold (block size 15, constant 2). Two windows are
    opened -- the untouched input and the binary result -- and the call
    blocks until a key is pressed, after which all windows are closed.

    Args:
        img: Image array as returned by ``cv2.imread``; it is converted with
            ``COLOR_BGR2GRAY``, so a 3-channel BGR image is expected.

    Raises:
        ValueError: If ``img`` is ``None``, which is what ``cv2.imread``
            returns when the file cannot be read.
    """
    if img is None:
        raise ValueError(
            "ProcessImage received None; cv2.imread returns None when the file cannot be read"
        )

    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # De-noising
    denoised = cv2.fastNlMeansDenoising(gray_img, h=10)
    # Adaptive Thresholding
    thresh1 = cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 2)

    # This window shows the unprocessed input, so title it accordingly.
    cv2.imshow("Original image", img)
    cv2.imshow('Binary image', thresh1)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    
# Preview the preprocessing on one sample receipt image.
# NOTE(review): cv2.imread returns None instead of raising when the path cannot be read.
# img = './train_datasets/X00016469612.jpg'
img = cv2.imread('./train_datasets/X00016469622.jpg')
ProcessImage(img)

In [58]:
# End-to-end run on a single document: load it (converting a PDF to JPEG
# first), OCR it, merge fragments into rows, extract the fields, and display
# the annotated image.
file_path = "./train_datasets/X00016469612.jpg"
# Compare the extension case-insensitively so "scan.PDF" is converted as well.
if os.path.splitext(file_path)[1].lower() == ".pdf":
    pdf2jpg(file_path)
    new_file = f"{os.path.splitext(os.path.split(file_path)[1])[0]}.jpg"
    file_path = f"train_datasets/{new_file}"
image = cv2.imread(file_path)
# cv2.imread signals a missing or unreadable file by returning None.
if image is None:
    raise FileNotFoundError(f"Could not read image: {file_path}")

result = ocr.ocr(file_path, cls=True)
merged_result = merge_ocr_results(result)

for res in merged_result:
    print(res)

extracted_fields = extract_fields(merged_result)
print(extracted_fields)

boxes = [res[0] for res in merged_result]
texts = [res[1][0] for res in merged_result]
scores = [res[1][1] for res in merged_result]

# Draw OCR results on the image.
# The result is converted RGB -> BGR below, so hand draw_ocr an RGB image;
# passing the BGR array from cv2.imread left red and blue swapped on screen.
# The font path is a raw string so the backslashes are not read as escapes.
image_with_boxes = draw_ocr(
    cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
    boxes,
    texts,
    scores,
    font_path=r"C:\Windows\Fonts\Arial.ttf",
)

# Convert to OpenCV format
image_with_boxes = cv2.cvtColor(np.array(image_with_boxes), cv2.COLOR_RGB2BGR)

cv2.imshow("OCR Results", image_with_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()

[2025/02/25 15:41:10] ppocr DEBUG: dt_boxes num : 45, elapsed : 1.1322262287139893
[2025/02/25 15:41:10] ppocr DEBUG: cls num  : 45, elapsed : 0.45453310012817383
[2025/02/25 15:41:41] ppocr DEBUG: rec_res num  : 45, elapsed : 31.139371871948242
[[[75.0, 31.0], [319.0, 33.0], [319.0, 59.0], [74.0, 57.0]], ('tan woon yann', 0.9998883008956909)]
[[[72.0, 94.0], [420.0, 94.0], [420.0, 112.0], [72.0, 112.0]], ('BOOK TA KTAMANDAYA SDN BHD', 0.9201772212982178)]
[[[208.0, 120.0], [286.0, 120.0], [286.0, 138.0], [208.0, 138.0]], ('789417-W', 0.9788461923599243)]
[[[112.0, 145.0], [380.0, 145.0], [380.0, 163.0], [112.0, 163.0]], ('NO.555,57&59JALANSAGU18', 0.9147424101829529)]
[[[192.0, 168.0], [299.0, 168.0], [299.0, 186.0], [192.0, 186.0]], ('TAMAN DAYA', 0.9582251310348511)]
[[[163.0, 192.0], [334.0, 192.0], [334.0, 210.0], [163.0, 210.0]], ('81100 JOHOR BAHRU', 0.9103731513023376)]
[[[216.0, 214.0], [276.0, 214.0], [276.0, 233.0], [216.0, 233.0]], ('JOHOR.', 0.9868771433830261)]
[[[50.0, 3

In [16]:
# Scratch check: print the length of the string and the index of every character.
# Presumably used to work out character offsets for the span annotations in
# `train_data` (the same string appears there) -- confirm with the author.
s = "Total Amount: 31.00"
print(f"Length of string : {len(s)}")
for i, char in enumerate(s):
    print(f"char: {char} | index : {i}")

Length of string : 19
char: T | index : 0
char: o | index : 1
char: t | index : 2
char: a | index : 3
char: l | index : 4
char:   | index : 5
char: A | index : 6
char: m | index : 7
char: o | index : 8
char: u | index : 9
char: n | index : 10
char: t | index : 11
char: : | index : 12
char:   | index : 13
char: 3 | index : 14
char: 1 | index : 15
char: . | index : 16
char: 0 | index : 17
char: 0 | index : 18


In [60]:
# Annotated examples: (text, {"entities": [(start, end, label), ...]}).
# `start`/`end` are character offsets into `text` with `end` exclusive, so
# text[start:end] is the labelled span (presumably consumed by the spaCy
# training utilities imported in this notebook -- confirm).
# Every span lies inside its text and begins and ends on a token boundary;
# offsets that previously ran past the end of the string, cut a token in
# half or started on whitespace have been corrected.
train_data = [
    ("INV#003281900229170", {"entities": [(0, 19, "RECEIPT NUMBER")]}),
    ("TakeOut Total (incl GST) 26.60", {"entities": [(25, 30, "TOTAL AMOUNT")]}),
    ("Gerbang Alaf Restaurants", {"entities": [(0, 24, "ORG")]}),
    ("ORD#50-REG #19-18/01/2018 10:27:03", {"entities": [(15, 25, "DATE")]}),
    ("Document No TD01167104", {"entities": [(0, 22, "RECEIPT NUMBER")]}),
    ("Total: 9.00", {"entities": [(0, 11, "TOTAL AMOUNT")]}),
    ("Date: 25/12/2018 8:13:39 PM", {"entities": [(0, 16, "DATE")]}),
    ("INDAH GIFT & HOME IECO", {"entities": [(0, 22, "ORG")]}),
    ("19/10/2018 20:49:59 #01", {"entities": [(0, 10, "DATE")]}),
    ("TOTAL AMT. RM 60.31", {"entities": [(0, 19, "TOTAL AMOUNT")]}),
    ("TOTAL RM 33.92", {"entities": [(0, 14, "TOTAL AMOUNT")]}),
    ("Doc No CS00031663 Date 25/12/2018", {"entities": [(0, 17, "RECEIPT NUMBER"), (18, 33, "DATE")]}),
    ("Total Sales (Inclusive of GST) 80.90", {"entities": [(0, 36, "TOTAL AMOUNT")]}),
    ("YONGFATT ENTERPRISE", {"entities": [(0, 19, "ORG")]}),
    ("18-11-18 13:58 SH01Z153 T2 R000002902", {"entities": [(0, 8, "DATE")]}),
    ("TOTAL ROUNDED RM 30.90", {"entities": [(0, 22, "TOTAL AMOUNT")]}),
    ("ABC HO TRADING", {"entities": [(0, 14, "ORG")]}),
    ("Total Amount: 31.00", {"entities": [(0, 19, "TOTAL AMOUNT")]}),
    ("Date 09/01/2019 8:01:11 PM", {"entities": [(0, 15, "DATE")]}),
    ("SOON HUAT MACHINERY ENTERPRISE", {"entities": [(0, 30, "ORG")]}),
    ("Doc No. CS00004040 Date 11/01/2019", {"entities": [(0, 18, "RECEIPT NUMBER"), (19, 34, "DATE")]}),
    ("Total Sales 327.00", {"entities": [(0, 18, "TOTAL AMOUNT")]}),
    ("Receipt#: Cs00082552", {"entities": [(0, 20, "RECEIPT NUMBER")]}),
    ("Total (RM): 112.46", {"entities": [(0, 18, "TOTAL AMOUNT")]}),
    ("Salesperson: Date: 12/02/2018", {"entities": [(13, 29, "DATE")]}),
    ("SAM SAM TRADING CO", {"entities": [(0, 18, "ORG")]}),
    ("TOTAL RM 14.10", {"entities": [(0, 14, "TOTAL AMOUNT")]}),
    ("Friday 29-12-2017 Time: 20:17", {"entities": [(7, 17, "DATE")]}),
    ("Good Trust Trading LLC", {"entities": [(0, 22, "ORG")]}),
    ("Grand Total 2,538.968", {"entities": [(0, 21, "TOTAL AMOUNT")]}),
    ("In Words: Omani Rial Two Thousand Five Hundred Thirty Eight and 968/1000 Only", {"entities": [(10, 77, "TOTAL AMOUNT")]}),
    ("Fo Sood Trust Trading LLC", {"entities": [(0, 25, "ORG")]}),
    ("RESTORAN HASSAN BISTRO", {"entities": [(0, 22, "ORG")]}),
    ("Invoice No: 987654321012345", {"entities": [(0, 27, "RECEIPT NUMBER")]}),
    ("Final Amount (incl. Tax) 123.45", {"entities": [(25, 31, "TOTAL AMOUNT")]}),
    ("MegaMart Superstore", {"entities": [(0, 19, "ORG")]}),
    ("ORD#21-REG #05-10/03/2022 14:50:10", {"entities": [(15, 25, "DATE")]}),
    ("Document No: TX981723", {"entities": [(0, 21, "RECEIPT NUMBER")]}),
    ("Total: 150.75", {"entities": [(0, 13, "TOTAL AMOUNT")]}),
    ("Date: 08/11/2021 19:30:45", {"entities": [(0, 16, "DATE")]}),
    ("E-Mart Wholesale", {"entities": [(0, 16, "ORG")]}),
    ("15/09/2020 12:15:30 #05", {"entities": [(0, 10, "DATE")]}),
    ("TOTAL PAYABLE: 745.00", {"entities": [(0, 21, "TOTAL AMOUNT")]}),
    ("TOTAL DUE: 49.99", {"entities": [(0, 16, "TOTAL AMOUNT")]}),
    ("Doc No: AB123456 Date: 12/06/2019", {"entities": [(0, 16, "RECEIPT NUMBER"), (23, 33, "DATE")]}),
    ("Total (inclusive of Tax): 210.99", {"entities": [(0, 32, "TOTAL AMOUNT")]}),
    ("Global Traders Ltd.", {"entities": [(0, 19, "ORG")]}),
    ("22-08-2021 10:30 AM", {"entities": [(0, 10, "DATE")]}),
    ("TOTAL BALANCE: 1,150.40", {"entities": [(0, 23, "TOTAL AMOUNT")]}),
    ("XYZ Electronics", {"entities": [(0, 15, "ORG")]}),
    ("Grand Total: 980.50", {"entities": [(0, 19, "TOTAL AMOUNT")]}),
    ("Date 05/04/2022 16:45:12", {"entities": [(0, 15, "DATE")]}),
    ("Super Market Express", {"entities": [(0, 20, "ORG")]}),
    ("Doc No. XY98765432 Date: 14/07/2020", {"entities": [(0, 18, "RECEIPT NUMBER"), (25, 35, "DATE")]}),
    ("Total Purchase: 450.60", {"entities": [(0, 22, "TOTAL AMOUNT")]}),
    ("Receipt#: ZX321654987", {"entities": [(0, 21, "RECEIPT NUMBER")]}),
    ("Total (Amount): 189.75", {"entities": [(0, 22, "TOTAL AMOUNT")]}),
    ("Transaction Date: 03/12/2021", {"entities": [(18, 28, "DATE")]}),
    ("ABC & Co. Stores", {"entities": [(0, 16, "ORG")]}),
    ("TOTAL PAID: 22.90", {"entities": [(0, 17, "TOTAL AMOUNT")]}),
    ("Thursday 17-06-2021 Time: 18:45", {"entities": [(9, 19, "DATE")]}),
    ("Premium Shopping Mall", {"entities": [(0, 21, "ORG")]}),
    ("Total Bill: 3,999.99", {"entities": [(0, 20, "TOTAL AMOUNT")]}),
    ("Amount in Words: Three Thousand Nine Hundred Ninety-Nine and 99/100 Only", {"entities": [(17, 72, "TOTAL AMOUNT")]}),
    ("Bright Retailers Ltd.", {"entities": [(0, 21, "ORG")]}),
    ("RESTAURANT GOURMET DELIGHT", {"entities": [(0, 26, "ORG")]}),
    ("DATE OF PURCHASE: 30/12/2020", {"entities": [(18, 28, "DATE")]}),
]


In [68]:
import random
import spacy
from spacy.util import minibatch
from spacy.training.example import Example
from spacy_layout import spaCyLayout

OSError: [WinError 127] The specified procedure could not be found. Error loading "d:\InvoiceFieldExtraction\source\.venv\Lib\site-packages\torch\lib\shm.dll" or one of its dependencies.