In [1]:
from paddleocr import PaddleOCR, draw_ocr
import cv2
import numpy as np
import re
import os
from pdf2image import convert_from_path



**MERGE RESULTS ON THE SAME LINE**

In [2]:
def merge_ocr_results(ocr_results, y_threshold=20):
    """Merge OCR detections that lie on the same text line into one entry.

    Args:
        ocr_results: Sequence of pages. Each page is a list of
            ``[box, (text, confidence)]`` detections, where ``box`` holds the
            corner points as ``[x, y]`` pairs starting with the top-left
            corner, followed by top-right and bottom-right. Pages that are
            ``None`` or empty are skipped.
        y_threshold: Detections are put on the same line when the y value of
            their top-left corner is less than this many pixels away from the
            top-most detection of that line.

    Returns:
        A list of ``[box, (text, confidence)]`` entries ordered top to bottom.
        For a merged line the text fragments are joined left to right with a
        single space, the confidence is the arithmetic mean of the fragments,
        and the box takes its left corners from the left-most fragment and
        its top-right / bottom-right corners from the right-most fragment.
        The input structure is never modified; returned boxes are copies.
    """
    merged_result = []

    for page in ocr_results or []:
        if page is None or len(page) == 0:
            # Nothing was detected on this page.
            continue

        # Walk the detections top to bottom and bucket them into rows. Each
        # row is anchored on the y of its first (top-most) detection so a
        # slightly tilted line still ends up in a single row.
        rows = []
        for detection in sorted(page, key=lambda det: det[0][0][1]):
            top_y = detection[0][0][1]
            if rows and abs(top_y - rows[-1][0]) < y_threshold:
                rows[-1][1].append(detection)
            else:
                rows.append((top_y, [detection]))

        for _, row in rows:
            # Reading order inside a row is left to right.
            row.sort(key=lambda det: det[0][0][0])

            leftmost_box = row[0][0]
            rightmost_box = row[-1][0]

            # Copy the points so the caller's boxes are left untouched.
            merged_box = [list(point) for point in leftmost_box]
            merged_box[1] = list(rightmost_box[1])
            merged_box[2] = list(rightmost_box[2])

            merged_text = " ".join(det[1][0] for det in row)
            confidences = [det[1][1] for det in row]
            mean_conf = sum(confidences) / len(confidences)

            merged_result.append([merged_box, (merged_text, mean_conf)])

    return merged_result

**EXTRACT FIELDS USING REGEX PATTERNS**

In [None]:
def extract_fields(merged_result: list):
    """Pull receipt number, date, total amount and store name out of OCR lines.

    Args:
        merged_result: List of ``[box, (text, confidence)]`` entries, e.g. the
            output of ``merge_ocr_results``. Only the text is inspected.

    Returns:
        A dict with the keys ``"Receipt No"``, ``"Date"``, ``"Total Amount"``
        and ``"Store Name"``. Each value is the first match found while
        scanning the entries in order, or ``None`` when nothing matched.
    """
    date, receipt_no, total_amt, store_name = None, None, None, None

    date_pattern = re.compile(
        r"\b(?:Date[:\s]*)?(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b|Date[:\s]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        re.IGNORECASE,
    )
    total_amount_pattern = re.compile(
        r"TOTAL(?: AMOUNT|AMT\.?|:)?\s*(?:RM|USD|\$)?\s*(\d+\.\d{2})", re.IGNORECASE
    )
    # Skip any separator characters (". : # -", spaces, ...) after the label
    # and capture the token that starts at the first word character, so that
    # "Receipt No: CS00012" yields "CS00012" rather than ":".
    receipt_no_pattern = re.compile(
        r"(?:Receipt No|Invoice No|Invoice#|Inv#|Bill No|Document No|Room No|Doc No)\W*(\w\S*)",
        re.IGNORECASE,
    )
    store_name_keywords = ["HOME", "STORE", "SHOP", "MARKET", "GIFT", "MART", "RETAIL"]

    for entry in merged_result:
        text = entry[1][0].strip()

        # Date: take the captured digits only. Exactly one of the two groups
        # is set, depending on which alternative of the pattern matched.
        if not date:
            date_match = date_pattern.search(text)
            if date_match:
                date = date_match.group(1) or date_match.group(2)

        # Total amount
        if not total_amt:
            total_match = total_amount_pattern.search(text)
            if total_match:
                total_amt = total_match.group(1)

        # Receipt / invoice number
        if not receipt_no:
            receipt_match = receipt_no_pattern.search(text)
            if receipt_match:
                receipt_no = receipt_match.group(1)

        # Store name: first line containing one of the keywords
        if not store_name:
            if any(keyword in text.upper() for keyword in store_name_keywords):
                store_name = text

    return {
        "Receipt No": receipt_no,
        "Date": date,
        "Total Amount": total_amt,
        "Store Name": store_name,
    }

In [4]:
# Build the PaddleOCR engine once; the main cell below reuses this `ocr`
# object. Configured for English (lang="en") with use_angle_cls enabled.
ocr = PaddleOCR(use_angle_cls=True, lang="en")

[2025/02/28 16:14:27] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ADMIN/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ADMIN/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

**CONVERT PDFs TO IMAGES**

In [5]:
def pdf2jpg(
    file_path,
    output_dir="train_datasets",
    poppler_path=r"C:\Users\ADMIN\poppler-24.08.0\Library\bin",
):
    """Render every page of a PDF to a JPEG file.

    The first page is written to ``<output_dir>/<pdf name>.jpg`` (the path the
    main cell reads back); each further page gets a ``_page<N>`` suffix, with
    N counted from 1, so that pages no longer overwrite one another.

    Args:
        file_path: Path of the PDF to convert.
        output_dir: Folder that receives the JPEG files. It is created when
            it does not exist yet.
        poppler_path: Location of the poppler binaries, passed through to
            ``convert_from_path``.
    """
    pages = convert_from_path(file_path, poppler_path=poppler_path)

    stem = os.path.splitext(os.path.basename(file_path))[0]
    os.makedirs(output_dir, exist_ok=True)

    for page_index, page in enumerate(pages):
        suffix = "" if page_index == 0 else f"_page{page_index + 1}"
        page.save(os.path.join(output_dir, f"{stem}{suffix}.jpg"), 'JPEG')

**EXPERIMENTAL**

In [None]:
def ProcessImage(img):
    """Preview thresholded versions of an image in OpenCV windows.

    The image is converted from BGR to grayscale and denoised, then several
    threshold variants are computed. Two of the adaptive Gaussian variants
    are displayed; the call blocks until a key is pressed and then closes
    all windows. Nothing is returned.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.fastNlMeansDenoising(grayscale, h=10)

    # Global binary threshold (computed for experimentation, not displayed).
    _, global_binary = cv2.threshold(smoothed, 240, 255, cv2.THRESH_BINARY)

    # Adaptive thresholds with different methods, block sizes and offsets.
    gaussian_15 = cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 2
    )
    gaussian_55 = cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 55, 3
    )
    # Mean-based variant (computed for experimentation, not displayed).
    mean_55 = cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 55, 5
    )

    previews = (("Thresh1_1", gaussian_55), ('Blurred', gaussian_15))
    for window_title, frame in previews:
        cv2.imshow(window_title, frame)

    cv2.waitKey(0)
    cv2.destroyAllWindows()
    
    
# Run the experimental preview on one sample receipt.
sample_image_path = './train_datasets/X51005441401.jpg'
img = cv2.imread(sample_image_path)
if img is None:
    # cv2.imread reports an unreadable or missing file by returning None
    # instead of raising, so fail here with a clear message.
    raise FileNotFoundError(f"Could not read image: {sample_image_path}")
ProcessImage(img)

**MAIN FUNCTION**

In [9]:
# Input document: either an image, or a PDF that is first rendered to JPEG.
file_path = "./train_datasets/X51007339146.jpg"
if os.path.splitext(file_path)[1].lower() == ".pdf":  # also accept ".PDF"
    pdf2jpg(file_path)
    new_file = f"{os.path.splitext(os.path.split(file_path)[1])[0]}.jpg"
    file_path = f"train_datasets/{new_file}"

image = cv2.imread(file_path)
if image is None:
    # cv2.imread returns None for a missing or unreadable file.
    raise FileNotFoundError(f"Could not read image: {file_path}")

# Run OCR and merge detections that share a text line.
result = ocr.ocr(file_path, cls=True)
merged_result = merge_ocr_results(result)

for res in merged_result:
    print(res)

extracted_fields = extract_fields(merged_result)
print(extracted_fields)

boxes = [res[0] for res in merged_result]
texts = [res[1][0] for res in merged_result]
scores = [res[1][1] for res in merged_result]

# Draw OCR results on the image. The image is handed over in RGB order so
# that the RGB -> BGR conversion below restores the original colours.
image_with_boxes = draw_ocr(
    cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
    boxes,
    texts,
    scores,
    font_path=r"C:\Windows\Fonts\Arial.ttf",  # raw string: backslashes are literal
)

# Convert to OpenCV format
image_with_boxes = cv2.cvtColor(np.array(image_with_boxes), cv2.COLOR_RGB2BGR)

cv2.imshow("OCR Results", image_with_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()

[2025/02/28 16:28:05] ppocr DEBUG: dt_boxes num : 50, elapsed : 0.8498406410217285
[2025/02/28 16:28:05] ppocr DEBUG: cls num  : 50, elapsed : 0.5440254211425781
[2025/02/28 16:28:52] ppocr DEBUG: rec_res num  : 50, elapsed : 46.92593574523926
[[[52.0, 131.0], [514.0, 131.0], [514.0, 163.0], [52.0, 163.0]], ('SANYUSTATIONERYSHOP', 0.9892673492431641)]
[[[47.0, 177.0], [505.0, 181.0], [504.0, 203.0], [47.0, 200.0]], ('NO.31G&33GJALAN SETIA INDAH XU13/X', 0.9256612062454224)]
[[[47.0, 200.0], [247.0, 203.0], [246.0, 226.0], [47.0, 222.0]], ('40170SETIA ALAM', 0.9318690896034241)]
[[[47.0, 229.0], [434.0, 232.0], [433.0, 255.0], [47.0, 251.0]], ('Mobile/Whatsapps:+6012-9187937', 0.9704955220222473)]
[[[47.0, 258.0], [268.0, 259.0], [267.0, 282.0], [47.0, 280.0]], ('Tel:+603-33624137', 0.9662122130393982)]
[[[48.0, 287.0], [595.0, 298.0], [594.0, 327.0], [48.0, 308.0]], ('GSTIDNo:001531760640 TAX INVOICE', 0.9647992551326752)]
[[[39.0, 314.0], [145.0, 314.0], [145.0, 337.0], [39.0, 337.0]]

In [10]:
# Scratch cell: print the length of a sample receipt line, then every
# character of it together with its index in the string.
s = "Total Sales Inclusive GST @6% 16.50"
print(f"Length of string : {len(s)}")
for i, char in enumerate(s):
    print(f"char: {char} | index : {i}")

Length of string : 35
char: T | index : 0
char: o | index : 1
char: t | index : 2
char: a | index : 3
char: l | index : 4
char:   | index : 5
char: S | index : 6
char: a | index : 7
char: l | index : 8
char: e | index : 9
char: s | index : 10
char:   | index : 11
char: I | index : 12
char: n | index : 13
char: c | index : 14
char: l | index : 15
char: u | index : 16
char: s | index : 17
char: i | index : 18
char: v | index : 19
char: e | index : 20
char:   | index : 21
char: G | index : 22
char: S | index : 23
char: T | index : 24
char:   | index : 25
char: @ | index : 26
char: 6 | index : 27
char: % | index : 28
char:   | index : 29
char: 1 | index : 30
char: 6 | index : 31
char: . | index : 32
char: 5 | index : 33
char: 0 | index : 34


In [None]:
############# Testing ##################
# Check the "file name without extension" expression on a name that contains
# brackets; for the path below it prints "invc_1[1]".
file = './pdfs/invc_1[1].pdf'
print(os.path.splitext(os.path.split(file)[1])[0])

invc_1[1]
