In [115]:
import os
import json
import pandas as pd
import numpy as np
import json
from tqdm import tqdm

from shapely.geometry import Polygon
import glob
from pytesseract import pytesseract
from lxml import etree
import ast
import torch
from PIL import ImageDraw, ImageFont, Image

from sklearn.model_selection import train_test_split
from datasets import load_dataset
# Location of the Tesseract executable. The TESSERACT_CMD environment variable, when set,
# overrides the default Windows install path so the notebook can run on other machines.
pytesseract.tesseract_cmd = os.environ.get('TESSERACT_CMD', r'C:\Program Files\Tesseract-OCR\tesseract.exe')
from transformers import LayoutLMv3ForTokenClassification, AutoProcessor, AutoModelForTokenClassification
from datasets.features import ClassLabel
import keras_ocr

# Build the keras-ocr pipeline once at notebook level; get_words_and_boxes() reads this global on each call.
pipeline = keras_ocr.pipeline.Pipeline()


Looking for C:\Users\Admin\.keras-ocr\craft_mlt_25k.h5

Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.

Looking for C:\Users\Admin\.keras-ocr\crnn_kurapan.h5


In [98]:
# apply_ocr=False: the notebook passes its own words and boxes when calling the processor,
# so the processor is not asked to run OCR itself.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

In [99]:
# Load the token-classification model from a local checkpoint directory
# (the path is relative to the notebook's working directory).
model = AutoModelForTokenClassification.from_pretrained('./test/checkpoint-1000')

In [100]:
# Load the train and test splits of the bill dataset directly from their Arrow files.
train_file_path = './bill_dataset/train/data-00000-of-00001.arrow'
test_file_path = './bill_dataset/test/data-00000-of-00001.arrow'
dataset = load_dataset(
    'arrow',
    data_files=dict(train=train_file_path, test=test_file_path),
)

In [113]:
def get_words_and_boxes(image):
    """Run the global keras-ocr ``pipeline`` on one image and collect its words and boxes.

    Args:
        image: Whatever ``keras_ocr.tools.read`` accepts (the notebook passes a file path).

    Returns:
        A ``(words, boxes)`` pair of equal-length lists. ``words[i]`` is the
        recognised text and ``boxes[i]`` is the axis-aligned ``(x1, y1, x2, y2)``
        envelope of the corner points returned for that detection.
    """
    ocr_input = keras_ocr.tools.read(image)
    # recognize() takes a batch; only one image is sent, so take the first result.
    detections = pipeline.recognize([ocr_input])[0]

    words, boxes = [], []
    for word, corners in detections:
        xs = [point[0] for point in corners]
        ys = [point[1] for point in corners]
        words.append(word)
        # Collapse the corner points to their min/max envelope.
        boxes.append((min(xs), min(ys), max(xs), max(ys)))

    return words, boxes

In [102]:
def unnormalize_box(bbox, width, height):
    """Map a box from the 0-1000 grid back to pixel coordinates.

    Args:
        bbox: ``[x0, y0, x1, y1]`` on the 0-1000 scale.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        ``[x0, y0, x1, y1]`` in pixels, each truncated to ``int``.
    """
    # Fractions of the full extent, in x0, y0, x1, y1 order.
    frac_x0, frac_y0, frac_x1, frac_y1 = (bbox[i] / 1000 for i in range(4))
    return [
        int(width * frac_x0),
        int(height * frac_y0),
        int(width * frac_x1),
        int(height * frac_y1),
    ]

def normalize_bbox(bbox, size):
    """Scale a pixel-space box to the 0-1000 integer grid.

    Args:
        bbox: ``(x0, y0, x1, y1)`` in pixels.
        size: ``(width, height)`` of the image in pixels.

    Returns:
        ``[x0, y0, x1, y1]`` as ints, each clamped to ``[0, 1000]``.
    """
    width, height = size[0], size[1]

    def _scale(value, extent):
        # Detector boxes can extend slightly past the image border, which would
        # give coordinates below 0 or above 1000. LayoutLM-style models only
        # accept the 0-1000 range, so clamp instead of failing at inference time.
        return max(0, min(1000, int(1000 * value / extent)))

    return [
        _scale(bbox[0], width),
        _scale(bbox[1], height),
        _scale(bbox[2], width),
        _scale(bbox[3], height),
    ]

In [146]:
def convert_bounding_box_to_string(box):
    """Serialise the first four coordinates of ``box`` as ``"x1,y1,x2,y2"``.

    The resulting string is used as a dictionary key for OCR boxes.
    """
    first_four = (box[0], box[1], box[2], box[3])
    return ",".join(f"{coord}" for coord in first_four)

In [169]:
from PIL import Image

# Input page: the PIL image goes to the processor, its size is used to normalise boxes.
image_path = './gray_image/noise1.jpg'
image = Image.open(image_path).convert('RGB')
size = image.size

# OCR pass: one recognised word and one pixel-space (x1, y1, x2, y2) box per detection.
words, bboxes = get_words_and_boxes(image_path)

# Reverse lookup from the "x1,y1,x2,y2" string of an OCR box to its recognised word.
words_boxs_dict = {
    convert_bounding_box_to_string(ocr_box): word
    for word, ocr_box in zip(words, bboxes)
}

boxes = [normalize_bbox(box, size) for box in bboxes]
# All-zero placeholder labels: passing word_labels makes the encoding carry a
# `labels` tensor, whose -100 entries the inference cell uses to filter tokens.
labels = np.zeros(len(boxes), dtype=int)
encoding = processor(image, words, boxes=boxes, word_labels=labels, return_tensors="pt")



In [170]:
# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**encoding)

logits = outputs.logits
predictions = logits.argmax(-1).squeeze().tolist()
labels = encoding.labels.squeeze().tolist()

token_boxes = encoding.bbox.squeeze().tolist()
width, height = image.size

# Positions whose label is -100 are dropped from every output list below.
keep_mask = [label != -100 for label in labels]

true_predictions = [
    model.config.id2label[pred]
    for pred, keep in zip(predictions, keep_mask)
    if keep
]
true_labels = [
    model.config.id2label[label]
    for label, _ in zip(labels, predictions)
    if label != -100
]
# Token boxes come back on the 0-1000 grid; convert the kept ones to pixels.
true_boxes = [
    unnormalize_box(token_box, width, height)
    for token_box, keep in zip(token_boxes, keep_mask)
    if keep
]

In [136]:
def extract_text_from_box(box):
    """Look up the OCR text stored for ``box`` in the global ``words_boxs_dict``.

    Args:
        box: A key of ``words_boxs_dict`` (the notebook keys it by the
            ``"x1,y1,x2,y2"`` string of an OCR box).

    Raises:
        KeyError: If ``box`` is not a key of ``words_boxs_dict``.
    """
    recognised_text = words_boxs_dict[box]
    return recognised_text

In [58]:
# Build a label -> colour table from the labelling config: every <Label> element
# contributes its `value` attribute as key and its `background` attribute as colour.
label_config = './Label_Config.xml'
tree = etree.parse(label_config)
root = tree.getroot()
label2color = dict(
    (node.get('value'), node.get('background'))
    for node in root.findall(".//Label")
)

In [29]:
def add_to_dict(label, text, coordinates, dictionary):
    """Append ``(text, coordinates)`` to the list stored under ``label``.

    The list is created on first use of a label. ``dictionary`` is modified in
    place and nothing is returned.
    """
    entry = (text, coordinates)
    dictionary.setdefault(label, []).append(entry)

In [12]:
def calculate_iou(box_1, box_2):
    """Return how much of the smaller polygon is covered by the other one.

    Despite the name this is not intersection-over-union: the intersection
    area is divided by the area of the *smaller* polygon, so a small box lying
    entirely inside a large one scores 1.0.

    Args:
        box_1: Sequence of ``(x, y)`` vertices passed to ``Polygon``.
        box_2: Sequence of ``(x, y)`` vertices passed to ``Polygon``.

    Returns:
        ``intersection_area / min(area_1, area_2)``, or ``0.0`` when either
        polygon has zero area (a collapsed box cannot overlap anything).
    """
    poly_1 = Polygon(box_1)
    poly_2 = Polygon(box_2)
    min_area = min(poly_1.area, poly_2.area)
    # A collapsed box (zero width or height) would otherwise divide by zero.
    if min_area <= 0:
        return 0.0
    return poly_1.intersection(poly_2).area / min_area

In [52]:
def get_ocr_box_overlapse_predict_box(ocr_boxes, predict_box, width, height, threshold=0.80):
    """Return the OCR box that best overlaps ``predict_box``, or ``None``.

    Args:
        ocr_boxes: Iterable of ``(x1, y1, x2, y2)`` OCR boxes.
        predict_box: ``(x1, y1, x2, y2)`` box to match against.
        width: Unused; kept so existing callers keep working.
        height: Unused; kept so existing callers keep working.
        threshold: Minimum overlap score (as computed by ``calculate_iou``)
            a box must exceed to count as a match. Defaults to ``0.80``.

    Returns:
        The matching OCR box as a ``list``, or ``None`` when no box exceeds
        ``threshold``. When several boxes qualify, the one with the highest
        score wins; ties go to the earliest box in ``ocr_boxes``.
    """
    try:
        x1p, y1p, x2p, y2p = predict_box
    except (TypeError, ValueError) as e:
        # Malformed prediction: report it once and treat it as "no match".
        print(e)
        return None
    # The predicted polygon is the same for every OCR box, so build it once.
    predicted_corners = [[x1p, y1p], [x2p, y1p], [x2p, y2p], [x1p, y2p]]

    best_box = None
    best_overlap = threshold
    for box in ocr_boxes:
        try:
            x1, y1, x2, y2 = box
            ocr_corners = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
            overlap_perc = calculate_iou(ocr_corners, predicted_corners)
        except Exception as e:
            # Best effort: one bad OCR box must not abort matching for the rest.
            print(e)
            continue
        # Strict '>' keeps the earliest box on ties and enforces the threshold.
        if overlap_perc > best_overlap:
            best_box, best_overlap = box, overlap_perc

    return list(best_box) if best_box is not None else None

In [168]:
# Drawing setup for the annotation loop: default font for the label text,
# page dimensions, and a drawing context bound to `image`.
font = ImageFont.load_default()
width, height = image.size
draw = ImageDraw.Draw(image)

def iob_to_label(label):
    """Return ``label`` unchanged, or ``'other'`` when it is empty or otherwise falsy."""
    return label or 'other'

mydict = {}

# For each kept prediction: find the OCR box it overlaps, draw the predicted box
# and its label on the page, and file (text, ocr_box) under the predicted label.
for prediction, box in zip(true_predictions, true_boxes):
    ocr_boxs = get_ocr_box_overlapse_predict_box(bboxes, box, width, height)
    # No overlapping OCR box: the entry is stored with empty text and a None box.
    text = words_boxs_dict[convert_bounding_box_to_string(ocr_boxs)] if ocr_boxs is not None else ''
    predicted_label = iob_to_label(prediction).lower()
    colour = label2color[predicted_label]
    draw.rectangle(box, outline=colour)
    draw.text((box[0] + 10, box[1] - 10), text=predicted_label, fill=colour, font=font)
    add_to_dict(predicted_label, text, ocr_boxs, mydict)

image.show()

In [157]:
# Inspect the (text, box) pairs collected under the 'company_info' label.
print(mydict['company_info'])

[('shell', [1459.7988, 58.254883, 1535.1875, 95.94922]), ('uk', [1658.5508, 58.254883, 1709.9521, 95.94922]), ('limited', [1713.3789, 58.254883, 1833.3154, 95.94922]), ('elder', [1836.7422, 58.254883, 1918.9844, 95.94922]), ('586512', [2038.9209, 58.254883, 2179.418, 95.94922]), ('elder', [2186.2715, 58.254883, 2271.9404, 95.94922]), ('gater', [2265.7197, 54.61725, 2368.5225, 103.17178]), ('energy', [1540.0719, 60.602505, 1653.0698, 104.932625]), ('houses', [1924.1887, 60.314358, 2033.7509, 100.059616]), ('mka', [2220.539, 99.37598, 2299.3545, 137.07031]), ('milton', [1984.0928, 102.802734, 2083.4688, 137.07031]), ('keynes', [2087.8599, 100.93954, 2212.645, 150.16718]), ('ilrs', [2306.208, 102.802734, 2364.463, 137.07031])]


In [163]:
# Entries collected for the two labels of interest; each entry is a (text, box) pair.
company_info = mydict['company_info']
site_address = mydict['site_address']
# Join the company words in collection order, skipping entries whose text is empty.
paragraph = ' '.join(entry[0] for entry in company_info if entry[0] != '')

In [159]:
# Average height of the OCR boxes under 'company_info'.
# Entries stored without a matched OCR box carry box=None and have no height,
# so they are skipped instead of raising TypeError on box[3].
box_heights = [box[3] - box[1] for _, box in company_info if box is not None]
average_line_height = sum(box_heights) / len(box_heights) if box_heights else 0
print(f"Average Line Height: {average_line_height}")

Average Line Height: 39.55792764516977


In [164]:
def get_line_number(y_coord):
    """Bucket a y coordinate into a line index using the global ``average_line_height``.

    Args:
        y_coord: Top y coordinate of a box, in pixels.

    Returns:
        ``int(y_coord / average_line_height)``. When ``average_line_height`` is
        0 (no boxes were available to measure it) the integer y coordinate is
        returned instead, which still orders entries top to bottom.
    """
    if not average_line_height:
        return int(y_coord)
    return int(y_coord / average_line_height)

# Sort by line bucket (Y), then by X within each line. Entries stored without a
# matched OCR box (box is None) have no position and empty text, so they are left
# out rather than raising TypeError in the sort key.
# NOTE(review): average_line_height is measured on the 'company_info' boxes but is
# applied to 'site_address' here — confirm that is intended.
sorted_data = sorted(
    (entry for entry in site_address if entry[1] is not None),
    key=lambda x: (get_line_number(x[1][1]), x[1][0]),
)

# Concatenate the text to form a paragraph
paragraph = ' '.join(text for text, _ in sorted_data)

In [165]:
# Display the paragraph assembled from the sorted 'site_address' entries.
paragraph

'barking first floor flat 740 road london greater london e13 9lb'