In [None]:
import os
import json
import pandas as pd
import numpy as np
import json
from tqdm import tqdm

from shapely.geometry import Polygon
import glob
from pytesseract import pytesseract
from lxml import etree
import ast
import torch
from PIL import ImageDraw, ImageFont, Image
import cv2

from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, ClassLabel, Sequence, Value, load_dataset
from datasets import Image as DImage
# Location of the Tesseract executable. The default is the Windows path this
# notebook was written against; set the TESSERACT_CMD environment variable to
# point at a different install without editing the notebook.
pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)
from lxml import etree
import keras_ocr

# Build the keras-ocr pipeline once; the cells below reuse this object for every
# pipeline.recognize(...) call.
pipeline = keras_ocr.pipeline.Pipeline()

In [None]:
# Label Studio JSON export containing the annotated tasks.
labelpath = './labeldata/project-4-at-2023-12-18-22-49-43cf6b2d.json'
# Use a context manager so the handle is closed even if parsing fails, and read
# as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(labelpath, encoding='utf-8') as f:
    label_studio_data = json.load(f)

In [None]:
# Parse the labeling config XML and collect the `value` attribute of every
# <Label> element, in document order. The resulting list defines the tag set.
label_config = './Label_Config.xml'
tree = etree.parse(label_config)
root = tree.getroot()
conf_labels = [label_node.get('value') for label_node in root.iterfind('.//Label')]

In [None]:
def preprocess_image(path, name):
    """Binarize an image and save the result under ``./gray_image/``.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian blur and
    binarized with Gaussian adaptive thresholding (block size 11, constant 2).

    Args:
        path: Path of the source image to read.
        name: File name (including extension) to save the result under.

    Returns:
        The path of the written file, i.e. ``'./gray_image/' + name``.

    Raises:
        FileNotFoundError: If ``path`` cannot be read as an image.
        OSError: If the processed image could not be written.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None, which
        # would otherwise surface as an opaque error inside cvtColor.
        raise FileNotFoundError(f'Could not read image: {path}')
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
    threshold_image = cv2.adaptiveThreshold(
        blurred_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    save_dir = './gray_image/'
    # Create the output directory on first use so the write cannot fail just
    # because the folder is missing.
    os.makedirs(save_dir, exist_ok=True)
    save_path = save_dir + name
    if not cv2.imwrite(save_path, threshold_image):
        raise OSError(f'Could not write image: {save_path}')
    return save_path

In [None]:
def calculate_iou(box_1, box_2):
    """Return the overlap of two boxes relative to the smaller box.

    Despite the name, this is not intersection-over-union: the intersection
    area is divided by the area of the smaller polygon, so a small box lying
    entirely inside a larger one scores 1.0.

    Args:
        box_1: Sequence of (x, y) vertices describing the first polygon.
        box_2: Sequence of (x, y) vertices describing the second polygon.

    Returns:
        Intersection area divided by the smaller polygon's area, or 0.0 when
        either polygon has no area.
    """
    poly_1 = Polygon(box_1)
    poly_2 = Polygon(box_2)
    min_area = min(poly_1.area, poly_2.area)
    if min_area <= 0:
        # A degenerate (zero-area) box cannot overlap anything; returning early
        # also avoids dividing by zero.
        return 0.0
    return poly_1.intersection(poly_2).area / min_area
    
    
def _parse_hocr_title(title):
    """Split an hOCR ``title`` attribute into a ``{property: value-string}`` dict."""
    props = {}
    for chunk in title.split(';'):
        key, _, value = chunk.strip().partition(' ')
        if key:
            props[key] = value.strip()
    return props


def hocr_to_dataframe(fp):
    """Collect the words of an hOCR document into a DataFrame.

    Every element whose ``class`` attribute is ``ocrx_word`` contributes one
    row. Its ``title`` attribute is looked up by name (not by position among
    the attributes) and must carry ``bbox`` and ``x_wconf`` properties, e.g.
    ``"bbox 10 20 30 40; x_wconf 96"``.

    Args:
        fp: Path or file object accepted by ``etree.parse``.

    Returns:
        DataFrame with columns ``word`` (text of the element, including text
        nested in child tags), ``coords`` (``[x1, y1, x2, y2]`` as ints) and
        ``confidence`` (int).

    Raises:
        ValueError: If a word element lacks the ``bbox`` or ``x_wconf`` property.
    """
    doc = etree.parse(fp)
    words = []
    word_conf = []
    coords_list = []
    for node in doc.iter('*'):
        if node.get('class') != 'ocrx_word':
            continue
        props = _parse_hocr_title(node.get('title') or '')
        if 'bbox' not in props or 'x_wconf' not in props:
            raise ValueError(
                'ocrx_word element without bbox/x_wconf in its title: %r'
                % (node.get('title'),)
            )
        coords_list.append([int(value) for value in props['bbox'].split()])  # x1, y1, x2, y2
        # Go through float() so a confidence written as "96.0" is accepted too.
        word_conf.append(int(float(props['x_wconf'])))
        # itertext() also picks up text wrapped in child tags such as <strong>,
        # where the element's own .text would be None.
        words.append(''.join(node.itertext()))

    return pd.DataFrame({'word': words,
                         'coords': coords_list,
                         'confidence': word_conf})

In [None]:
# Turn every Label Studio task into one row: the image file name plus the list
# of labelled rectangles converted from percentages to pixel coordinates.
document_data = {'file_name': [], 'labelled_bbox': []}

for row in label_studio_data:
    upload_file_name = os.path.basename(row['data']['image'])
    # Drop everything up to the first '-' (presumably the prefix Label Studio
    # adds on upload - TODO confirm). A name without any dash is kept unchanged
    # instead of collapsing to an empty string.
    _prefix, dash, remainder = upload_file_name.partition('-')
    file_name = remainder if dash else upload_file_name

    # Only the first annotation of a task is used. Tasks without annotations
    # yield an empty label list rather than raising IndexError.
    annotations = row.get('annotations') or []
    results = annotations[0].get('result', []) if annotations else []

    label_list = []
    for label_ in results:
        label_value = label_.get('value', {})
        label = label_value.get('rectanglelabels')
        if not label:
            # Not a labelled rectangle; there is no box/label pair to record.
            continue
        x, y, w, h = label_value['x'], label_value['y'], label_value['width'], label_value['height']
        original_w, original_h = label_['original_width'], label_['original_height']

        # x / y / width / height are percentages of the original image size.
        x1 = int((x * original_w) / 100)
        y1 = int((y * original_h) / 100)
        x2 = x1 + int(original_w * w / 100)
        y2 = y1 + int(original_h * h / 100)

        label_list.append((label, (x1, y1, x2, y2), original_h, original_w))

    document_data['file_name'].append(file_name)
    document_data['labelled_bbox'].append(label_list)

custom_dataset = pd.DataFrame(document_data)

In [None]:
# Bidirectional lookup between label names and their integer ids; an id is the
# label's position in conf_labels.
label2id = dict(zip(conf_labels, range(len(conf_labels))))
id2label = dict((index, name) for name, index in label2id.items())

In [None]:
from PIL import Image
def load_image(image_path):
    """Load an image as RGB and report its size.

    Args:
        image_path: Path of the image file to open.

    Returns:
        Tuple ``(image, (width, height))`` where ``image`` is an RGB PIL image.
    """
    # Image.open keeps the underlying file open; convert() returns a separate
    # in-memory image, so the file can be closed right after the conversion.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    w, h = image.size
    return image, (w, h)

def normalize_bbox(bbox, size):
    """Scale a pixel box to the 0-1000 coordinate range.

    Args:
        bbox: ``[x1, y1, x2, y2]`` in pixels.
        size: ``(width, height)`` of the image the box belongs to.

    Returns:
        ``[x1, y1, x2, y2]`` as ints, each scaled by ``1000 / dimension`` and
        clamped to ``[0, 1000]`` so a box reaching slightly outside the image
        cannot produce an out-of-range coordinate.
    """
    scaled = (
        1000 * bbox[0] / size[0],
        1000 * bbox[1] / size[1],
        1000 * bbox[2] / size[0],
        1000 * bbox[3] / size[1],
    )
    return [max(0, min(1000, int(value))) for value in scaled]

In [None]:
def extract_text_from_bounding_box(image_path, box):
    """OCR the region ``box`` of an image and return its text as one string.

    The crop is written to ``./cropped_image/<basename of image_path>`` and read
    back for recognition. The file name depends only on the image's basename,
    so successive calls for the same image overwrite the previous crop.

    Args:
        image_path: Path of the image to crop.
        box: Crop rectangle passed to ``Image.crop`` (left, upper, right, lower).

    Returns:
        The recognized words joined by single spaces, in the order the pipeline
        returned them ('' when nothing is recognized).
    """
    cropped_dir = './cropped_image/'
    # Make sure the scratch directory exists before saving into it.
    os.makedirs(cropped_dir, exist_ok=True)
    cropped_path = cropped_dir + os.path.basename(image_path)

    # Close the source file as soon as the crop has been written.
    with Image.open(image_path) as extract_img:
        extract_img.crop(box).save(cropped_path)

    cropped_image_array = keras_ocr.tools.read(cropped_path)
    prediction_group = pipeline.recognize([cropped_image_array])
    return ' '.join(word for word, _word_box in prediction_group[0])

In [None]:
# process label data - method 2
# For every labelled rectangle, OCR the cropped region of the binarized image:
# each rectangle contributes one text string, one box and one tag id.
final_list2 = []

# List the source images once instead of re-globbing the folder for every row.
# Make sure you add your extension or change it based on your needs
image_paths = {os.path.basename(p): p for p in glob.glob('./image_bill/*.jpg')}

for row_id, row in tqdm(custom_dataset.iterrows(), total=custom_dataset.shape[0]):
    image = image_paths.get(row['file_name'])
    if image is None:
        # No image on disk for this task: skip it rather than appending an
        # empty dict that lacks every field of the dataset schema.
        continue

    # process image
    gray_image = preprocess_image(image, os.path.basename(image))

    # The full-page recognition pass was dropped here: its result was never
    # used by this method, which only OCRs the labelled crops.
    word_list = []
    ner_tags_list = []
    bboxes_list = []
    for label_coord in row['labelled_bbox']:
        box1 = list(label_coord[1])  # [x1, y1, x2, y2] in pixels
        label = label_coord[0][0]
        word_list.append(extract_text_from_bounding_box(gray_image, box1))
        ner_tags_list.append(label2id[label])
        bboxes_list.append(box1)

    dp_image, size = load_image(gray_image)
    final_list2.append({
        'id': row_id,
        'image': dp_image,
        'words': word_list,
        'bboxes': [normalize_bbox(box, size) for box in bboxes_list],
        'ner_tags': ner_tags_list,
    })

In [None]:
# process label data - method 1
# Run OCR on the whole binarized page and keep every recognized word whose box
# lies (by more than OVERLAP_THRESHOLD) inside a labelled rectangle; the word
# inherits that rectangle's tag.
final_list = []
OVERLAP_THRESHOLD = 0.80

# List the source images once instead of re-globbing the folder for every row.
# Make sure you add your extension or change it based on your needs
image_paths = {os.path.basename(p): p for p in glob.glob('./image_bill/*.jpg')}

for row_id, row in tqdm(custom_dataset.iterrows(), total=custom_dataset.shape[0]):
    image = image_paths.get(row['file_name'])
    if image is None:
        # No image on disk for this task: skip it rather than appending an
        # empty dict that lacks every field of the dataset schema.
        continue

    # process image
    gray_image = preprocess_image(image, os.path.basename(image))
    label_coord_list = row['labelled_bbox']

    regconize_image = keras_ocr.tools.read(gray_image)
    prediction_groups = pipeline.recognize([regconize_image])

    word_list = []
    ner_tags_list = []
    bboxes_list = []
    for text, box in prediction_groups[0]:
        if text == '-':
            # Lone dashes are never kept as words.
            continue
        # Axis-aligned bounds of the recognized word's corner points.
        xs = [coord[0] for coord in box]
        ys = [coord[1] for coord in box]
        format_box = [min(xs), min(ys), max(xs), max(ys)]

        for label_coord in label_coord_list:
            (x1, y1, x2, y2) = label_coord[1]
            box1 = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
            if calculate_iou(box1, box) > OVERLAP_THRESHOLD:
                word_list.append(text)
                bboxes_list.append(format_box)
                ner_tags_list.append(label2id[label_coord[0][0]])

    # Load the image and build the record once per page, not once per matched
    # word. Pages with no matching word still get a complete (empty) record.
    dp_image, size = load_image(gray_image)
    final_list.append({
        'id': row_id,
        'image': dp_image,
        'words': word_list,
        'bboxes': [normalize_bbox(box, size) for box in bboxes_list],
        'ner_tags': ner_tags_list,
    })
                        

In [41]:
# Schema of one example as produced by the processing cells above.
features = Features({
    'id': Value('int64'),  # or 'string' if your ID is a string
    'image': DImage(),  # the preprocessed page image (a PIL image per example)
    'words': Sequence(Value('string')),
    'bboxes': Sequence(Sequence(Value('int64'))),  # Nested sequence for bounding box coordinates
    'ner_tags': Sequence(ClassLabel(names=conf_labels)),  # one class id per word, named after conf_labels
})
dataset = Dataset.from_list(final_list2, features=features)
# To build the dataset from method 1 instead, use final_list:
# dataset = Dataset.from_list(final_list, features=features)
# Fixed seed so the 70/30 train/test partition is the same on every run.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
def unnormalize_box(bbox, width, height):
    """Map a 0-1000 scaled box back to pixel coordinates.

    Args:
        bbox: ``[x1, y1, x2, y2]`` on the 0-1000 scale.
        width: Image width in pixels (applied to the x coordinates).
        height: Image height in pixels (applied to the y coordinates).

    Returns:
        ``[x1, y1, x2, y2]`` in pixels, truncated to ints.
    """
    # x coordinates scale with the width, y coordinates with the height.
    dimensions = (width, height, width, height)
    return [int(dimensions[k] * (bbox[k] / 1000)) for k in range(4)]
# Sanity check: outline every box of the first training example on its image.
example = dataset['train'][0]
image = example['image']
width, height = image.size
draw = ImageDraw.Draw(image)
for box in example['bboxes']:
    pixel_box = unnormalize_box(box, width, height)
    draw.rectangle(pixel_box, outline='red', width=4)
image.show()

In [42]:
# Write the train/test splits produced by train_test_split to a local directory.
save_directory = './bill_dataset2'
dataset.save_to_disk(save_directory)

Saving the dataset (0/1 shards):   0%|          | 0/7 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3 [00:00<?, ? examples/s]

In [None]:
# !pip install -q git+https://github.com/huggingface/transformers.git
# !pip install -q datasets seqeval
# !pip install accelerate -U

In [None]:
# Load an image
image_path = './gray_image/0001.jpg'
image = keras_ocr.tools.read(image_path)

# recognize() is given a one-image list; predictions[0] holds that image's
# results, each unpacked below as a (text, box) pair.
predictions = pipeline.recognize([image])
dp_image2 = Image.open(image_path)
draw = ImageDraw.Draw(dp_image2)
for text, box in predictions[0]:
    # Reduce the box's corner points to an axis-aligned (x1, y1, x2, y2) rectangle.
    x1, y1 = min([coord[0] for coord in box]), min([coord[1] for coord in box])
    x2, y2 = max([coord[0] for coord in box]), max([coord[1] for coord in box])
    box = (x1, y1, x2, y2)
    draw.rectangle(box, outline='red', width=1)
dp_image2.show()

In [None]:
# Outline each box of one training example and write its tag name next to it.
example = dataset['train'][3]
image = example['image']
width, height = image.size
draw = ImageDraw.Draw(image)
font = ImageFont.load_default()
i = 0
for i in range(len(example['bboxes'])):
    tag_name = id2label[example['ner_tags'][i]]
    print(tag_name)
    box = unnormalize_box(example['bboxes'][i], width, height)
    draw.rectangle(box, width=1, outline='red')
    # Offset the caption slightly right of and above the box's top-left corner.
    text_anchor = (box[0] + 10, box[1] - 10)
    draw.text(text_anchor, text=tag_name, fill='green', font=font)
image.show()
    

In [None]:
# One-off run of the preprocessing on a single sample image; the call returns
# the path of the saved result.
# NOTE(review): this path uses './Image_bill/' while the dataset cells glob
# './image_bill/' - the casing differs, which matters on case-sensitive
# filesystems. Confirm the actual directory name.
noise_path1 = './Image_bill/noise2.jpg'
preprocess_image(noise_path1, 'noise2.jpg')
