In [1]:
import cv2
import numpy as np
import uuid
import json
import os
from paddleocr import PaddleOCR
from ultralytics import YOLO
from transformers import pipeline

In [2]:
# Text recogniser (PaddleOCR) configured with a local recognition model.
# NOTE(review): the debug log captured under this cell reports use_gpu=False
# and rec_image_shape='3, 48, 320', so those two arguments may not be taking
# effect as written - confirm against the installed PaddleOCR version.
ocr = PaddleOCR(
    lang='en',
    rec_image_shape="3,32,100",
    rec_batch_num=1,
    max_text_length=25,
    rec_algorithm='CRNN',
    use_gpu=True,
    rec_model_dir='./models/ppocr_mobile_v4.0_rec_infer/',
)

# Form-field detector (YOLO) loaded from local weights.
YoloModel = YOLO("./models/best-071024-5.pt")


[2024/10/08 17:17:32] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/gru/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='CRNN', rec_model_dir='./models/ppocr_mobile_v4.0_rec_infer/', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=1, max_text_length=25, rec_char_dict_path=

In [3]:
# Class names for the YOLO detector; the list position is used as the class id
# when detections are converted to JSON (labels[class_id]).
labels = [
    'Detailed',
    'EmptyInput',
    'TableColumn',
    'boxInput',
    'checkBox',
    'lineInput',
    'signature',
]

In [4]:
# Load the input image from disk.
img_path = './images/NEFT.jpg'
img = cv2.imread(img_path)

# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded; fail here with a clear message instead of at the first
# drawing call further down.
if img is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

# Image file name without directory or extension, used to name the outputs.
image_name = os.path.splitext(os.path.basename(img_path))[0]

# Per-image output directory; exist_ok makes re-running the cell safe.
output_dir = f'./output/{image_name}/'
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Run PaddleOCR on the image file (passed by path, not the in-memory `img`).
# The loop further down reads `result` as an iterable of lines, where each
# entry is indexed as entry[0] -> quadrilateral corner points and
# entry[1] -> (text, confidence).
result = ocr.ocr(img_path)

def convert_quad_to_rect(quad_bbox):
    """Return the axis-aligned box that encloses a set of corner points.

    Args:
        quad_bbox: Sequence of points; index 0 of each point is read as x
            and index 1 as y.

    Returns:
        A list ``[x_min, y_min, x_max, y_max]``.
    """
    xs = [corner[0] for corner in quad_bbox]
    ys = [corner[1] for corner in quad_bbox]
    return [min(xs), min(ys), max(xs), max(ys)]

# One JSON-serialisable record per recognised text region.
output = []

# Draw each recognised region on the image and build the JSON records.
# NOTE: rectangles are drawn directly on `img`, so any later cell that also
# draws on `img` will contain these boxes as well.
for line in result:
    # PaddleOCR may yield None for an image in which no text was detected;
    # skip it instead of failing with "'NoneType' object is not iterable".
    if not line:
        continue

    for word_info in line:
        quad_bbox = word_info[0]  # Quadrilateral corner points
        text = word_info[1][0]  # Recognized text
        confidence = float(word_info[1][1])  # Plain float so json.dump accepts it

        # cv2.rectangle needs integer pixel coordinates
        quad_bbox = [[int(coord[0]), int(coord[1])] for coord in quad_bbox]

        # Collapse the quadrilateral to [x_min, y_min, x_max, y_max]
        rect_bbox = convert_quad_to_rect(quad_bbox)

        # Draw the rectangular bounding box on the image
        cv2.rectangle(img, (rect_bbox[0], rect_bbox[1]), (rect_bbox[2], rect_bbox[3]), (0, 255, 0), 2)

        # Put the recognized text near the bounding box
        # cv2.putText(img, text, (rect_bbox[0], rect_bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        # Each detected text region gets its own unique identifier
        output.append({
            "uuid": str(uuid.uuid4()),
            "class": "Label",
            "confidence": confidence,
            "bbox": rect_bbox,  # Store rectangular bbox
            "text": text,
            "child": [],
        })

# Save the annotated image next to the JSON, both suffixed with '_paddleOCR'
output_image_path = os.path.join(output_dir, f"{image_name}_paddleOCR.jpg")
cv2.imwrite(output_image_path, img)

output_json_path = os.path.join(output_dir, f"{image_name}_paddleOCR.json")
with open(output_json_path, 'w') as json_file:
    json.dump(output, json_file, indent=4)

print(f"Annotated image saved to: {output_image_path}")
print(f"JSON file saved to: {output_json_path}")


[2024/10/08 17:17:33] ppocr DEBUG: dt_boxes num : 65, elapsed : 0.2093057632446289
[2024/10/08 17:17:39] ppocr DEBUG: rec_res num  : 65, elapsed : 5.296661138534546
Annotated image saved to: ./output/NEFT/NEFT_paddleOCR.jpg
JSON file saved to: ./output/NEFT/NEFT_paddleOCR.json


In [6]:
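# Load the DistilBERT NER pipeline (currently disabled).
# The access token is read from the HF_TOKEN environment variable; never commit a literal token to the notebook.

# ner_pipeline = pipeline("ner", model="dbmdz/distilbert-base-uncased-finetuned-conll03-english", aggregation_strategy="simple", token=os.environ["HF_TOKEN"])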

In [7]:
# # Initialize NER pipeline
# ner = pipeline("ner", model="dslim/bert-base-NER", device=0)

# # Function to convert quadrilateral bbox to rectangular bbox
# def convert_quad_to_rect(quad_bbox):
#     x_coords = [point[0] for point in quad_bbox]
#     y_coords = [point[1] for point in quad_bbox]
#     x_min = min(x_coords)
#     y_min = min(y_coords)
#     x_max = max(x_coords)
#     y_max = max(y_coords)
#     return [x_min, y_min, x_max, y_max]

# result = ocr.ocr(img_path)

# # Create a list to hold the results
# output = []

# # Process OCR results
# for line in result:
#     for word_info in line:
#         quad_bbox = word_info[0]
#         text = word_info[1][0]
#         confidence = word_info[1][1]

#         # Convert bbox coordinates to integers
#         quad_bbox = [[int(coord[0]), int(coord[1])] for coord in quad_bbox]
        
#         # Convert quadrilateral bbox to rectangular format
#         rect_bbox = convert_quad_to_rect(quad_bbox)

#         # Draw the rectangular bounding box on the image
#         cv2.rectangle(img, (rect_bbox[0], rect_bbox[1]), (rect_bbox[2], rect_bbox[3]), (0, 255, 0), 2)

#         # Perform NER on the text
#         ner_results = ner(text)
        
#         # Only include entries with recognized entities
#         if ner_results and ner_results[0]['entity'] != 'O':
#             entry_uuid = str(uuid.uuid4())
#             output.append({
#                 "uuid": entry_uuid,
#                 "class": ner_results[0]['entity'],
#                 "confidence": confidence,
#                 "bbox": rect_bbox,
#                 "text": text,
#                 "child": [],
#             })

# # Save the output image with bounding boxes
# output_image_path = 'output_paddleOCR.jpg'
# cv2.imwrite(output_image_path, img)

# # Save the JSON output to a file
# output_json_path = 'output_paddleOCR.json'
# with open(output_json_path, 'w') as json_file:
#     json.dump(output, json_file, indent=4)

# print(f"Annotated image saved to: {output_image_path}")
# print(f"JSON file saved to: {output_json_path}")

In [8]:
# Run the YOLO detector on the image file. save=True additionally makes
# Ultralytics write its own annotated copy (see "Results saved to ..." in the
# cell output).
results = YoloModel.predict(source=img_path, save=True, conf=0.25, device=0)

# One JSON-serialisable record per detected form field.
json_data = []

# NOTE: this loop rebinds the notebook-level name `result`, which the OCR cell
# also assigns; re-run the OCR cell before reusing its `result`.
for result in results:
    for box in result.boxes:
        # Box corners as plain Python ints: [x1, y1, x2, y2]
        bbox = box.xyxy[0].cpu().numpy().astype(int).tolist()

        # .item() extracts the scalar from the one-element tensor; calling
        # int()/float() on a 1-D NumPy array is deprecated since NumPy 1.25.
        class_id = int(box.cls.item())  # Class ID
        confidence = float(box.conf.item())  # Confidence score
        entry_uuid = str(uuid.uuid4())  # Unique identifier

        # Draw the bounding box on the image
        cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        # cv2.putText(img, f"Class {class_id} ({confidence:.2f})", (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        # Append data to JSON
        json_data.append({
            "uuid": entry_uuid,
            "class": labels[class_id],
            "confidence": confidence,
            "bbox": bbox,
            "text": "",
            "parent": ""
        })

# Save the annotated image
output_image_path = os.path.join(output_dir, f"{image_name}_yolo.jpg")
cv2.imwrite(output_image_path, img)

# Save the JSON data
output_json_path = os.path.join(output_dir, f"{image_name}_yolo.json")
with open(output_json_path, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)

print(f"Annotated image saved to: {output_image_path}")
print(f"JSON file saved to: {output_json_path}")


image 1/1 /mnt/d/Sem5/DL/DL-Project/Code/Guru/combine-label-input/images/NEFT.jpg: 640x480 29 boxInputs, 1 checkBox, 15 lineInputs, 4 signatures, 90.2ms
Speed: 8.7ms preprocess, 90.2ms inference, 194.1ms postprocess per image at shape (1, 3, 640, 480)
Results saved to runs/detect/predict3
Annotated image saved to: ./output/NEFT/NEFT_yolo.jpg
JSON file saved to: ./output/NEFT/NEFT_yolo.json


: 