In [1]:
from openai import OpenAI
import json 
import re
entities_list = """Issuing Company Name",
        "Issuing Company Address",
        "Recipient Company Name",
        "Recipient Company Address",
        "Cargo Description",
        "Cargo Quantity",
        "Cargo Weight",
        "Cargo Volume",
        "Cargo Value",
        "Cargo Dimensions",
        "Cargo Type",
        "Shipment Date",
        "Delivery Date",
        "Departure Location",
        "Arrival Location",
        "Route Information",
        "Mode of Transport",
        "Carrier Name",
        "Sender Name",
        "Sender Address",
        "Receiver Name",
        "Receiver Address",
        "Customs Declaration Number",
        "Customs Agent Name",
        "Customs Duties Paid",
        "Customs Clearance Status",
        "Total Amount",
        "Currency",
        "Payment Terms",
        "Invoice Number",
        "Insurance Details"""


# entities_list = "Passport Number, Surname, Given Name(s), Nationality, Date of Birth, Place of Birth, Sex, Place of Issue, Date of Issue, Date of Expiry"

# entities_list = "Bill of Lading Number, Shipper Name, Shipper Address, Consignee Address, Carrier Name, Carrier Address"

file_type = "pdf"
file_mode = "vision" # custom/vision
with open("Other/new/json/109849.json", "r", encoding="utf-8") as json_file:
    json_data_temp = json.loads(json_file.read())

json_data_new_json = {}
json_data_new_json["responses"] = []
if file_mode == "custom":
    for k, v in json_data_temp["ocr_data"].items():

        json_data_new_json["responses"].append(v)
        json_data_new_json["responses"][int(k) - 1]["context"] = {
				"pageNumber": int(k),
				"uri": ""
			}

json_data_temp = json_data_new_json

with open("pre_file.json", "w") as f:
        json.dump(json_data_temp
        , f, indent=4)

In [2]:
detected_break_types = {
    0: ' ',        # An unknown break type.
    1: ' ',          # A space between words.
    2: ' ',     # A sure space (more confidently detected).
    3: ' ', # A sure end-of-line space.
    4: '-',         # A hyphenated break.
    5: ' '      # A line break, similar to a new line (\n).
}

def validate_json(json_data):
    
    expected_keys = {'entities'}
    
    if not set(json_data.keys()) == expected_keys:
        return False
    
    if not isinstance(json_data['entities'], list):
        return False
    
    for entity in json_data['entities']:
        if not isinstance(entity, dict) or 'type' not in entity or 'entity' not in entity:
            return False

    return True

def get_response_from_openai(prompt):

    api_key = "<you open ai key>"

    client = OpenAI(
        api_key=api_key,
    )

    chat_completion = client.chat.completions.create(

        messages=[
            {
                "role" : "system",
                "content" : """You are a highly specialized Named Entity Recognition (NER) model designed to meticulously process textual inputs, and identify specified entities within the provided text. Your primary function is to return a JSON response encapsulating the entities in the text.

                            Your task is to meticulously traverse the input string, and identify specified entities. 

                            Emphasize that the response should exclusively comprise the JSON structure delineating the identified entities and the type of the entity, devoid of any extraneous information.

                            Also, keep in mind to only return the entity as is from the string and not modify it whatsoever. It is okay to include more than one entity of the same type. Always give me full words and not part of the words from the content.

                            If an entity is not found, do not give any entity text for it like "Not Found" or "Not Provided". Simply ignore it.
                            Here's an illustrative example to guide your response:

                            Examples of string and list of entities:

                            1. 
                            string: आयकर विभाग\nINCOME TAX DEPARTMENT\nसत्यमेव जयत\nMONIKA MAHADEV SHINDE\nMAHADEV SHINDE\n31/10/1992\nPermanent Account Number\nEJAPS0276M\nMONIKA MSHINDE\nSignature\nभारत सरकार\nGOVT. OF INDIA\nIR\n17092012.
                            entities: Full Name, PAN number, Date of Birth, Father Name

                            response:
                            {
                                "entities": [
                                    {
                                        "type": "Full Name",
                                        "entity": "MONIKA MAHADEV SHINDE"
                                    },
                                    {
                                        "type": "PAN number",
                                        "entity": "EJAPS0276M"
                                    },
                                    {
                                        "type": "Date of Birth",
                                        "entity": "31/10/1992"
                                    },
                                    {
                                        "type": "Father Name",
                                        "entity": "MAHADEV SHINDE"
                                    }
                                ]
                            }

                            2. 
                            string: आयकर विभाग\nINCOME TAX DEPARTMENT\nSONI KINNARIBEN S\nCHANDRAKANT JAYANTILAL SONI\n01/10/1980\nPermanent Account Number\nBFNPS1414K\nK.s.Soni\nSignature\nभारत सरकार\nGOVT. OF INDIA\n30052006
                            entities: Full Name, PAN number, Date of Birth, Father Name

                            response:
                            {
                                "entities": [
                                    {
                                        "type": "Full Name",
                                        "entity": "SONI KINNARIBEN S"
                                    },
                                    {
                                        "type": "PAN number",
                                        "entity": "BFNPS1414K"
                                    },
                                    {
                                        "type": "Date of Birth",
                                        "entity": "01/10/1980"
                                    },
                                    {
                                        "type": "Father Name",
                                        "entity": "CHANDRAKANT JAYANTILAL SONI"
                                    }
                                ]
                            }"""
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4",
        temperature=0
    )

    return chat_completion.choices[0].message.content 


def process_input(input_str, entities_list):
    words = re.findall(r'\S+', input_str)
    chunks = []
    chunk = ''
    print("Length of document:", len(input_str.split(" ")))
    for word in words:
        if len(chunk.split(" ")) <= max_chunk_length:
            chunk += ' ' + word if chunk else word
        else:
            chunks.append(chunk)
            chunk = word
        
    if chunk:
        chunks.append(chunk)
        
    output = {"entities": []}
    count = 0
    print("Chunks length", len(chunks))
    for chunk in chunks:
        count += 1
        print("Processing chunk of length", len(chunk.strip().split(" ")))
        for retry in range(retries):
            try:
                prompt = """Extract all the entities from the string delimited by triple square brackets [[[""" + chunk + """]]] and the entities: """ + str(entities_list)
                chunk_output = get_response_from_openai(prompt)
                
                pattern = r'{\s*"entities"\s*:\s*\[\s*(?:{[^{}]*}|[^[\]{}]*|\s)*\s*\]\s*}'
                json_matches = re.findall(pattern, chunk_output)

                # Extract the JSON part
                if json_matches:
                    chunk_output = json_matches[0]
                else:
                    print("No JSON found in the string.")
                try:
                    chunk_output = json.loads(chunk_output)
                except:
                    try:
                        chunk_output = json.loads(chunk_output.split("```")[1])
                    except:
                        continue
                output["entities"].extend(chunk_output["entities"])
                print("Processed chunk of length", len(chunk.split(" ")))
                break
            except Exception as e:
                print(str(e))
                print("chuck process failed. retrying..")
                continue
    return output

c = []
count = 1000
retries = 5
max_chunk_length = 1000


In [3]:
def preprocess_word(word):
    return re.sub(r'[^a-zA-Z0-9\s]', '', word).lower()

def insert_list_to_list(main_list, sub_list, index):
    return main_list[:index] + sub_list + main_list[index:]

def split_string(s):
    
    initial_pattern = r'[:\s]+|(?<=\.)\s+|[\(\)]' 
    initial_split = re.split(initial_pattern, s)

    result = []
    
    detailed_pattern = r'(?<!\d)-(?!\d)'
    initial_split_temp = initial_split.copy()
    inserted_count = 0
    
    for i, s in enumerate(initial_split_temp):

        parts = re.split(r'[-,]', s)

        if len(parts) > 1:
            
            for part in parts:
                if preprocess_string(part): 
                    if not part.isdigit():
                        
                        initial_split = insert_list_to_list(initial_split, parts, i + inserted_count)
                        inserted_count += len(parts) - 1
                        if initial_split.index(s) != -1:
                            del initial_split[initial_split.index(s)]
                        break
                    
    return [word.strip() for word in initial_split if word.strip()]

def abs_to_norm(norm_vertices, width, height):
    
    return {
        "vertices": [
            {
                "x": norm_vertices[0]["x"] / width,
                "y": norm_vertices[0]["y"] / height,
            },
            {
                "x": norm_vertices[1]["x"] / width,
                "y": norm_vertices[1]["y"] / height,
            },
            {
                "x": norm_vertices[2]["x"] / width,
                "y": norm_vertices[2]["y"] / height,
            },
            {
                "x": norm_vertices[3]["x"] / width,
                "y": norm_vertices[3]["y"] / height,
            }
        ]
    }

def norm_to_abs(norm_vertices, width, height):
    return {
        "vertices": [
            {
                "x": norm_vertices[0]["x"] * width,
                "y": norm_vertices[0]["y"] * height,
            },
            {
                "x": norm_vertices[1]["x"] * width,
                "y": norm_vertices[1]["y"] * height,
            },
            {
                "x": norm_vertices[2]["x"] * width,
                "y": norm_vertices[2]["y"] * height,
            },
            {
                "x": norm_vertices[3]["x"] * width,
                "y": norm_vertices[3]["y"] * height,
            }
        ]
    }

def combine_bounding_boxes(bounding_boxes):
    combined_boxes = []
    to_combine = []

    for box in bounding_boxes:
        br_corner = box[2]  # bottom right corner
        if not to_combine:
            to_combine.append(box)
        else:
            last_br_corner = to_combine[-1][2]
            if abs(br_corner['y'] - last_br_corner['y']) < 0.005:
                to_combine.append(box)
            else:
                combined_boxes.append(combine_boxes(to_combine))
                to_combine = [box]

    if to_combine:
        combined_boxes.append(combine_boxes(to_combine))

    return combined_boxes

def combine_boxes(boxes):
    x1 = min(box[0]['x'] for box in boxes)
    y1 = min(box[0]['y'] for box in boxes)
    x2 = max(box[2]['x'] for box in boxes)
    y2 = max(box[2]['y'] for box in boxes)
    return [{'x': x1, 'y': y1}, {'x': x2, 'y': y1}, {'x': x2, 'y': y2}, {'x': x1, 'y': y2}]

# Function to clean and split the entity_text into words
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text).split()

# Function to calculate bounding box distance
def bounding_box_distance(box1, box2):
    min_distance = float('inf')
    for point1 in box1:
        for point2 in box2:
            dist_x = abs(point1['x'] - point2['x'])
            dist_y = abs(point1['y'] - point2['y'])
            distance = min(dist_x, dist_y)
            if distance < min_distance:
                min_distance = distance
    return min_distance

def preprocess_string(str):
    return str.replace("-", " ").strip()


def find_match_words(ocr_words, in_str_words, true_bb=[], threshold=0.2):
    matched_words, matched_bb = [], []
    i = 0
    j = 0

    while j < len(ocr_words):
        ocr_word, ocr_bb = ocr_words[j]

        if preprocess_word(ocr_word) == preprocess_word(in_str_words[i]):
            if true_bb:
                if abs(bounding_box_distance(true_bb, ocr_bb)) > threshold:
                    j += 1
                    continue

            matched_words.append(ocr_word)
            matched_bb.append(ocr_bb)
            i += 1

            if i == len(in_str_words):
                return matched_words, matched_bb
        else:
            
            if i != 0:
                j -= 1
            i = 0
            
            matched_words, matched_bb = [], []

        j += 1

    return [], []

def get_split_values(input_string):

    in_str_words = split_string(input_string)
    in_str_words_temp_temp = []
    for word_temp in in_str_words:
        if preprocess_word(word_temp) != "":
            in_str_words_temp_temp.append(word_temp)
    print("STRING:", input_string)
    print("STRING LIST:", in_str_words_temp_temp)

def get_ocr_words(json_data, width, height):
    ocr_words = []
    pages = json_data['fullTextAnnotation']['pages']

    for page in pages:
        for block in page['blocks']:
            for paragraph in block['paragraphs']:
                for word in paragraph['words']:
                    word_text = ''.join([symbol['text'] for symbol in word['symbols']])
                    if preprocess_word(word_text).strip() != "":
                        vertices = []
                        if "vertices" in word['boundingBox']:
                            if word['boundingBox']["vertices"]:
                                vertices = abs_to_norm(word['boundingBox']["vertices"], width, height)["vertices"]
                                
                        if "normalizedVertices" in word['boundingBox']:
                            if word['boundingBox']["normalizedVertices"]:
                                vertices = word['boundingBox']['normalizedVertices']
                        ocr_words.append((word_text, vertices))

    with open("out_json.json", "w") as f:
        json.dump({
            "words" : ocr_words
        }, f, indent=4)

    return ocr_words

def find_sequential_symbols(input_string, given_type, ocr_words):

    in_str_words = split_string(input_string)
    in_str_words_temp_temp = []
    for word_temp in in_str_words:
        if preprocess_word(word_temp) != "":
            in_str_words_temp_temp.append(word_temp)
            
    while True:
        if "" in in_str_words_temp_temp:
            in_str_words_temp_temp.remove("")
        else:
            break
        
    in_str_words = in_str_words_temp_temp
    length_match = 0
    all_matched_words, all_matched_bb = [], []
    true_bb = []
    
    while True:
        if length_match != len(in_str_words):
            matched_words, matched_bb = find_match_words(ocr_words, in_str_words[length_match:], true_bb)
            length_match += len(matched_words)
            if not matched_words:
                in_str_words_temp = in_str_words[length_match:]
                for i in range(len(in_str_words_temp)):
                    matched_words_temp, matched_bb_temp = find_match_words(ocr_words, in_str_words_temp[:-(i+1)], true_bb)
                    if len(matched_words_temp) == len(in_str_words_temp[:-(i+1)]):
                        all_matched_words += matched_words_temp
                        all_matched_bb += matched_bb_temp
                        true_bb = matched_bb_temp[-1]
                        length_match += len(matched_words_temp)
                        break
            else:
                all_matched_words += matched_words
                all_matched_bb += matched_bb
                
                true_bb = matched_bb[-1]
        else:
            break

    normalized_vertices = combine_bounding_boxes(all_matched_bb)

    return {
        'mentionText': input_string,
        'pageAnchor': {
            'pageRefs': [{
                'boundingPoly': {
                    'normalizedVertices': [{'vertices': normalized_vertices}]
                }
            }]
        },
        'type': 'text'
    }

In [4]:
final_data = []

if file_type == "pdf":
    for response in json_data_temp["responses"]:
        if "fullTextAnnotation" in response:
            width, height = response["fullTextAnnotation"]["pages"][0]["width"], response["fullTextAnnotation"]["pages"][0]["height"]
            if "text" in response["fullTextAnnotation"]:
                
                input_string = response["fullTextAnnotation"]["text"]
                data = process_input(input_string.replace("\n", " ").strip(), entities_list)
                if not validate_json(data):
                    raise Exception("JSON not valid")
                
                data["text"] = input_string

                ocr_words = get_ocr_words(response, width, height)
                
                for entity in data["entities"]:
                    entity_text = entity["entity"]
                    if entity_text in ["Not Found", "Not Privided", "Not Given"]:
                        continue
                    entity_type = entity["type"]

                    try:
                        output = find_sequential_symbols(entity_text, "pdf", ocr_words)
                    except Exception as e:
                        output = {
                            'mentionText': "",
                            'pageAnchor': {
                                'pageRefs': [{
                                    'boundingPoly': {
                                        'normalizedVertices': [{'vertices': []}]
                                    }
                                }]
                            },
                            'type': 'text'
                        }
                        print(f"\nFailed for entity {entity_type}: {entity_text} == Reason : {e}")
                        get_split_values(entity_text)

                    output["mentionText"] = entity_text
                    output["type"] = entity_type
                    output["pageNumber"] = response["context"]["pageNumber"]
                    final_data.append(output)

            else:
                raise Exception("No input string found in json")
else:

    if "fullTextAnnotation" in json_data_temp:
        width, height = json_data_temp["fullTextAnnotation"]["pages"][0]["width"], json_data_temp["fullTextAnnotation"]["pages"][0]["height"]
        if "text" in json_data_temp["fullTextAnnotation"]:
            input_string = json_data_temp["fullTextAnnotation"]["text"]
            data = process_input(input_string.replace("\n", " ").strip(), entities_list)
            if not validate_json(data):
                raise Exception("JSON not valid")
            
            data["text"] = input_string

            ocr_words = get_ocr_words(json_data_temp, width, height)
            
            for entity in data["entities"]:
                
                entity_text = entity["entity"]
                if entity_text == "Not Found":
                    continue
                entity_type = entity["type"]
                try:
                    output = find_sequential_symbols(
                        entity_text,
                        "image",
                        ocr_words
                    )
                except Exception as e:
                    output = {
                        'mentionText': "",
                        'pageAnchor': {
                            'pageRefs': [{
                                'boundingPoly': {
                                    'normalizedVertices': [{'vertices': []}]
                                }
                            }]
                        },
                        'type': 'text'
                    }
                    print(f"\nFailed for entity {entity_type}: {entity_text} == Reason : {e}")
                    get_split_values(entity_text)
                output["mentionText"] = entity_text
                output["type"] = entity_type
                final_data.append(output)
        else:
            raise Exception("No input string found in json")

In [5]:
final_data

[]

In [5]:
# IMAGE

In [7]:
from PIL import Image, ImageDraw
from typing import List, Dict, Any
import random

# Function to generate a random color
def random_color():
    return tuple(random.randint(0, 255) for _ in range(3))

def draw_bounding_boxes_on_images(image_path: str, output_path: str, data: List[Dict[str, Any]]):
    # Open the image
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    
    # Dictionary to store colors for each type
    colors = {}

    # Iterate over each item in the data list
    for item in data:
        entity_type = item['type']
        # Assign a random color if this entity type hasn't been seen yet
        if entity_type not in colors:
            colors[entity_type] = random_color()

        color = colors[entity_type]

        for page_ref in item['pageAnchor']['pageRefs']:
            bounding_poly = page_ref['boundingPoly']['normalizedVertices']
            for vertices_group in bounding_poly:
                for vertices in vertices_group['vertices']:
                    # Calculate the coordinates
                    width, height = image.size
                    points = [(vertex['x'] * width, vertex['y'] * height) for vertex in vertices]
                    # Draw the rectangle on the image
                    draw.polygon(points, outline=color, width=2)
    
    # Save the modified image
    image.save(output_path)

    # Example usage
data2 = final_data

image_path = r"D:\...\AutoLabelling\Other\new\files\109851.jpg"  # Path to your input image file
output_path = "output.jpg"  # Path to save the output image file

draw_bounding_boxes_on_images(image_path, output_path, data2)

In [14]:
import fitz  # PyMuPDF
from typing import List, Dict, Any
import random

def random_color():
    return tuple(random.randint(0, 255)/255 for _ in range(3))

def draw_bounding_boxes(pdf_path: str, output_path: str, data: List[Dict[str, Any]]) -> None:
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)

    for entry in data:
        color=random_color()
        page_anchors = entry.get('pageAnchor', {}).get('pageRefs', [])
        page = pdf_document[int(entry.get('pageNumber', 0)) - 1]
        print(page)
        for page_ref in page_anchors:
            bounding_poly = page_ref.get('boundingPoly', {})
            vertices_k = bounding_poly.get('normalizedVertices', [{}])
            if vertices_k:
                vertices_m = vertices_k[0].get('vertices', [])
                for vertices in vertices_m:
                    if len(vertices) == 4:
                        # Convert normalized coordinates to PDF coordinates
                        x0 = vertices[0]['x'] * pdf_document[0].rect.width
                        y0 = vertices[0]['y'] * pdf_document[0].rect.height
                        x1 = vertices[1]['x'] * pdf_document[0].rect.width
                        y1 = vertices[1]['y'] * pdf_document[0].rect.height
                        x2 = vertices[2]['x'] * pdf_document[0].rect.width
                        y2 = vertices[2]['y'] * pdf_document[0].rect.height
                        x3 = vertices[3]['x'] * pdf_document[0].rect.width
                        y3 = vertices[3]['y'] * pdf_document[0].rect.height

                        # Define bounding box coordinates
                        rect = fitz.Rect(x0, y0, x2, y2)
                        
                        # Draw rectangle on each page based on data
                        
                        page.draw_rect(rect, color=color, width=2)  # Red color, width 2

    # Save the modified PDF
    pdf_document.save(output_path)
    pdf_document.close()

# Example usage

data2 = final_data
draw_bounding_boxes(r"D:\....\AutoLabelling\Other\new\files\109846.pdf", 'output.pdf', data2)


page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0 of D:\__SECUREKLOUD___\AutoLabelling\Other\new\files\109846.pdf
page 0

In [20]:
final_data

[{'mentionText': '2023-0095',
  'pageAnchor': {'pageRefs': [{'boundingPoly': {'normalizedVertices': [{'vertices': [[{'x': 0.83615476,
           'y': 0.04617358},
          {'x': 0.9159613, 'y': 0.04617358},
          {'x': 0.9159613, 'y': 0.053869177},
          {'x': 0.83615476, 'y': 0.053869177}]]}]}}]},
  'type': 'EASA AD No',
  'pageNumber': 1},
 {'mentionText': 'AIRBUS HELICOPTERS',
  'pageAnchor': {'pageRefs': [{'boundingPoly': {'normalizedVertices': [{'vertices': [[{'x': 0.09431681,
           'y': 0.32578024},
          {'x': 0.2986699, 'y': 0.32578024},
          {'x': 0.2986699, 'y': 0.33646858},
          {'x': 0.09431681, 'y': 0.33646858}]]}]}}]},
  'type': "Design Approval Holder's Name",
  'pageNumber': 1},
 {'mentionText': '22 May 2023',
  'pageAnchor': {'pageRefs': [{'boundingPoly': {'normalizedVertices': [{'vertices': [[{'x': 0.2490931,
           'y': 0.3676785},
          {'x': 0.35489723, 'y': 0.3676785},
          {'x': 0.35489723, 'y': 0.38007694},
          {'x'