In [None]:
# installing necessary libraries
!pip install deepdoctection
!pip install PyMUPDF rapidfuzz pdf2image pillow google-cloud-vision

In [None]:
# Install the system packages `tesseract-ocr` and `poppler-utils` with apt
# (`-y` answers the confirmation prompt automatically).
!apt-get install -y tesseract-ocr
!apt-get install -y poppler-utils
# After the installs finish, restart the runtime before running the remaining cells:
# Runtime -> Restart runtime

import os

# Set the path to Tesseract's data directory
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'
print(os.environ['PATH'])              # Show PATH before the Poppler directory is added

# Make sure the Poppler directory is on PATH. It is appended only when missing, so
# re-running this cell does not keep growing PATH with duplicate entries.
poppler_dir = "/usr/bin"
path_entries = os.environ['PATH'].split(os.pathsep)
if poppler_dir not in path_entries:
    os.environ['PATH'] = os.pathsep.join(path_entries + [poppler_dir])
print(os.environ['PATH'])               # Verify the PATH


In [None]:
import deepdoctection as dd
from IPython.core.display import HTML
from matplotlib import pyplot as plt
import pandas as pd
import os
import PyPDF2
import json
import random
from pdf2image import convert_from_path
from PIL import Image
import cv2
import fitz  # PyMuPDF
import numpy as np


# Instantiate deepdoctection's built-in analyzer (similar to the Hugging Face space demo).
# The page-processing cell calls `analyzer.analyze(path=...)` on each single-page PDF.
analyzer = dd.get_dd_analyzer()

In [None]:

# Directory where the single-page PDFs are saved
output_folder = '/content/output'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Source PDF that is split into single-page PDFs and analyzed page by page.
# NOTE(review): this path sits inside `output_folder` and uses the same `page_<n>.pdf`
# naming as the files generated below -- confirm it is the intended input document.
pdf_file_path = '/content/output/page_15.pdf'

# A layout element is kept in `element_df` only when it is a 'list' whose bounding-box
# area and detection score both reach these thresholds.
MIN_LIST_AREA = 120000
MIN_LIST_SCORE = 0.4

# Display option does not depend on the page, so set it once.
pd.options.display.float_format = '{:.2f}'.format

# The context manager closes the source PDF even if splitting or analysis fails.
with open(pdf_file_path, 'rb') as pdf_file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Iterate through each page: write it to its own PDF, then analyze that PDF.
    # NOTE: `df`, `element_df` and `original_image_height/width` are overwritten on every
    # iteration, so after the loop they describe the LAST page only.
    for page_num in range(len(pdf_reader.pages)):
        output_pdf_path = os.path.join(output_folder, f'page_{page_num + 1}.pdf')

        # Opening the output with 'wb' truncates it, so never write over the source
        # file while it is still being read.
        if os.path.abspath(output_pdf_path) == os.path.abspath(pdf_file_path):
            raise ValueError(
                f'Refusing to overwrite the source PDF {pdf_file_path!r} with page {page_num + 1}; '
                'keep the source file outside the generated page_<n>.pdf names.'
            )

        # Create a new PDF holding only the current page
        pdf_writer = PyPDF2.PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[page_num])
        with open(output_pdf_path, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)

        # Give the path of the single-page PDF to the model
        pdf_path = output_pdf_path

        # One entry per annotation (all annotations of this PDF)
        scores = []
        ulx_val = []
        uly_val = []
        lrx_val = []
        lry_val = []
        heights = []
        widths = []
        areas = []
        category_names = []
        selected = []       # True where the annotation passes the 'list' filter

        # One entry per SELECTED annotation only
        page_numbers = []
        element_text = []

        dataflow = analyzer.analyze(path=pdf_path)  # setting up pipeline
        dataflow.reset_state()                      # Trigger some initialization

        for page_offset, page in enumerate(dataflow):
            # Page number within the source document (not within the single-page PDF,
            # where it would always be 1).
            page_number = page_num + 1 + page_offset

            image = page.viz()
            fig = plt.figure(figsize=(25, 17))
            plt.axis('off')
            plt.imshow(image)
            plt.show()
            plt.close(fig)   # release the figure so figures do not pile up across pages
            print(page)

            for annotation in page.annotations:
                # Extract relevant information from the current annotation
                category_name = annotation._category_name
                score = annotation.score
                height = annotation.bounding_box.height
                width = annotation.bounding_box.width
                area = height * width
                ulx = annotation.bounding_box.ulx
                uly = annotation.bounding_box.uly
                lrx = annotation.bounding_box.lrx
                lry = annotation.bounding_box.lry

                # Single source of truth for the filter: the same flag selects both the
                # text entries and the DataFrame rows, so their lengths always agree.
                # (Extend this condition to also keep e.g. 'text' or 'title' elements.)
                is_selected = bool(
                    category_name == 'list'
                    and area >= MIN_LIST_AREA
                    and score is not None
                    and score >= MIN_LIST_SCORE
                )
                selected.append(is_selected)
                if is_selected:
                    element_text.append(annotation.text)
                    page_numbers.append(page_number)

                # Append extracted information to lists
                category_names.append(category_name)
                scores.append(score)
                heights.append(height)
                widths.append(width)
                areas.append(area)
                ulx_val.append(ulx)
                uly_val.append(uly)
                lrx_val.append(lrx)
                lry_val.append(lry)

            # Remember the size of the page image (taken from its last embedding); the
            # coordinate-rescaling cell reads these two variables.
            for embedding in page.embeddings.values():
                original_image_height = embedding.height
                original_image_width = embedding.width

        data = {
            "category_name": category_names,
            "score": scores,
            "Height": heights,
            "Width": widths,
            "area": areas,
            "Left": ulx_val,
            "Top": uly_val,
            "lrx": lrx_val,
            "lry": lry_val,
        }

        # Adding the information of all the annotations to a dataframe
        df = pd.DataFrame(data)

        # Keep only the selected 'list' elements; `.copy()` makes the new columns below
        # safe to assign (no write to a slice of `df`).
        element_df = df.loc[
            np.asarray(selected, dtype=bool),
            ['category_name', 'score', 'Height', 'Width', 'Left', 'Top', 'lrx', 'lry'],
        ].copy()
        element_df['text'] = element_text
        element_df['Page No'] = page_numbers

In [None]:
# NOTE: `manipulate_coordinates` must already be defined when this cell runs; its
# definition cell appears further down in the notebook, so run that cell first.

# Work on a copy under a new name so `element_df` keeps its 'lrx'/'lry' columns,
# which the masking cell reads later.
element_coords_df = element_df[['Page No', 'Left', 'Top', 'Width', 'Height']].copy()
print(element_coords_df)

# `manipulate_coordinates` returns (left, top, height, width) -- height BEFORE width --
# so the target columns are listed in that same order.
element_coords_df[['New Left', 'New Top', 'New Height', 'New Width']] = element_coords_df.apply(
    lambda row: manipulate_coordinates(row['Left'], row['Top'], row['Width'], row['Height'], original_image_height, original_image_width),
    axis=1,
    result_type='expand'
)

# Select the desired columns
new_df = element_coords_df[['Page No', 'New Left', 'New Top', 'New Width', 'New Height']]

# Display the new DataFrame
new_df


In [None]:
def manipulate_coordinates(left, top, width, height, page_height, page_width,
                           target_width=1023, target_height=825, offset=5, growth=6):
    """Rescale a bounding box from page-image coordinates to a target image size.

    Horizontal values are scaled by ``target_width / page_width`` and vertical values
    by ``target_height / page_height``. After scaling, the top-left corner is moved up
    and left by ``offset`` and both dimensions are enlarged by ``growth``.

    Args:
        left: X coordinate of the box's upper-left corner on the page image.
        top: Y coordinate of the box's upper-left corner on the page image.
        width: Box width on the page image.
        height: Box height on the page image.
        page_height: Height of the page image the box was measured on.
        page_width: Width of the page image the box was measured on.
        target_width: Width of the target image (default 1023, the value previously
            hard-coded here).
        target_height: Height of the target image (default 825, the value previously
            hard-coded here).
        offset: Amount subtracted from the scaled left/top values (default 5).
        growth: Amount added to the scaled height/width values (default 6).

    Returns:
        tuple: ``(new_left, new_top, new_height, new_width)``. Note that height comes
        BEFORE width, unlike the order of the input parameters.

    Raises:
        ZeroDivisionError: If ``page_width`` or ``page_height`` is zero.
    """
    scale_factor_x = target_width / page_width
    scale_factor_y = target_height / page_height

    new_left = (left * scale_factor_x) - offset
    new_top = (top * scale_factor_y) - offset
    new_height = (height * scale_factor_y) + growth
    new_width = (width * scale_factor_x) + growth

    return new_left, new_top, new_height, new_width

In [None]:
# Load the single-page PDF
pdf_file_path = '/content/output/page_2.pdf'
pdf_document = fitz.open(pdf_file_path)

# Select the page (assuming it's the first page)
page = pdf_document[0]

# Convert the page to an image
pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))

# Convert the image to a NumPy array
image_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)

# Load the element data from your DataFrame (element_df)
# Assuming you have a DataFrame with columns: 'Left', 'Top', 'lrx', and 'lry'
# Replace this with your actual DataFrame and column names
# element_df = pd.read_csv('your_element_data.csv')

# Extract the coordinates from the DataFrame
x1_values = element_df['Left'].astype(int).tolist()
y1_values = element_df['Top'].astype(int).tolist()
x2_values = element_df['lrx'].astype(int).tolist()
y2_values = element_df['lry'].astype(int).tolist()

# Create a mask with the same size as the image
mask = np.zeros_like(image_data)

# Apply the masks for each set of coordinates
for x1, y1, x2, y2 in zip(x1_values, y1_values, x2_values, y2_values):
    mask[y1:y2, x1:x2, :] = 255  # Set the region within the coordinates to white

# Apply the final mask to the image
masked_image = cv2.bitwise_and(image_data, mask)

# Save the resulting image as a new image (e.g., JPEG)
cv2.imwrite('masked_image.jpg', masked_image)


In [None]:
!pip install google-cloud-vision

In [None]:
!unzip -q "/content/image-to-livetext-1 (2).zip"

In [None]:
    import os
    import json
    from google.cloud import vision
    from google.oauth2 import service_account

    # Publish the location of the credentials file through the
    # GOOGLE_APPLICATION_CREDENTIALS environment variable, then build the Vision client
    # that `detect_text` uses.
    credentials_path = r'/content/image-to-livetext-1/credentials.json'
    os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': credentials_path})
    client = vision.ImageAnnotatorClient()


    def detect_text(image_path):
        """Run Google Cloud Vision text detection on one image file.

        Args:
            image_path: Path of the image file that is read and sent to the API.

        Returns:
            str: The response's ``full_text_annotation`` serialized as a JSON string.

        Raises:
            RuntimeError: If the API response carries an error message.
        """
        # Reads the image file
        with open(image_path, 'rb') as image_file:
            content = image_file.read()
        image = vision.Image(content=content)

        # Performs text detection on the image file
        response = client.text_detection(image=image)

        # A failed request is reported through `response.error`. Without this check an
        # empty annotation would be returned and then written to disk by the caller,
        # which never retries once the JSON file exists.
        if response.error.message:
            raise RuntimeError(
                f'Text detection failed for {image_path}: {response.error.message}'
            )

        return vision.TextAnnotation.to_json(response.full_text_annotation)

    import json
    import os
    from jinja2 import Environment, FileSystemLoader
    from PIL import Image


    def get_image_size(image_path):
        """Open the image at ``image_path`` and return its ``size`` attribute.

        In Pillow, ``size`` is a ``(width, height)`` tuple in pixels.
        """
        with Image.open(image_path) as opened_image:
            dimensions = opened_image.size
        return dimensions

    def process_images_in_folder(folder_path):
        """Run text detection on every image in a folder, caching results as JSON.

        For each image, the annotation JSON is stored as ``<image stem>.json`` in a
        ``jsons`` folder next to ``folder_path``. When that file already exists it is
        read back instead of calling ``detect_text`` again. An ``htmls`` folder is also
        created next to ``folder_path``; nothing is written to it here.

        Args:
            folder_path: Folder containing the images (``.jpg``, ``.jpeg``, ``.png``;
                the extension check is case-insensitive).

        Returns:
            dict: Maps each processed image file name to its annotation JSON text.
        """
        image_extensions = ('.jpg', '.jpeg', '.png')

        # normpath drops a trailing separator, so dirname yields the real parent folder
        # even for a path like ".../image/".
        parent_folder = os.path.dirname(os.path.normpath(folder_path))
        json_folder = os.path.join(parent_folder, 'jsons')
        html_folder = os.path.join(parent_folder, 'htmls')
        os.makedirs(json_folder, exist_ok=True)
        os.makedirs(html_folder, exist_ok=True)

        annotations = {}

        # Sorted so the processing order does not depend on the file system
        for filename in sorted(os.listdir(folder_path)):
            if not filename.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(folder_path, filename)
            response_file = os.path.splitext(filename)[0] + '.json'
            response_path = os.path.join(json_folder, response_file)

            if os.path.exists(response_path):
                # Reuse the cached result; read with the same encoding it was written in
                with open(response_path, 'r', encoding='utf-8') as fle:
                    text_annotation = fle.read()
            else:
                text_annotation = detect_text(image_path)
                with open(response_path, 'w', encoding='utf-8') as fle:
                    fle.write(text_annotation)

            annotations[filename] = text_annotation

        return annotations

    def main(folder_path='/content/image-to-livetext-1/data/image'):
        """Process every image in ``folder_path`` with ``process_images_in_folder``.

        Args:
            folder_path: Folder containing the images. Defaults to the folder that was
                previously hard-coded here.
        """
        process_images_in_folder(folder_path)

    # Run the image-processing entry point when this code is executed as the main module.
    if __name__ == "__main__":
        main()


In [None]:
    import os
    import json
    import pandas as pd

    # Folder holding the JSON files (the `jsons` folder that sits next to the image folder)
    file_path = "/content/image-to-livetext-1/data/jsons"

    # Function to extract text from JSON element
    def extract_text(element):
        """Return the text held by a JSON element of the annotation hierarchy.

        The first matching key wins, checked in this order:
        ``"text"`` is returned as is; ``"symbols"`` are concatenated; ``"words"`` are
        joined with a space; ``"paragraphs"`` are joined with a newline. An element
        with none of these keys yields an empty string.
        """
        if "text" in element:
            return element["text"]

        if "symbols" in element:
            pieces = (symbol["text"] for symbol in element["symbols"])
            return "".join(pieces)

        if "words" in element:
            return " ".join(map(extract_text, element["words"]))

        if "paragraphs" in element:
            return "\n".join(map(extract_text, element["paragraphs"]))

        return ""

    # Create lists to store the data (one entry per paragraph)
    file_list = []
    text_list = []
    left_list = []
    width_list = []
    height_list = []
    top_list = []

    # `file_path` names the folder of JSON files, and open() cannot read a folder, so
    # collect the individual JSON files first. A path to a single JSON file also works.
    if os.path.isdir(file_path):
        json_paths = [
            os.path.join(file_path, name)
            for name in sorted(os.listdir(file_path))
            if name.lower().endswith('.json')
        ]
    else:
        json_paths = [file_path]

    for json_path in json_paths:
        filename = os.path.basename(json_path)
        with open(json_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)

        # Walk pages -> blocks -> paragraphs and record each paragraph's text and box
        for page in data.get("pages", []):
            for block in page.get("blocks", []):
                for paragraph in block.get("paragraphs", []):
                    paragraph_text = extract_text(paragraph)
                    paragraph_bbox = paragraph["boundingBox"]["vertices"]

                    # Vertex 0 is used as the upper-left corner and vertex 2 as the
                    # lower-right corner. A coordinate key may be absent when its value
                    # is 0, so default to 0.
                    left = paragraph_bbox[0].get('x', 0)
                    top = paragraph_bbox[0].get('y', 0)
                    width = paragraph_bbox[2].get('x', 0) - left
                    height = paragraph_bbox[2].get('y', 0) - top

                    # Append data to the lists
                    file_list.append(filename)
                    text_list.append(paragraph_text)
                    left_list.append(left)
                    top_list.append(top)
                    width_list.append(width)
                    height_list.append(height)

    # Create a DataFrame; "File" tells apart paragraphs coming from different JSON files
    para_df = pd.DataFrame({
        "File": file_list,
        "text": text_list,
        "Left": left_list,
        "Top": top_list,
        "Width": width_list,
        "Height": height_list
    })

