In [38]:
import os
import glob
import json
import pymupdf
from pathlib import Path

In [6]:
# Presumably the directory holding the per-page split PDFs; not referenced by
# the code visible in this part of the notebook -- TODO confirm usage.
SPLITTED_PDF_PATH = "../pdfs/splitted"
# Root directory of the analyzed JSON files. The extract_* helpers glob
# <ANALYZED_JSON_PATH>/<file_basename>/<file_basename>* beneath it.
ANALYZED_JSON_PATH = "../analyzed_jsons"

In [33]:
def extract_page_elements(file_basename, analyzed_json_path=None):
    """Collect the elements of every analyzed page of one document.

    Every file matching ``<root>/<file_basename>/<file_basename>*`` is parsed
    as JSON. The page number of a file is the integer that follows the last
    underscore of its stem (e.g. ``guideline_0012.json`` -> page 12).

    Files are processed in ascending page order, so the running ``id`` given
    to each element is reproducible and follows reading order regardless of
    the order in which the filesystem lists the files.

    Args:
        file_basename: Name of the document; used both as the sub-directory
            name and as the file-name prefix.
        analyzed_json_path: Root directory that contains the per-document
            sub-directories. Defaults to the module-level
            ``ANALYZED_JSON_PATH``.

    Returns:
        dict mapping page number -> list of element dicts, with keys in
        ascending page order. Each element is the parsed JSON object with two
        keys added: ``"id"`` (running counter across the whole document) and
        ``"page"`` (page number). Pages whose file has no elements are
        omitted.

    Raises:
        ValueError: If a matched file's stem does not end in ``_<integer>``.
        KeyError: If a file's JSON has no ``"elements"`` key.
    """
    if analyzed_json_path is None:
        analyzed_json_path = ANALYZED_JSON_PATH

    pattern = os.path.join(analyzed_json_path, file_basename, file_basename + "*")

    # glob returns paths in filesystem-dependent order; sort by page number
    # (path as tie-breaker) so element ids do not depend on directory order.
    paged_paths = sorted(
        (int(Path(fp).stem.split("_")[-1]), fp) for fp in glob.glob(pattern)
    )

    page_elements = dict()
    element_id = 0

    for num_page, fp in paged_paths:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)

        for element in data["elements"]:
            element["id"] = element_id
            element_id += 1
            element["page"] = num_page
            page_elements.setdefault(num_page, []).append(element)

    # Pages were inserted in ascending order, so the dict is already sorted.
    return page_elements

In [36]:
# Load every analyzed element of the "guideline" document, grouped by page number.
guideline_page_elements = extract_page_elements("guideline")

In [25]:
def extract_page_size(file_basename, page_metadata, analyzed_json_path=None):
    """Record the pixel size of every analyzed page of one document.

    Every file matching ``<root>/<file_basename>/<file_basename>*`` is parsed
    as JSON. The page number of a file is the integer that follows the last
    underscore of its stem. For each entry of ``metadata.pages`` in the file,
    ``page_metadata[page_number]`` is set to ``{"size": [width, height]}``
    with both values truncated to ``int``; if a file lists several entries
    the last one wins.

    Args:
        file_basename: Name of the document; used both as the sub-directory
            name and as the file-name prefix.
        page_metadata: dict that is updated IN PLACE with the collected
            sizes. Existing entries for other pages are kept.
        analyzed_json_path: Root directory that contains the per-document
            sub-directories. Defaults to the module-level
            ``ANALYZED_JSON_PATH``.

    Returns:
        A new dict with the content of ``page_metadata`` sorted by key.

    Raises:
        ValueError: If a matched file's stem does not end in ``_<integer>``.
        KeyError: If a file's JSON lacks ``metadata.pages`` or a page entry
            lacks ``width``/``height``.
    """
    if analyzed_json_path is None:
        analyzed_json_path = ANALYZED_JSON_PATH

    pattern = os.path.join(analyzed_json_path, file_basename, file_basename + "*")

    # glob order is filesystem-dependent; visiting files in page order makes
    # the insertion order of the mutated dict deterministic as well.
    paged_paths = sorted(
        (int(Path(fp).stem.split("_")[-1]), fp) for fp in glob.glob(pattern)
    )

    for num_page, fp in paged_paths:
        # Only the read needs the file handle; release it before processing.
        with open(fp, "r", encoding="utf-8") as f:
            page_data = json.load(f)

        for page in page_data["metadata"]["pages"]:
            page_metadata[num_page] = {
                "size": [
                    int(page["width"]),
                    int(page["height"]),
                ],
            }

    # The caller's dict may already hold other pages, so sort the full content.
    return dict(sorted(page_metadata.items()))

In [26]:
# extract_page_size fills this dict in place, so it must exist before the call.
page_metadata = dict()
file_basename = "guideline"

# The call also returns a copy sorted by page number; as the last expression
# of the cell that copy is what gets displayed below.
extract_page_size(file_basename, page_metadata)

{0: {'size': [1157, 1571]},
 1: {'size': [1157, 1571]},
 2: {'size': [1157, 1571]},
 3: {'size': [1157, 1571]},
 4: {'size': [1157, 1571]},
 5: {'size': [1157, 1571]},
 6: {'size': [1157, 1571]},
 7: {'size': [1157, 1571]},
 8: {'size': [1157, 1571]},
 9: {'size': [1157, 1571]},
 10: {'size': [1157, 1571]},
 11: {'size': [1157, 1571]},
 12: {'size': [1157, 1571]},
 13: {'size': [1157, 1571]},
 14: {'size': [1157, 1571]},
 15: {'size': [1157, 1571]},
 16: {'size': [1157, 1571]},
 17: {'size': [1157, 1571]},
 18: {'size': [1157, 1571]},
 19: {'size': [1157, 1571]},
 20: {'size': [1157, 1571]},
 21: {'size': [1157, 1571]},
 22: {'size': [1157, 1571]},
 23: {'size': [1157, 1571]},
 24: {'size': [1157, 1571]},
 25: {'size': [1157, 1571]},
 26: {'size': [1157, 1571]},
 27: {'size': [1157, 1571]},
 28: {'size': [1157, 1571]},
 29: {'size': [1157, 1571]},
 30: {'size': [1157, 1571]},
 31: {'size': [1157, 1571]},
 32: {'size': [1157, 1571]},
 33: {'size': [1157, 1571]},
 34: {'size': [1157, 157

In [35]:
def extract_tag_elements_per_page(page_elements):
    """Bucket each page's elements by category.

    Args:
        page_elements: Mapping of page key -> iterable of element dicts, each
            carrying a ``"category"`` entry.

    Returns:
        dict with the same keys as ``page_elements``. Each value is a dict
        holding ``"figure_elements"`` (category ``"figure"``),
        ``"table_elements"`` (category ``"table"``), ``"text_elements"``
        (every other category) and ``"elements"`` (the page's original
        element collection, not a copy).
    """
    grouped = {}

    for page_key, items in page_elements.items():
        buckets = {
            "figure_elements": [],
            "table_elements": [],
            "text_elements": [],
        }

        for item in items:
            kind = item["category"]
            # Anything that is neither a figure nor a table counts as text.
            bucket_key = (
                "figure_elements" if kind == "figure"
                else "table_elements" if kind == "table"
                else "text_elements"
            )
            buckets[bucket_key].append(item)

        # Keep a reference to the unsplit page alongside the buckets.
        buckets["elements"] = items
        grouped[page_key] = buckets

    return grouped

In [39]:
# Split each page's elements into figure / table / text buckets.
guideline_tag_elements = extract_tag_elements_per_page(guideline_page_elements)

In [42]:
# Inspect the first figure element found on page 56.
guideline_tag_elements[56]["figure_elements"][0]

{'bounding_box': [{'x': 138, 'y': 731},
  {'x': 1020, 'y': 731},
  {'x': 1020, 'y': 1219},
  {'x': 138, 'y': 1219}],
 'category': 'figure',
 'html': '<br><figure><img id=\'5\' style=\'font-size:16px\' alt="고객 마이데이터사업자 정보제공자\n개인신용정보 전송 요구\n② 전송 요구사항 전달\n본인인증\n접근토큰 발급 요청\n접근토큰 발급" data-coord="top-left:(138,731); bottom-right:(1020,1219)" /></figure>',
 'id': 202,
 'page': 56,
 'text': '고객 마이데이터사업자 정보제공자\n개인신용정보 전송 요구\n② 전송 요구사항 전달\n본인인증\n접근토큰 발급 요청\n접근토큰 발급'}

In [None]:
class ImageCropper:
    """Static helpers to render a PDF page as an image and crop regions from it."""

    @staticmethod
    def pdf_to_image(pdf_file, page_num, dpi=300):
        """Render a single PDF page to a PIL image.

        Args:
            pdf_file: Path of the PDF, passed to ``pymupdf.open``.
            page_num: Index of the page, as accepted by ``doc[page_num]``.
            dpi: Resolution passed to ``get_pixmap``.

        Returns:
            A PIL image built from the rendered pixmap's samples.
        """
        # The notebook's import cell does not bring in Pillow, so the original
        # reference to ``Image`` failed with NameError on a fresh kernel.
        from PIL import Image

        with pymupdf.open(pdf_file) as doc:
            pixmap = doc[page_num].get_pixmap(dpi=dpi)
            # NOTE(review): "RGB" assumes the pixmap has three channels and no
            # alpha, which is what get_pixmap is called with here (defaults).
            page_img = Image.frombytes(
                "RGB", (pixmap.width, pixmap.height), pixmap.samples
            )
        return page_img

    @staticmethod
    def normalize_coordinates(coordinates, output_page_size):
        """Convert a polygon into a bounding box relative to the page size.

        Args:
            coordinates: Iterable of dicts with ``"x"`` and ``"y"`` entries.
            output_page_size: ``[width, height]`` the coordinates refer to.

        Returns:
            Tuple ``(x1, y1, x2, y2)``: the min/max corners of the polygon,
            with x divided by the width and y divided by the height.
        """
        x_values = [coord["x"] for coord in coordinates]
        y_values = [coord["y"] for coord in coordinates]
        x1, y1, x2, y2 = min(x_values), min(y_values), max(x_values), max(y_values)

        return (
            x1 / output_page_size[0],
            y1 / output_page_size[1],
            x2 / output_page_size[0],
            y2 / output_page_size[1],
        )

    @staticmethod
    def crop_image(img, coordinates, output_file):
        """Crop a relative bounding box out of an image and save the result.

        Args:
            img: Image object exposing ``size``, ``crop`` and ``save``.
            coordinates: ``(x1, y1, x2, y2)`` as fractions of the image width
                (x) and height (y).
            output_file: Destination passed to the cropped image's ``save``.
        """
        img_width, img_height = img.size
        # Scale x by the width and y by the height; int() truncates to pixels.
        x1, y1, x2, y2 = [
            int(coord * dim)
            for coord, dim in zip(coordinates, [img_width, img_height] * 2)
        ]
        cropped_img = img.crop((x1, y1, x2, y2))
        cropped_img.save(output_file)