In [1]:
import os
import re
import sys
import glob
import json
import pymupdf
from pathlib import Path

In [5]:
# Source PDF locations (relative paths, resolved against the notebook's working directory).
API_SPEC_PATH = "../pdfs/api_specification.pdf"
GUIDELINE_PATH = "../pdfs/guideline.pdf"

# Working directories. ANALYZED_JSON_PATH is the root that the extract_* helpers
# glob for per-page JSON files (ANALYZED_JSON_PATH/<basename>/<basename>*).
# NOTE(review): SPLITTED_PDF_PATH and CROPPED_RESOURCES_PATH are not referenced in
# the visible cells -- presumably split-PDF pages and cropped figure/table output; confirm.
SPLITTED_PDF_PATH = "../pdfs/splitted"
ANALYZED_JSON_PATH = "../analyzed_jsons"
CROPPED_RESOURCES_PATH = "../cropped_pdf_resources"

In [3]:
# Add the parent directory to the import search path; the `src.*` import in the
# following cell presumably relies on this (the `src` package is not visible here).
sys.path.append(os.path.abspath('../'))

In [4]:
from src.ingestion.states import FileState

In [46]:
def natural_sort_key(s):
    """Build a sort key that orders embedded numbers numerically.

    The string is split on runs of digits (the runs themselves are kept).
    Digit chunks become ints and every other chunk is lower-cased, so that
    e.g. "p_2" sorts before "p_10" and letter case is ignored.

    Args:
        s: The string to derive a key from (typically a file path).

    Returns:
        A list alternating lower-cased text chunks and int chunks.
    """
    key = []
    for chunk in re.split(r'(\d+)', s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

In [47]:
def extract_page_elements(file_basename):
    """Load a document's analysis JSON files and group their elements by page.

    Files are found with the glob pattern
    ``ANALYZED_JSON_PATH/<file_basename>/<file_basename>*`` and processed in
    natural (number-aware) filename order. The page number of a file is the
    integer after the last underscore of its stem (``guideline_12.json`` -> 12).

    Every element dict is annotated in place with two extra keys:
      * ``"id"``   -- a running counter over all elements of all files, starting at 0
      * ``"page"`` -- the page number parsed from the file name

    Args:
        file_basename: Document name; used both as the sub-directory and as the
            file-name prefix under ``ANALYZED_JSON_PATH``.

    Returns:
        dict mapping page number -> list of element dicts, ordered by page
        number. A file whose ``"elements"`` list is empty contributes no key.

    Raises:
        KeyError: if a JSON file has no ``"elements"`` key.
        ValueError: if a file that contains elements has a stem that does not
            end in ``_<integer>``.
    """
    pattern = os.path.join(ANALYZED_JSON_PATH, file_basename, file_basename + "*")
    sorted_file_paths = sorted(glob.glob(pattern), key=natural_sort_key)

    page_elements = dict()
    element_id = 0

    for fp in sorted_file_paths:
        # Read as UTF-8 explicitly: the element text is Korean, and the locale
        # default encoding is not UTF-8 on every platform.
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)

        elements = data["elements"]
        if not elements:
            # Skip before parsing the page number so element-less files are
            # ignored regardless of how they are named.
            continue

        # The page number depends only on the file, so parse it once per file.
        num_page = int(Path(fp).stem.split("_")[-1])

        for element in elements:
            element["id"] = element_id
            element_id += 1
            element["page"] = num_page
            page_elements.setdefault(num_page, []).append(element)

    return dict(sorted(page_elements.items()))

In [48]:
# Elements of the "guideline" document grouped by page number (page -> list of element dicts).
guideline_page_elements = extract_page_elements("guideline")

In [50]:
def extract_page_size(file_basename, page_metadata=None):
    """Collect the page size recorded in each analysis JSON file of a document.

    Files are found with the glob pattern
    ``ANALYZED_JSON_PATH/<file_basename>/<file_basename>*``. The page number of
    a file is the integer after the last underscore of its stem.

    Args:
        file_basename: Document name; used both as the sub-directory and as the
            file-name prefix under ``ANALYZED_JSON_PATH``.
        page_metadata: Optional dict to accumulate into. It is updated in place
            (page number -> ``{"size": [width, height]}``), so entries passed in
            are kept unless overwritten. A fresh dict is used when omitted.

    Returns:
        A new dict with the contents of ``page_metadata`` ordered by page
        number. The in-place updated ``page_metadata`` itself is not reordered.

    Raises:
        KeyError: if a JSON file lacks ``metadata.pages`` or a page entry lacks
            ``width`` / ``height``.
        ValueError: if a file stem does not end in ``_<integer>``.
    """
    if page_metadata is None:
        page_metadata = dict()

    file_paths = glob.glob(os.path.join(ANALYZED_JSON_PATH, file_basename, file_basename + "*"))

    for fp in file_paths:
        # Explicit UTF-8 so the read does not depend on the platform's locale encoding.
        with open(fp, "r", encoding="utf-8") as f:
            page_data = json.load(f)

        # The file is fully parsed at this point; no need to keep it open.
        num_page = int(Path(fp).stem.split("_")[-1])

        # If a file lists several pages, the last entry wins because they all
        # share the page number derived from the file name.
        for element in page_data["metadata"]["pages"]:
            page_metadata[num_page] = {
                "size": [
                    int(element["width"]),
                    int(element["height"]),
                ],
            }

    return dict(sorted(page_metadata.items()))

In [51]:
# extract_page_size fills `page_metadata` in place and returns a copy ordered by
# page number; that returned copy is what this cell displays as its output.
page_metadata = dict()
file_basename = "guideline"

extract_page_size(file_basename, page_metadata)

{0: {'size': [1157, 1571]},
 1: {'size': [1157, 1571]},
 2: {'size': [1157, 1571]},
 3: {'size': [1157, 1571]},
 4: {'size': [1157, 1571]},
 5: {'size': [1157, 1571]},
 6: {'size': [1157, 1571]},
 7: {'size': [1157, 1571]},
 8: {'size': [1157, 1571]},
 9: {'size': [1157, 1571]},
 10: {'size': [1157, 1571]},
 11: {'size': [1157, 1571]},
 12: {'size': [1157, 1571]},
 13: {'size': [1157, 1571]},
 14: {'size': [1157, 1571]},
 15: {'size': [1157, 1571]},
 16: {'size': [1157, 1571]},
 17: {'size': [1157, 1571]},
 18: {'size': [1157, 1571]},
 19: {'size': [1157, 1571]},
 20: {'size': [1157, 1571]},
 21: {'size': [1157, 1571]},
 22: {'size': [1157, 1571]},
 23: {'size': [1157, 1571]},
 24: {'size': [1157, 1571]},
 25: {'size': [1157, 1571]},
 26: {'size': [1157, 1571]},
 27: {'size': [1157, 1571]},
 28: {'size': [1157, 1571]},
 29: {'size': [1157, 1571]},
 30: {'size': [1157, 1571]},
 31: {'size': [1157, 1571]},
 32: {'size': [1157, 1571]},
 33: {'size': [1157, 1571]},
 34: {'size': [1157, 157

In [52]:
def extract_tag_elements_per_page(page_elements):
    """Split each page's elements into figure, table and text buckets.

    Args:
        page_elements: dict mapping a page key to a list of element dicts; each
            element must have a ``"category"`` key.

    Returns:
        dict with the same keys (same order). Each value is a dict with:
          * ``"figure_elements"`` -- elements whose category is ``"figure"``
          * ``"table_elements"``  -- elements whose category is ``"table"``
          * ``"text_elements"``   -- every other element
          * ``"elements"``        -- the original, unsplit list (same object)
    """
    grouped = {}

    for page_key, elements in page_elements.items():
        figures, tables, texts = [], [], []

        for item in elements:
            category = item["category"]
            # Anything that is neither a figure nor a table is treated as text.
            bucket = (
                figures if category == "figure"
                else tables if category == "table"
                else texts
            )
            bucket.append(item)

        grouped[page_key] = dict(
            figure_elements=figures,
            table_elements=tables,
            text_elements=texts,
            elements=elements,
        )

    return grouped

In [63]:
# Per-page figure / table / text buckets for the guideline document.
guideline_tag_elements = extract_tag_elements_per_page(guideline_page_elements)

In [64]:
# Spot-check: show the first figure element found on page 70.
guideline_tag_elements[70]["figure_elements"][0]

{'bounding_box': [{'x': 145, 'y': 1048},
  {'x': 1007, 'y': 1048},
  {'x': 1007, 'y': 1390},
  {'x': 145, 'y': 1390}],
 'category': 'figure',
 'html': '<br><figure><img id=\'10\' style=\'font-size:14px\' alt="② 개별인증 및\n전송요구 완료\n개별인증\n요청 정보제공자 A\nID/PWD\n3 개별인증 순차\n요청 SMS 개별인증 및\n전송요구 완료\n5 개별인증 정보제공자 B 수행\n고객 마이데이터 요청\n서비스\n⑥ 개별인증 및\n전송요구 완료\n정보제공자 C" data-coord="top-left:(145,1048); bottom-right:(1007,1390)" /></figure>',
 'id': 622,
 'page': 70,
 'text': '② 개별인증 및\n전송요구 완료\n개별인증\n요청 정보제공자 A\nID/PWD\n3 개별인증 순차\n요청 SMS 개별인증 및\n전송요구 완료\n5 개별인증 정보제공자 B 수행\n고객 마이데이터 요청\n서비스\n⑥ 개별인증 및\n전송요구 완료\n정보제공자 C'}

In [65]:
class ImageCropper:
    """Stateless helpers for rendering a PDF page to an image and cropping a region from it."""

    @staticmethod
    def pdf_to_image(pdf_file, page_num, dpi=300):
        """Render one page of a PDF and return it as an "RGB" image.

        NOTE(review): ``Image`` is not imported in the visible notebook cells;
        presumably ``PIL.Image`` -- confirm it is in scope before calling this.

        Args:
            pdf_file: Path of the PDF to open with ``pymupdf``.
            page_num: Index of the page to render.
            dpi: Resolution passed to ``get_pixmap``.

        Returns:
            The image built from the rendered pixmap's samples.
        """
        with pymupdf.open(pdf_file) as document:
            pixmap = document[page_num].get_pixmap(dpi=dpi)
            dimensions = [pixmap.width, pixmap.height]
            return Image.frombytes("RGB", dimensions, pixmap.samples)

    @staticmethod
    def normalize_coordinates(coordinates, output_page_size):
        """Convert a bounding box given as corner points into page-relative fractions.

        Args:
            coordinates: Iterable of dicts with ``"x"`` and ``"y"`` keys (the
                corner points of a box).
            output_page_size: Sequence whose first two items are the page
                width and height the coordinates refer to.

        Returns:
            Tuple ``(x1, y1, x2, y2)``: the min/max x and y of the points,
            divided by the page width and height respectively.
        """
        xs = [point["x"] for point in coordinates]
        ys = [point["y"] for point in coordinates]

        left, top = min(xs), min(ys)
        right, bottom = max(xs), max(ys)

        page_width = output_page_size[0]
        page_height = output_page_size[1]

        return (
            left / page_width,
            top / page_height,
            right / page_width,
            bottom / page_height,
        )

    @staticmethod
    def crop_image(img, coordinates, output_file):
        """Crop a fractional box out of ``img`` and save the result.

        Args:
            img: Image object exposing ``size``, ``crop`` and ``save``.
            coordinates: Four fractions ``(x1, y1, x2, y2)``; x values are
                scaled by the image width, y values by the image height, and
                each product is truncated with ``int``.
            output_file: Destination passed to the cropped image's ``save``.
        """
        width, height = img.size
        extents = (width, height, width, height)

        left, top, right, bottom = (
            int(fraction * extent)
            for fraction, extent in zip(coordinates, extents)
        )

        region = img.crop((left, top, right, bottom))
        region.save(output_file)

In [49]:
def crop_image(file_basename):
    """Unfinished placeholder: only creates an empty local dict and returns None.

    TODO: implement the cropping logic; ``file_basename`` is currently unused.
    """
    cropped_images = {}
    

{0: [{'bounding_box': [{'x': 74, 'y': 84},
    {'x': 1072, 'y': 84},
    {'x': 1072, 'y': 1484},
    {'x': 74, 'y': 1484}],
   'category': 'figure',
   'html': '<figure><img id=\'0\' style=\'font-size:14px\' alt="금융분야\n마이데이터 기술\n가이드라인\n2022.10.\n금융위원회 금융보안원\nFINANCIAL SECURITY INSTITUTE" data-coord="top-left:(74,84); bottom-right:(1072,1484)" /></figure>',
   'id': 0,
   'page': 0,
   'text': '금융분야\n마이데이터 기술\n가이드라인\n2022.10.\n금융위원회 금융보안원\nFINANCIAL SECURITY INSTITUTE'}],
 1: [{'bounding_box': [{'x': 291, 'y': 108},
    {'x': 864, 'y': 108},
    {'x': 864, 'y': 197},
    {'x': 291, 'y': 197}],
   'category': 'heading1',
   'html': "<h1 id='0' style='font-size:18px'>『금융분야 마이데이터 기술 가이드라인』<br>이용 안내</h1>",
   'id': 1,
   'page': 1,
   'text': '『금융분야 마이데이터 기술 가이드라인』\n이용 안내'},
  {'bounding_box': [{'x': 160, 'y': 369},
    {'x': 997, 'y': 369},
    {'x': 997, 'y': 492},
    {'x': 160, 'y': 492}],
   'category': 'paragraph',
   'html': "<p id='1' data-category='paragraph' style='font-size:14px'