In [39]:
import os
import json
import random
import asyncio
import aiohttp
import logging
import pymupdf
import nest_asyncio
from pathlib import Path

from aiohttp import FormData
from dotenv import load_dotenv

In [40]:
# Configure the root logger to emit INFO and above, then create the
# module-level logger used by FileManager for progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [41]:
# The .env file is looked up one directory above the current working directory.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
dotenv_path = os.path.join(parent_dir, '.env')

# Last expression of the cell: shows whether load_dotenv reported success.
load_dotenv(dotenv_path)

True

In [42]:
# Source PDFs handed to FileManager in the driver cells below.
API_SPEC_PATH = "../pdfs/api_specification.pdf"
GUIDELINE_PATH = "../pdfs/guideline.pdf"

# Output roots: FileManager creates one sub-directory per source file name
# under each of these (split PDFs, and request-ID / result JSON files).
SPLITTED_PDF_PATH = "../pdfs/splitted"
ANALYZED_JSON_PATH = "../analyzed_jsons"

In [43]:
# Endpoint that FileManager POSTs each split PDF to; the JSON response is read for a "requestId".
UPSTAGE_INFERENCE_URL = "https://ocr-demo.upstage.ai/api/layout-analysis/inference"
# Base URL for fetching a result; FileManager appends the request ID and issues a GET.
UPSTAGE_RESULT_BASE_URL = "https://ocr-demo.upstage.ai/api/result/"

In [44]:
# HTTP headers attached to every request FileManager sends to the Upstage endpoints.
# NOTE(review): origin/referer/user-agent presumably mirror what a browser sends
# from the demo web page - confirm they are still required.
upstage_api_headers = {
    "Accept": "*/*",
    "origin": "https://d3tgkvf102zvh7.cloudfront.net",
    "priority": "u=1, i",
    "referer": "https://d3tgkvf102zvh7.cloudfront.net/",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
}

In [45]:
class FileManager:
    """Split a PDF into chunks and run them through the Upstage layout-analysis endpoints.

    Intended call order: ``split_pages`` -> ``excute_analysis_requests`` ->
    ``excute_get_result_requests``.

    Progress is persisted under ``ANALYZED_JSON_PATH/<file_name>/``:

    * ``request_ids.json`` maps each split PDF path to its request ID
      (``None`` when the submission failed).
    * ``json_results.json`` maps each split PDF path to the path of the saved
      result JSON (``None`` when the download failed).

    Both files are reloaded in ``__init__`` so that a later run only repeats
    the work that is still missing.
    """

    def __init__(self, file_path):
        """Record file metadata and load any previously saved progress maps.

        Args:
            file_path: Path of the source PDF.
        """
        self.file_path = file_path
        self.file_name = Path(file_path).stem
        self.file_type = Path(file_path).suffix
        self.num_total_page = 0
        self.splitted_file_paths = []
        self.send_document_analysis_request_id_map = self._init_request_result_map("request_ids.json")
        self.send_get_result_request_map = self._init_request_result_map("json_results.json")

    def split_pages(self, split_size=1):
        """Write the source PDF out as consecutive chunks of ``split_size`` pages.

        Chunks are saved to ``SPLITTED_PDF_PATH/<file_name>/`` and named
        ``<file_name>_<first>_<last>.pdf`` with zero-based, zero-padded page
        indices. ``self.splitted_file_paths`` is replaced (not extended) so
        re-running the method does not accumulate duplicate paths.

        Args:
            split_size: Number of pages per chunk; must be at least 1.

        Returns:
            A list with one pymupdf document object per chunk. Each object has
            already left its ``with`` block (so it is presumably closed); use
            ``self.splitted_file_paths`` to reach the saved files.

        Raises:
            ValueError: If ``split_size`` is smaller than 1.
        """
        if split_size < 1:
            raise ValueError(f"split_size must be >= 1, got {split_size}")

        splitted_files = []
        splitted_file_paths = []
        # The context manager guarantees the source document is released even
        # when inserting or saving a chunk fails.
        with pymupdf.open(self.file_path) as base_file:
            self.num_total_page = base_file.page_count
            output_dir = os.path.join(SPLITTED_PDF_PATH, self.file_name)
            os.makedirs(output_dir, exist_ok=True)

            for start_page in range(0, self.num_total_page, split_size):
                # Inclusive index of the last page in this chunk.
                end_page = min(start_page + split_size, self.num_total_page) - 1
                result_file_name = f"{self.file_name}_{start_page:03d}_{end_page:03d}.pdf"
                result_file_path = os.path.join(output_dir, result_file_name)
                with pymupdf.open() as result_pdf:
                    result_pdf.insert_pdf(base_file, from_page=start_page, to_page=end_page)
                    result_pdf.save(result_file_path)
                    splitted_files.append(result_pdf)
                splitted_file_paths.append(result_file_path)

        self.splitted_file_paths = splitted_file_paths
        return splitted_files

    def _init_request_result_map(self, saved_file_name):
        """Load a saved progress map, or return an empty dict when none exists.

        Args:
            saved_file_name: File name inside ``ANALYZED_JSON_PATH/<file_name>/``.

        Returns:
            The decoded JSON content of the file, or ``{}`` if the file is absent.
        """
        request_id_map = {}
        file_path = os.path.join(ANALYZED_JSON_PATH, self.file_name, saved_file_name)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                request_id_map = json.load(file)
        return request_id_map

    def _pending_analysis_paths(self):
        """Return the split PDF paths that do not have a request ID yet.

        Candidates are the keys of the saved request-ID map followed by
        ``self.splitted_file_paths`` (duplicates removed, order kept). A path
        is pending when it is missing from the map or mapped to ``None``.
        """
        id_map = self.send_document_analysis_request_id_map
        candidates = dict.fromkeys([*id_map, *self.splitted_file_paths])
        return [path for path in candidates if id_map.get(path) is None]

    def _pending_result_map(self):
        """Return ``{split_path: request_id}`` for results that still need fetching.

        Entries without a request ID are skipped (there is nothing to fetch),
        as are entries whose result file path has already been recorded.
        """
        saved_results = self.send_get_result_request_map
        return {
            path: request_id
            for path, request_id in self.send_document_analysis_request_id_map.items()
            if request_id is not None and saved_results.get(path) is None
        }

    async def _send_document_analysis_requests(self, file_paths):
        """Submit every path in ``file_paths`` concurrently and persist the request IDs.

        Failures (including exceptions raised by a task) are recorded as
        ``None`` so that a later run picks them up again.
        """
        file_paths = list(file_paths)
        # NOTE(review): ssl=False disables TLS certificate verification for these calls.
        connector = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = [self._delayed_document_analysis_request(session, fp) for fp in file_paths]

            results = await asyncio.gather(*tasks, return_exceptions=True)
            for idx, result in enumerate(results):
                if isinstance(result, Exception):
                    logger.error(f"Request {idx + 1}: Failed with error {result}")
                    # gather() keeps input order, so idx identifies the failed path.
                    self.send_document_analysis_request_id_map[file_paths[idx]] = None
                else:
                    file_path = result.get("file_path")
                    request_id = result.get("request_id")
                    logger.info(f"Request {idx + 1} File path[{file_path}]: Request ID = {request_id}")
                    self.send_document_analysis_request_id_map[file_path] = request_id
            self.save_request_result()

    async def _delayed_document_analysis_request(self, session, file_path):
        """Wait a random 1-3 seconds, then submit ``file_path`` for analysis."""
        await asyncio.sleep(random.uniform(1, 3))
        return await self._send_document_analysis_request(session, file_path)

    async def _send_document_analysis_request(self, session, file_path):
        """POST one split PDF to the inference endpoint.

        Returns:
            ``{"file_path": file_path, "request_id": <id or None>}``; the ID is
            ``None`` when the token is missing, the request fails, or the
            response status is not 200.
        """
        request_id = None

        token = os.environ.get('UPSTAGE_TOKEN')
        if not token:
            logger.error("[Inference Req Error]: UPSTAGE_TOKEN is not set")
            return {"file_path": file_path, "request_id": request_id}

        # Keep the file open only for the duration of the upload so the handle
        # is always closed afterwards.
        with open(file_path, 'rb') as document:
            form = FormData()
            form.add_field("token", token)
            form.add_field("serviceName", "document-ai")
            form.add_field("type", "drsp")
            form.add_field("url", "receipt-extraction-3.2.0")
            form.add_field("document", document, filename=file_path, content_type='application/pdf')

            try:
                async with session.post(url=UPSTAGE_INFERENCE_URL, headers=upstage_api_headers, data=form) as response:
                    if response.status == 200:
                        json_response = await response.json()
                        request_id = json_response.get('requestId')
                    else:
                        logger.error(f"[Inference Req Failed]: Status {response.status}")
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f"[Inference Req Error]: {e}")

        return {"file_path": file_path, "request_id": request_id}

    async def _send_get_result_requests(self, request_id_map):
        """Fetch the result for every ``{split_path: request_id}`` entry and persist progress.

        Entries whose request ID is ``None`` are skipped. Failures (including
        exceptions raised by a task) are recorded as ``None`` so that a later
        run picks them up again.
        """
        os.makedirs(os.path.join(ANALYZED_JSON_PATH, self.file_name), exist_ok=True)
        request_items = [(file_path, request_id)
                         for file_path, request_id in request_id_map.items()
                         if request_id is not None]
        # NOTE(review): ssl=False disables TLS certificate verification for these calls.
        connector = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = [self._delayed_get_result_request(session, file_path, request_id)
                     for file_path, request_id in request_items]

            results = await asyncio.gather(*tasks, return_exceptions=True)
            for idx, result in enumerate(results):
                if isinstance(result, Exception):
                    logger.error(f"Get Result Request {idx + 1}: Failed with error {result}")
                    # gather() keeps input order, so idx identifies the failed path.
                    self.send_get_result_request_map[request_items[idx][0]] = None
                else:
                    file_path = result.get("file_path")
                    result_file_path = result.get("result")
                    self.send_get_result_request_map[file_path] = result_file_path
                    if result_file_path is None:
                        # The specific cause was already logged by the request itself.
                        continue
                    logger.info(f"Get Result Request {idx + 1} File path[{file_path}]: Successfully saved result.")
            self.save_result_request_result()

    async def _delayed_get_result_request(self, session, file_path, request_id):
        """Wait a random 1-3 seconds, then fetch the result for ``request_id``."""
        await asyncio.sleep(random.uniform(1, 3))
        return await self._send_get_result_request(session, file_path, request_id)

    async def _send_get_result_request(self, session, file_path, request_id):
        """GET one analysis result and save it next to the other result files.

        Returns:
            ``{"file_path": file_path, "result": <saved JSON path or None>}``;
            the result is ``None`` when the request fails or the response
            status is not 200.
        """
        result_file_path = os.path.join(ANALYZED_JSON_PATH, self.file_name, Path(file_path).stem + ".json")
        # Build the URL with plain string handling; os.path.join is meant for
        # filesystem paths, not URLs.
        result_url = f"{UPSTAGE_RESULT_BASE_URL.rstrip('/')}/{request_id}"
        try:
            async with session.get(url=result_url, headers=upstage_api_headers) as response:
                if response.status == 200:
                    result_json = await response.json()
                    # ensure_ascii=False writes non-ASCII text verbatim, so the
                    # encoding must not depend on the platform default.
                    with open(result_file_path, "w", encoding="utf-8") as f:
                        json.dump(result_json, f, ensure_ascii=False)
                    return {"file_path": file_path, "result": result_file_path}
                else:
                    logger.error(f"[Result API Call Error]: Status {response.status}")
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            logger.error(f"[Result Req Error]: {e}")

        return {"file_path": file_path, "result": None}

    def save_request_result(self):
        """Write the split-path -> request-ID map to ``request_ids.json``."""
        try:
            os.makedirs(os.path.join(ANALYZED_JSON_PATH, self.file_name), exist_ok=True)
            with open(os.path.join(ANALYZED_JSON_PATH, self.file_name, "request_ids.json"), 'w', encoding='utf-8') as f:
                json.dump(self.send_document_analysis_request_id_map, f, indent=4)
        except IOError as e:
            logger.error(f"[Save Request ID Results Error]: {e}")

    def save_result_request_result(self):
        """Write the split-path -> result-file map to ``json_results.json``."""
        try:
            os.makedirs(os.path.join(ANALYZED_JSON_PATH, self.file_name), exist_ok=True)
            with open(os.path.join(ANALYZED_JSON_PATH, self.file_name, "json_results.json"), 'w', encoding='utf-8') as f:
                json.dump(self.send_get_result_request_map, f, indent=4)
        except IOError as e:
            logger.error(f"[Save JSON Result Error]: {e}")

    @staticmethod
    def _run_coroutine(coroutine_factory):
        """Run the coroutine produced by ``coroutine_factory`` to completion.

        ``asyncio.run`` is tried first; if it refuses because an event loop is
        already running, a fresh coroutine is run on that loop instead. Any
        other ``RuntimeError`` is re-raised rather than silently dropped.
        """
        try:
            nest_asyncio.apply()
            return asyncio.run(coroutine_factory())
        except RuntimeError as e:
            if "asyncio.run() cannot be called from a running event loop" not in str(e):
                raise
            loop = asyncio.get_event_loop()
            return loop.run_until_complete(coroutine_factory())

    def excute_analysis_requests(self):
        """Submit every split PDF that does not have a request ID yet."""
        target_paths = self._pending_analysis_paths()
        self._run_coroutine(lambda: self._send_document_analysis_requests(target_paths))

    def excute_get_result_requests(self):
        """Fetch and save every analysis result that has not been saved yet."""
        target_map = self._pending_result_map()
        self._run_coroutine(lambda: self._send_get_result_requests(target_map))



In [50]:
# Wrap the guideline PDF and write it out as one PDF per page (split_size=1).
guideline_file = FileManager(GUIDELINE_PATH)
splitted_guideline_pdfs = guideline_file.split_pages(split_size=1)

In [49]:
# Submit guideline pages that still need a request ID; IDs are logged and saved to request_ids.json.
guideline_file.excute_analysis_requests()

INFO:__main__:Request 1 File path[../pdfs/splitted/guideline/guideline_023_023.pdf]: Request ID = ce1018b7-7401-4976-8616-6c6db6f19f50
INFO:__main__:Request 2 File path[../pdfs/splitted/guideline/guideline_031_031.pdf]: Request ID = 96283bf2-8bd0-4e00-9f88-abc97e465774
INFO:__main__:Request 3 File path[../pdfs/splitted/guideline/guideline_041_041.pdf]: Request ID = 66984b51-3f29-4524-a481-c76d07970584
INFO:__main__:Request 4 File path[../pdfs/splitted/guideline/guideline_077_077.pdf]: Request ID = 75c0494d-416e-47e9-bc61-05051ae06e03


In [56]:
# Download the analysis results for the guideline pages and save one JSON file per page.
guideline_file.excute_get_result_requests()

INFO:__main__:Get Result Request 1 File path[../pdfs/splitted/guideline/guideline_000_000.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 2 File path[../pdfs/splitted/guideline/guideline_001_001.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 3 File path[../pdfs/splitted/guideline/guideline_002_002.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 4 File path[../pdfs/splitted/guideline/guideline_003_003.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 5 File path[../pdfs/splitted/guideline/guideline_004_004.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 6 File path[../pdfs/splitted/guideline/guideline_005_005.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 7 File path[../pdfs/splitted/guideline/guideline_006_006.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 8 File path[../pdfs/splitted/guideline/guideline_007_007.pdf]: Successfully saved result.
INFO:__main__:Ge

In [51]:
# Wrap the API specification PDF and write it out as one PDF per page (split_size=1).
api_specification_file = FileManager(API_SPEC_PATH)
splitted_api_specification_pdfs = api_specification_file.split_pages(split_size=1)

In [55]:
# Submit API specification pages that still need a request ID; IDs are logged and saved to request_ids.json.
api_specification_file.excute_analysis_requests()

INFO:__main__:Request 1 File path[../pdfs/splitted/api_specification/api_specification_018_018.pdf]: Request ID = aee2fbad-3bf8-4b21-8233-ef9462151c8f
INFO:__main__:Request 2 File path[../pdfs/splitted/api_specification/api_specification_027_027.pdf]: Request ID = a61b4898-f4d4-4e87-b4da-ba160cad0cbe
INFO:__main__:Request 3 File path[../pdfs/splitted/api_specification/api_specification_033_033.pdf]: Request ID = cc1638ce-10ab-4c8c-876f-5fc0ad38acd6
INFO:__main__:Request 4 File path[../pdfs/splitted/api_specification/api_specification_060_060.pdf]: Request ID = 599b28e8-e322-4ba8-9ecb-9e3b955ea2b4
INFO:__main__:Request 5 File path[../pdfs/splitted/api_specification/api_specification_065_065.pdf]: Request ID = 8bf2ca87-2b54-42cd-9826-9ddffd98ce11
INFO:__main__:Request 6 File path[../pdfs/splitted/api_specification/api_specification_090_090.pdf]: Request ID = 9fed61a7-faf1-4869-ace9-9cfba1ababa8
INFO:__main__:Request 7 File path[../pdfs/splitted/api_specification/api_specification_096_096

In [57]:
# Download the analysis results for the API specification pages and save one JSON file per page.
api_specification_file.excute_get_result_requests()

INFO:__main__:Get Result Request 1 File path[../pdfs/splitted/api_specification/api_specification_000_000.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 2 File path[../pdfs/splitted/api_specification/api_specification_001_001.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 3 File path[../pdfs/splitted/api_specification/api_specification_002_002.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 4 File path[../pdfs/splitted/api_specification/api_specification_003_003.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 5 File path[../pdfs/splitted/api_specification/api_specification_004_004.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 6 File path[../pdfs/splitted/api_specification/api_specification_005_005.pdf]: Successfully saved result.
INFO:__main__:Get Result Request 7 File path[../pdfs/splitted/api_specification/api_specification_006_006.pdf]: Successfully saved result.
INFO:__main__:Get Result Re