In [1]:
from blob_controller import blob_controller
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import io
import mimetypes
import tempfile
from pathlib import Path

from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError

from azure.storage.blob import (
    BlobServiceClient,
    ContainerClient,
    BlobClient,
    generate_blob_sas,
    BlobSasPermissions,
    ContentSettings,
)
import fitz

class blob_controller:
    def __init__(self, conn: str, container: str):
        if not conn:
            raise ValueError("Azure connection string이 비어 있습니다.")
        if not container or container.lower() != container:
            raise ValueError("컨테이너 이름은 소문자여야 합니다. (예: 'meritz-data')")

        self.conn = conn
        self.container = container
        self.blob_service = BlobServiceClient.from_connection_string(conn)
        self.container_client = self.blob_service.get_container_client(container)

        # 컨테이너 없으면 생성, 있으면 무시
        self.ensure_container_exists()

    def ensure_container_exists(self):
        try:
            self.container_client.create_container()
        except ResourceExistsError:
            pass 


    def upload_pdf_to_blob(self, file_path: str) -> str:
        """
        로컬 PDF 파일을 업로드. blob_name을 넘기지 않으면  basename으로 업로드.
        """
        name = os.path.basename(file_path)
        blob_client: BlobClient = self.container_client.get_blob_client(name)
        with open(file_path, "rb") as f:
            blob_client.upload_blob(
                f,
                overwrite=True,
                max_concurrency=3,
                timeout=600,
                content_settings=ContentSettings(content_type="application/pdf"),
            )
        return name
    
    
    def list_files(self):
        """
        특정 prefix 하위의 blob 목록 반환
        """
        blobs = self.container_client.list_blobs()
        file_list = [b.name for b in blobs]
        return file_list
    
    
    def download_to_temp(self, blob_name: str, tmp_dir: Path | str) -> Path:
        """
        blob_name(컨테이너 내 경로)을 tmp_dir에 파일로 저장하고 그 경로(Path)를 반환.
        tmp_dir은 str 또는 Path 모두 허용.
        """
        # str 이 들어와도 Path 로 정규화
        tmp_dir = Path(tmp_dir)
        tmp_dir.mkdir(parents=True, exist_ok=True)

        local_path = tmp_dir / Path(blob_name).name

        bc: BlobClient = self.container_client.get_blob_client(blob_name)
        try:
            stream = bc.download_blob(max_concurrency=4)
            with open(local_path, "wb") as f:
                for chunk in stream.chunks():
                    if chunk:
                        f.write(chunk)
        except ResourceNotFoundError:
            raise FileNotFoundError(f"Blob not found: {blob_name}")

        return local_path
    
    
    
    def open_pdf_from_blob_stream(self, blob_name: str):
        """
        디스크 임시파일 없이 Blob을 스트리밍으로 읽어 메모리/스풀 버퍼에 적재한 뒤,
        fitz.open(stream=..., filetype="pdf")로 바로 연다.
        - 작은/중간 사이즈 PDF: 메모리에서 처리
        - 큰 PDF: SpooledTemporaryFile 이 자동으로 디스크로 스필오버
        """
        spooled = tempfile.SpooledTemporaryFile(max_size=64 * 1024 * 1024)  # 64MB까지 메모리, 초과 시 디스크 스필오버
        try:
            blob_client = self.container_client.get_blob_client(blob_name)
            stream = blob_client.download_blob(max_concurrency=4)
            for chunk in stream.chunks():
                if chunk:
                    spooled.write(chunk)
            spooled.seek(0)
            data = spooled.read()
            return fitz.open(stream=data, filetype="pdf")
        finally:
            try:
                spooled.close()
            except Exception:
                pass

In [3]:
os.getenv("azure-blob-container-name")

'choiblob'

In [32]:
data_bc = blob_controller(
    conn = os.getenv("azure-blob-connection-string"),
    container = 'pdf'
)

img_bc = blob_controller(conn= os.getenv("azure-blob-connection-string"), container='image')            # PNG
md_bc = blob_controller(conn= os.getenv("azure-blob-connection-string"), container='markdown')          # MD
hist_bc = blob_controller(conn= os.getenv("azure-blob-connection-string"), container='history')     
        

In [34]:
from pathlib import Path
import csv
import io
import os
import time
from datetime import datetime
from tempfile import TemporaryDirectory
import tempfile

In [35]:
blob_name = data_bc.list_files()[0]
download_dir = "./"

In [36]:
blobs = data_bc.list_files()
blob_name = blobs[0]


doc = data_bc.open_pdf_from_blob_stream(blob_name)
base = os.path.splitext(os.path.basename(blob_name))[0]




In [37]:
# for i in range(len(doc)):
i=7
page = doc.load_page(i)


table_finder = page.find_tables()
tables = table_finder.tables if hasattr(table_finder, "tables") else []

print(tables)

[<pymupdf.table.Table object at 0x1073e79e0>]


In [38]:
page_num = i
import logging

for idx, t in enumerate(tables, start=1):
    table_rect = fitz.Rect(t.bbox)

    # 파일명 (표 단위)
    png_name = f"{base}_p{page_num + 1}_t{idx}.png"
    md_name  = f"{base}_p{page_num + 1}_t{idx}.md"
    png_path = download_dir / png_name
    md_path  = download_dir / md_name

    # PNG 생성 (표 bbox만 clip)
    try:
        pix = page.get_pixmap(dpi=300, clip=table_rect)
        pix.save(str(png_path))
        png_ok = True
    except Exception as e:
        logging.warning(f"PNG 생성 실패({blob_name} p{page_num+1} t{idx}): {e}")
        png_ok = False

    # MD 생성 (해당 표만)
    try:
        md_text = t.to_markdown()
        md_path.write_text(md_text, encoding="utf-8")
        md_ok = True
    except Exception as e:
        logging.warning(f"MD 생성 실패({blob_name} p{page_num+1} t{idx}): {e}")
        md_ok = False

    # 둘 다 성공시에만 업로드 + history 반영
    if png_ok and md_ok:
        # PNG 업로드 (image 컨테이너)
        png_bc = img_bc.container_client.get_blob_client(png_name)
        with open(png_path, "rb") as f:
            png_bc.upload_blob(
                f,
                overwrite=True,
                max_concurrency=3,
                timeout=600,
                content_settings=ContentSettings(content_type="image/png"),
            )

        # MD 업로드 (markdown 컨테이너)
        md_blob = md_bc.container_client.get_blob_client(md_name)
        with open(md_path, "rb") as f:
            md_blob.upload_blob(
                f,
                overwrite=True,
                max_concurrency=3,
                timeout=600,
                content_settings=ContentSettings(content_type="text/markdown"),
            )

        # history.csv 업데이트 (없으면 생성)
        history_blob = "history.csv"
        hist_client = hist_bc.container_client.get_blob_client(history_blob)

        header = ["original_pdf", "png_name", "md_name", "page", "category", "table_index"]
        rows = []
        try:
            existing = hist_client.download_blob(max_concurrency=2).readall().decode("utf-8")
            reader = csv.reader(existing.splitlines())
            rows.extend(list(reader))
            if not rows or rows[0] != header:
                rows.insert(0, header)  # 헤더 보정
        except ResourceNotFoundError:
            rows.append(header)

        rows.append([blob_name, png_name, md_name, str(page_num + 1), "meritz", str(idx)])

        # CSV 재생성 후 업로드(덮어쓰기)
        byte_buf = io.BytesIO()
        text_wr = io.TextIOWrapper(byte_buf, encoding="utf-8", newline="")
        writer = csv.writer(text_wr, quoting=csv.QUOTE_ALL)
        for r in rows:
            writer.writerow(r)
        text_wr.flush()
        byte_buf.seek(0)

        hist_client.upload_blob(
            byte_buf.getvalue(),
            overwrite=True,
            max_concurrency=2,
            timeout=600,
            content_settings=ContentSettings(content_type="text/csv"),
        )
        text_wr.close()
        byte_buf.close()

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [25]:
pix = page.get_pixmap(dpi=300, clip=union_rect)
pix.save(str(png_path))
png_ok = True

In [27]:
md_parts = []
for idx, t in enumerate(tables, start=1):
    try:
        md_parts.append(t.to_markdown())
    except Exception as e:
        md_parts.append(f"> Error converting table {idx}: {e}")
md_text = "\n\n---\n\n".join(md_parts)
md_path.write_text(md_text, encoding="utf-8")
md_ok = True

AttributeError: 'str' object has no attribute 'write_text'

In [12]:
base

'메리츠다이렉트(인터넷)업무용자동차보험약관'

In [9]:
len(doc)

170

In [6]:
blobs

['메리츠다이렉트(인터넷)업무용자동차보험약관.pdf']

In [10]:
local_pdf_path = data_bc.download_to_temp(blob_name, download_dir)

AttributeError: 'str' object has no attribute 'mkdir'

In [48]:
from tempfile import TemporaryDirectory
from pathlib import Path

# choiblob 컨테이너에 hello.txt 가 있다고 가정
with TemporaryDirectory() as td:
    path = bc.download_to_temp("hello.txt", Path(td))
    print("📂 다운로드 경로:", path)

    # 파일 내용 출력
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
    print("📄 파일 내용:\n", content)

📂 다운로드 경로: /var/folders/4v/kg2p6sjs1wz11bbmc45pqxt40000gn/T/tmpcfkbnizm/hello.txt
📄 파일 내용:
 안녕하세요, 업로드 테스트입니다.



In [33]:
blob_client: BlobClient = bc.container_client.get_blob_client(blob='pdf/hello.txt')

# blob_client.download_blob(timeout=600).readall()

In [28]:
blob_client: BlobClient = bc.container_client.get_blob_client("pdf")
stream = blob_client.download_blob(timeout=600)
with open("test.txt", "wb") as f:
    f.write(stream.readall())

ResourceNotFoundError: The specified blob does not exist.
RequestId:1bcdccde-901e-004f-2126-44fa33000000
Time:2025-10-23T14:07:25.3906531Z
ErrorCode:BlobNotFound
Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>BlobNotFound</Code><Message>The specified blob does not exist.
RequestId:1bcdccde-901e-004f-2126-44fa33000000
Time:2025-10-23T14:07:25.3906531Z</Message></Error>

In [25]:
bc.list_files()

['hello.txt']

In [None]:
bc.download_to_temp()

In [12]:
os.path.basename("/Users/jaehyeokchoi/Desktop/table_mag/table_dataset_pipeline/hello.txt")

'hello.txt'

In [49]:
# 업로드
uploaded_path = bc.upload_pdf_to_blob("/Users/jaehyeokchoi/Desktop/table_mag/table_dataset_pipeline/hello.txt")
print("uploaded blob path (in container):", uploaded_path)
# 예: pdf/hello.txt

# 검증: 방금 올린 blob의 속성 읽기
props = bc.container_client.get_blob_client(uploaded_path).get_blob_properties()
print("exists size:", props.size, "last_modified:", props.last_modified)

print("list under prefix:")
for b in bc.container_client.list_blobs(name_starts_with="pdf/"):
    print(" -", b.name)


uploaded blob path (in container): hello.txt
exists size: 47 last_modified: 2025-10-23 14:16:51+00:00
list under prefix:


In [10]:
print("list under prefix:")
for b in bc.container_client.list_blobs(name_starts_with="pdf/"):
    print(" -", b.name)

list under prefix:
 - pdf/hello.txt


In [4]:
blob_client.upload_pdf_to_blob(file_path="/Users/jaehyeokchoi/Desktop/table_mag/table_dataset_pipeline/hello.txt")

pdf/hello.txt


'pdf/hello.txt'