In [39]:
from blob_controller import blob_controller
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

True

In [43]:
import os
from typing import Union, IO, Optional
from azure.core.exceptions import ResourceExistsError
from azure.storage.blob import BlobServiceClient, BlobClient, ContentSettings
from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
class blob_controller:
    def __init__(self, conn: str, container: str):
        if not conn:
            raise ValueError("Azure connection string이 비어 있습니다.")
        if not container or container.lower() != container:
            raise ValueError("컨테이너 이름은 소문자여야 합니다. (예: 'meritz-data')")

        self.conn = conn
        self.container = container
        self.blob_service = BlobServiceClient.from_connection_string(conn)
        self.container_client = self.blob_service.get_container_client(container)

        # 컨테이너 없으면 생성, 있으면 무시
        self.ensure_container_exists()

    def ensure_container_exists(self):
        try:
            self.container_client.create_container()
        except ResourceExistsError:
            pass 


    def upload_pdf_to_blob(self, file_path: str) -> str:
        """
        로컬 PDF 파일을 업로드. blob_name을 넘기지 않으면  basename으로 업로드.
        """
        name = os.path.basename(file_path)
        blob_client: BlobClient = self.container_client.get_blob_client(name)
        with open(file_path, "rb") as f:
            blob_client.upload_blob(
                f,
                overwrite=True,
                max_concurrency=1,
                timeout=600,
                content_settings=ContentSettings(content_type="application/pdf"),
            )
        return name
    
    
    def list_files(self):
        """
        특정 prefix 하위의 blob 목록 반환
        """
        blobs = self.container_client.list_blobs()
        file_list = [b.name for b in blobs]
        return file_list
    
    def download_to_temp(self, blob_name: str, tmp_dir: Path) -> Path:
        """
        blob_name(컨테이너 내 경로)을 tmp_dir에 파일로 저장하고 그 경로 반환
        """
        tmp_dir.mkdir(parents=True, exist_ok=True)           
        local_path = tmp_dir / Path(blob_name).name

        bc: BlobClient = self.container_client.get_blob_client(blob_name)  
        try:
            stream = bc.download_blob(max_concurrency=4)    
            with open(local_path, "wb") as f:
                for chunk in stream.chunks():
                    f.write(chunk)
        except ResourceNotFoundError:
            raise FileNotFoundError(f"Blob not found: {blob_name}")

        return local_path

In [44]:
os.getenv("azure-blob-container-name")

'choiblob'

In [45]:
bc = blob_controller(
    conn = os.getenv("azure-blob-connection-string"),
    container = 'pdf'
)

In [48]:
from tempfile import TemporaryDirectory
from pathlib import Path

# choiblob 컨테이너에 hello.txt 가 있다고 가정
with TemporaryDirectory() as td:
    path = bc.download_to_temp("hello.txt", Path(td))
    print("📂 다운로드 경로:", path)

    # 파일 내용 출력
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
    print("📄 파일 내용:\n", content)

📂 다운로드 경로: /var/folders/4v/kg2p6sjs1wz11bbmc45pqxt40000gn/T/tmpcfkbnizm/hello.txt
📄 파일 내용:
 안녕하세요, 업로드 테스트입니다.



In [33]:
blob_client: BlobClient = bc.container_client.get_blob_client(blob='pdf/hello.txt')

# blob_client.download_blob(timeout=600).readall()

In [28]:
blob_client: BlobClient = bc.container_client.get_blob_client("pdf")
stream = blob_client.download_blob(timeout=600)
with open("test.txt", "wb") as f:
    f.write(stream.readall())

ResourceNotFoundError: The specified blob does not exist.
RequestId:1bcdccde-901e-004f-2126-44fa33000000
Time:2025-10-23T14:07:25.3906531Z
ErrorCode:BlobNotFound
Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>BlobNotFound</Code><Message>The specified blob does not exist.
RequestId:1bcdccde-901e-004f-2126-44fa33000000
Time:2025-10-23T14:07:25.3906531Z</Message></Error>

In [25]:
bc.list_files()

['hello.txt']

In [None]:
bc.download_to_temp()

In [12]:
os.path.basename("/Users/jaehyeokchoi/Desktop/table_mag/table_dataset_pipeline/hello.txt")

'hello.txt'

In [49]:
# 업로드
uploaded_path = bc.upload_pdf_to_blob("/Users/jaehyeokchoi/Desktop/table_mag/table_dataset_pipeline/hello.txt")
print("uploaded blob path (in container):", uploaded_path)
# 예: pdf/hello.txt

# 검증: 방금 올린 blob의 속성 읽기
props = bc.container_client.get_blob_client(uploaded_path).get_blob_properties()
print("exists size:", props.size, "last_modified:", props.last_modified)

print("list under prefix:")
for b in bc.container_client.list_blobs(name_starts_with="pdf/"):
    print(" -", b.name)


uploaded blob path (in container): hello.txt
exists size: 47 last_modified: 2025-10-23 14:16:51+00:00
list under prefix:


In [10]:
print("list under prefix:")
for b in bc.container_client.list_blobs(name_starts_with="pdf/"):
    print(" -", b.name)

list under prefix:
 - pdf/hello.txt


In [4]:
blob_client.upload_pdf_to_blob(file_path="/Users/jaehyeokchoi/Desktop/table_mag/table_dataset_pipeline/hello.txt")

pdf/hello.txt


'pdf/hello.txt'