In [12]:
import os
import boto3
from loguru import logger
from botocore.client import Config
from botocore.client import ClientError


class MinioFileStorage:
    """Small convenience wrapper around a boto3 S3 client that talks to MinIO.

    Connection settings are read from the environment when the client is first
    needed:

    * ``MINIO_ENDPOINT_URL`` (defaults to ``http://0.0.0.0:9000``)
    * ``MINIO_ACCESS_KEY_ID``
    * ``MINIO_SECRET_ACCESS_KEY``

    Credentials are deliberately NOT given in-code defaults: secrets must not
    live in a notebook. When the two key variables are unset, ``None`` is
    passed to boto3, which then falls back to its own credential lookup.
    """

    # Error codes that mean "the bucket/key is simply not there".
    _NOT_FOUND_CODES = frozenset({"404", "NoSuchBucket", "NoSuchKey"})

    def __init__(self):
        # Client is built lazily by the `minio_client` property.
        self._minio_client = None
        # Buckets already verified/created, to avoid repeated HEAD requests.
        self._created_buckets = set()

    @property
    def minio_client(self) -> "botocore.client.BaseClient":
        """Return the shared S3 client, creating it on first access."""
        if self._minio_client is None:
            self._minio_client = boto3.client(
                "s3",
                endpoint_url=os.getenv("MINIO_ENDPOINT_URL", "http://0.0.0.0:9000"),
                aws_access_key_id=os.getenv("MINIO_ACCESS_KEY_ID"),
                aws_secret_access_key=os.getenv("MINIO_SECRET_ACCESS_KEY"),
                config=Config(signature_version="s3v4"),
            )
        return self._minio_client

    @staticmethod
    def _error_code(error) -> str:
        """Extract the service error code from a ClientError-like exception."""
        response = getattr(error, "response", None) or {}
        return str(response.get("Error", {}).get("Code", ""))

    def _create_bucket(self, bucket_name: str):
        """Ensure ``bucket_name`` exists, creating it when it is missing.

        The result is cached so each bucket is checked at most once per
        instance. Errors other than "not found" (e.g. access denied) are
        re-raised instead of being mistaken for a missing bucket.
        """
        if bucket_name in self._created_buckets:
            return

        try:
            self.minio_client.head_bucket(Bucket=bucket_name)
        except ClientError as e:
            if self._error_code(e) not in self._NOT_FOUND_CODES:
                raise
            logger.info(f"{bucket_name} is not exists. It will be created.")
            self.minio_client.create_bucket(Bucket=bucket_name)
        # Reached both when the bucket already existed and when it was just
        # created, so the next call can skip the round trip.
        self._created_buckets.add(bucket_name)

    def upload_file(
        self,
        bucket_name: str,
        filename: str,
        file_bytes: bytes,
        file_content_type: str,
    ) -> str:
        """Store ``file_bytes`` under key ``filename`` in ``bucket_name``.

        The bucket is created first if it does not exist.

        Returns:
            The object key (``filename``) that was written.
        """
        self._create_bucket(bucket_name=bucket_name)
        self.minio_client.put_object(
            Bucket=bucket_name,
            Key=filename,
            Body=file_bytes,
            ContentType=file_content_type,
        )
        return filename

    def is_exist(self, bucket_name: str, file_name: str) -> bool:
        """Return True when ``file_name`` exists in ``bucket_name``.

        Any client error yields False (as before); errors that are not a plain
        "not found" are additionally logged so they do not pass unnoticed.
        Note that the bucket itself is created if it is missing.
        """
        self._create_bucket(bucket_name=bucket_name)
        try:
            self.minio_client.head_object(Bucket=bucket_name, Key=file_name)
        except ClientError as e:
            if self._error_code(e) not in self._NOT_FOUND_CODES:
                logger.warning(
                    f"Unexpected error while checking '{file_name}' in "
                    f"'{bucket_name}': {e}"
                )
            return False
        return True

    def get_file(self, bucket_name: str, file_name: str) -> bytes:
        """Download ``file_name`` from ``bucket_name`` and return its content.

        Raises:
            FileNotFoundError: if the client reports an error for the request;
                the original client error is attached as ``__cause__``.
        """
        self._create_bucket(bucket_name=bucket_name)
        try:
            response = self.minio_client.get_object(Bucket=bucket_name, Key=file_name)
            body = response["Body"]
            try:
                return body.read()
            finally:
                # Release the underlying stream/connection explicitly.
                body.close()
        except ClientError as e:
            logger.error(f"Failed to retrieve file '{file_name}': {e}")
            raise FileNotFoundError(
                f"File '{file_name}' not found in bucket '{bucket_name}'."
            ) from e


In [14]:
# Build the storage wrapper. No connection is made here: the underlying S3
# client is only constructed the first time `minio_client` is accessed.
client = MinioFileStorage()

In [53]:
# Download the raw bytes of the K011768 summary PDF from the "fda-summary" bucket.
summary_bytes = client.get_file("fda-summary", "K011768-summary.pdf")

In [31]:
! uv pip install pdf2image pytesseract

[2mUsing Python 3.11.8 environment at: /Users/furkanmelih/personal_projects/complizen_case_project/.venv[0m
[2K[2mResolved [1m4 packages[0m [2min 910ms[0m[0m                                         [0m
[2K[2mPrepared [1m2 packages[0m [2min 94ms[0m[0m                                              
[2K[2mInstalled [1m3 packages[0m [2min 18ms[0m[0m                                [0m
 [32m+[39m [1mpdf2image[0m[2m==1.17.0[0m
 [32m+[39m [1mpillow[0m[2m==11.2.1[0m
 [32m+[39m [1mpytesseract[0m[2m==0.3.13[0m


In [55]:
import pytesseract
from io import BytesIO
from pdf2image import convert_from_bytes

def extract_text_from_image_pdf(pdf_bytes: bytes):
    images = convert_from_bytes(pdf_bytes)
    extracted_text = "\n".join(
        [
            pytesseract.image_to_string(image)
            for image in images
        ]
    )
    return extracted_text


In [56]:
# Run OCR over the downloaded PDF; the result is one string with the pages
# joined by newlines.
pdf_string = extract_text_from_image_pdf(summary_bytes)

In [57]:
# Use print() so the newlines in the OCR text are rendered rather than shown
# as escaped "\n" in the string repr.
print(pdf_string)

 KOuTqey
499 Houte 30, Imperial, PA 15126
(724) 695-1890 Fax (724) 695-1692 >
Toll Free No. 1-800-633-8080 AUG - 3 2001 Page 6-1

Specialists in precise temperature control 510(K)
Summary

510 (K) Summary

We at Thermo-EFlectric Company believe our Thermo-Therapy products (Models TT-101, TT-
201, and TT-202) are virtually identical to the equivalent sized I’ luidotherapy models — with the
exception that our TT-101 equivalent to Fluidotherapy Model T-11, although the same in size
and cellulose capacity, is For Professional Use Only — not home use — and the cautions are so
provided.

This 510 (K) is being submitted by:

Thermo-Electric Company
455 Route 30
Imperial, PA 15126
Telephone Number (724) 695-1890
Fax Number (724) 695-1892

Lawrence FE. Madson, Jr. — June 1, 2001
Thermo-Electric Company is an Ohio Corporation having been in business since 1924 and is also
the manufacture of many other products for the use in the professional physical and occupational

therapy market.

The substa

In [58]:
# `re` is imported here because no earlier cell brings it into scope; without
# this the cell fails with NameError on a fresh kernel.
import re

# K numbers: the letter "K" followed by exactly six digits, matched as a whole
# word (e.g. "K011768").
k_number_pattern = r'\bK\d{6}\b'
k_numbers = re.findall(k_number_pattern, pdf_string)

# De-duplicate and sort so the listing is stable across runs.
sorted(set(k_numbers))

['K011768', 'K871802', 'K896817']

In [None]:
# K011768 — the K number that also appears in the downloaded file's name
# ("K011768-summary.pdf"). Kept as a comment: a bare `K011768` is an undefined
# name and would raise NameError when the cell is executed.