In [11]:
from minio import Minio
from minio.error import S3Error
import os
from utils.config import *
import numpy as np

In [2]:
def get_minio_client() -> Minio:
    """Build a MinIO client from project configuration and environment variables.

    The endpoint and the ``secure`` flag are read from the module-level names
    ``MINIO_ENDPOINT`` and ``MINIO_SECURE`` (presumably provided by the
    ``utils.config`` star import -- confirm). Credentials are read from the
    ``MINIO_ACCESS_KEY``, ``MINIO_SECRET_KEY`` and ``MINIO_SESSION_TOKEN``
    environment variables; a variable that is not set is passed as ``None``.

    Returns:
        A ``Minio`` client instance.
    """
    # os.getenv yields None for missing variables, so absent credentials are
    # forwarded to the client as None rather than raising here.
    credentials = {
        "access_key": os.getenv("MINIO_ACCESS_KEY"),
        "secret_key": os.getenv("MINIO_SECRET_KEY"),
        "session_token": os.getenv("MINIO_SESSION_TOKEN"),
    }
    minio_client = Minio(endpoint=MINIO_ENDPOINT, secure=MINIO_SECURE, **credentials)
    return minio_client


# Single client instance created once here and passed to the download call
# in the usage cell further down.
client = get_minio_client()

In [9]:

def download_directory_from_minio(client, bucket_name, directory_prefix, local_directory):
    """
    Download all objects from a specific directory/prefix in a MinIO bucket.

    The prefix is treated as a directory: a trailing "/" is appended when it
    is missing, so sibling keys that merely share the same leading characters
    (e.g. "data_old/..." when "data" is requested) are not downloaded. The key
    structure below the prefix is recreated under ``local_directory``.

    Objects whose key would resolve to a path outside ``local_directory``
    (for example keys containing "..") are skipped with a printed message
    instead of being written.

    Args:
        client: MinIO client instance (any object exposing ``list_objects``
            and ``fget_object`` with the same call shape works)
        bucket_name: Name of the bucket
        directory_prefix: Directory/prefix in the bucket to download; an empty
            string is passed through unchanged
        local_directory: Local directory to save the files (created if missing)
    """
    # Ensure the local directory exists
    os.makedirs(local_directory, exist_ok=True)
    local_root = os.path.abspath(local_directory)

    # Normalise to a directory-style prefix so "foo" cannot match "foo_bar/...".
    prefix = directory_prefix
    if prefix and not prefix.endswith('/'):
        prefix += '/'

    for obj in client.list_objects(bucket_name, prefix=prefix, recursive=True):
        object_name = obj.object_name

        # Skip directory markers
        if object_name.endswith('/'):
            continue

        # Path of the object relative to the requested directory
        relative_path = object_name[len(prefix):].lstrip('/')
        local_path = os.path.join(local_directory, relative_path)

        # Object keys come from the bucket and may contain ".." segments;
        # refuse anything that would land outside (or on) the target directory.
        resolved_path = os.path.abspath(local_path)
        try:
            inside_root = os.path.commonpath([local_root, resolved_path]) == local_root
        except ValueError:
            # Raised e.g. for paths on different drives; treat as outside.
            inside_root = False
        if not inside_root or resolved_path == local_root:
            print(f"Skipping {object_name}: unsafe local path {local_path}")
            continue

        # Ensure the parent directory exists
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        # Download the object
        print(f"Downloading {object_name} to {local_path}")
        client.fget_object(bucket_name, object_name, local_path)

    print(f"Download complete. Files saved to {local_directory}")

def load_image_dataset(data_directory):
    """
    Load an image dataset laid out as one sub-directory per class and split it
    into train and test sets based on the filename convention:
    - Test images: filenames starting with ``test_img_`` (e.g. test_img_<number_id>.png)
    - Train images: every other image file (e.g. img_<number_id>.png)

    Class indices are assigned by enumerating the class directory names in
    sorted order. Files inside each class directory are also processed in
    sorted order so the returned arrays are ordered identically on every run.
    Each image is converted to RGB and its pixel values are divided by 255.0.
    Files that fail to load are reported with a printed message and skipped.

    Args:
        data_directory: Directory containing one sub-directory per class

    Returns:
        Tuple containing (X_train, y_train, X_test, y_test) as numpy arrays.
        The X arrays stack the per-image arrays; this assumes all images share
        the same dimensions -- TODO confirm for the dataset in use.
    """
    # Pillow is not part of the notebook's import cell, so it is imported here.
    from PIL import Image

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

    train_images = []
    train_labels = []
    test_images = []
    test_labels = []

    class_dirs = sorted(
        d for d in os.listdir(data_directory)
        if os.path.isdir(os.path.join(data_directory, d))
    )

    # Create a mapping from class names to indices
    class_to_idx = {class_name: i for i, class_name in enumerate(class_dirs)}

    # Print class mapping for reference
    print("Class mapping:")
    for class_name, idx in class_to_idx.items():
        print(f"  {idx}: {class_name}")

    # Load images and labels
    for class_dir in class_dirs:
        class_path = os.path.join(data_directory, class_dir)
        class_idx = class_to_idx[class_dir]

        train_count = 0
        test_count = 0

        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for image_file in sorted(os.listdir(class_path)):
            if not image_file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(class_path, image_file)
            try:
                # The context manager closes the underlying file handle promptly.
                with Image.open(image_path) as img:
                    # Convert image to numpy array and normalize (adjust as needed)
                    img_array = np.array(img.convert('RGB')) / 255.0
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error loading {image_path}: {e}")
                continue

            # Determine if this is a train or test image based on filename
            if image_file.startswith('test_img_'):
                test_images.append(img_array)
                test_labels.append(class_idx)
                test_count += 1
            else:
                train_images.append(img_array)
                train_labels.append(class_idx)
                train_count += 1

        print(f"Class {class_idx} ({class_dir}): {train_count} training images, {test_count} test images")

    # Convert to numpy arrays
    X_train = np.array(train_images)
    y_train = np.array(train_labels)
    X_test = np.array(test_images)
    y_test = np.array(test_labels)

    print(f"Total dataset: {len(X_train)} training images, {len(X_test)} test images")
    print(f"Number of classes: {len(np.unique(y_train))}")

    return X_train, y_train, X_test, y_test




In [None]:
# Usage: download the prepared dataset from MinIO to a local folder.
# bucket_name / minio_directory select what to fetch; local_download_path is
# where the files are written and is reused by the loading cell below.
bucket_name = "innov-test-bucket"
minio_directory = "medmnist_prepared"
local_download_path = "./downloaded_data"

# Download the data from MinIO (runs unconditionally each time this cell is executed)
download_directory_from_minio(client, bucket_name, minio_directory, local_download_path)



In [10]:
# Load the downloaded images into train/test arrays for training.
# local_download_path is defined in the download cell above, so that cell must run first.
X_train, y_train, X_test, y_test = load_image_dataset(local_download_path)

Class mapping:
  0: class_0_adipose
  1: class_1_background
  2: class_2_debris
  3: class_3_lymphocytes
  4: class_4_mucus
  5: class_5_smooth_muscle
  6: class_6_normal_colon_mucosa
  7: class_7_cancer-associated_stroma
  8: class_8_colorectal_adenocarcinoma_epithelium
Class 0 (class_0_adipose): 56 training images, 26 test images
Class 1 (class_1_background): 56 training images, 8 test images
Class 2 (class_2_debris): 52 training images, 5 test images
Class 3 (class_3_lymphocytes): 70 training images, 9 test images
Class 4 (class_4_mucus): 38 training images, 15 test images
Class 5 (class_5_smooth_muscle): 80 training images, 5 test images
Class 6 (class_6_normal_colon_mucosa): 36 training images, 11 test images
Class 7 (class_7_cancer-associated_stroma): 48 training images, 5 test images
Class 8 (class_8_colorectal_adenocarcinoma_epithelium): 65 training images, 16 test images
Total dataset: 501 training images, 100 test images
Number of classes: 9
