In [4]:
import os
import cv2
import numpy as np

# ---- Single folder loader ----
def load_images_from_folder(folder, flatten=True):
    """
    Loads images from a folder.
    - folder: path to NORMAL/PNEUMONIA structure
    - flatten: if True, returns 1D arrays; if False, returns (H, W, 1) for CNNs
    """
    images = []
    labels = []
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                image_path = os.path.join(root, file)

                # Label based on folder name
                label_name = os.path.basename(os.path.dirname(image_path))
                label = 1 if label_name.upper() == "PNEUMONIA" else 0

                # Read grayscale
                img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    print(f"Could not read {image_path}")
                    continue

                # Resize to 64x64
                img_resized = cv2.resize(img, (64, 64))

                if flatten:
                    img_data = img_resized.flatten()
                else:
                    img_data = img_resized.reshape(64, 64, 1)  # For CNNs

                images.append(img_data)
                labels.append(label)

    return np.array(images), np.array(labels)


# ---- Automatic dataset loader ----
def load_dataset(base_folder, flatten=True):
    """
    Loads train, test, val datasets automatically.
    - base_folder: main chest_xray folder (can be "chest_xray" or "chest_xray/chest_xray")
    - flatten: if True, returns flattened images; if False, keeps shape for CNNs
    """
    # Fix for double folder
    double_folder = os.path.join(base_folder, "chest_xray")
    if os.path.isdir(double_folder):
        base_folder = double_folder
    X_train, y_train = load_images_from_folder(os.path.join(base_folder, "train"), flatten)
    X_test, y_test = load_images_from_folder(os.path.join(base_folder, "test"), flatten)
    X_val, y_val = load_images_from_folder(os.path.join(base_folder, "val"), flatten)

    return X_train, y_train, X_test, y_test, X_val, y_val


# ---- Example usage ----
if __name__ == "__main__":
    base_path = "chest_xray"  # Works with or without the extra /chest_xray

    # Flattened for ML models like SVM, logistic regression
    X_train, y_train, X_test, y_test, X_val, y_val = load_dataset(base_path, flatten=True)
    print("Flattened Train set:", X_train.shape, y_train.shape)

    # Non-flattened for CNN models
    X_train_cnn, y_train_cnn, X_test_cnn, y_test_cnn, X_val_cnn, y_val_cnn = load_dataset(base_path, flatten=False)
    print("CNN Train set:", X_train_cnn.shape, y_train_cnn.shape)

Flattened Train set: (0,) (0,)
CNN Train set: (0,) (0,)
