In [1]:
import os
import h5py
import numpy as np
import cv2
import json
from sklearn.model_selection import train_test_split

def mat_to_dataset(mat_path):
    """
    Convert digitStruct.mat file into a dataset of image names and bounding box details.
    """
    f = h5py.File(mat_path, mode='r')
    datasets = {}
    files_count = len(f['digitStruct']['name'])
    for i in range(files_count):
        # Decode the name
        name_ref = f['digitStruct']['name'][i, 0]
        name_array = f[name_ref][:]
        name = ''.join(chr(int(n)) for n in name_array)  # Properly decode the name
        
        # Extract bounding box data
        bbox = {}
        box_ref = f['digitStruct']['bbox'][i, 0]
        box_data = f[box_ref]
        length = box_data['label'].shape[0] if box_data['label'].ndim > 0 else 1
        
        for key in ['height', 'label', 'left', 'top', 'width']:
            if length > 1:
                bbox[key] = [int(f[box_data[key][j, 0]][0, 0]) for j in range(length)]
            else:
                bbox[key] = [int(box_data[key][0, 0])]
        
        datasets[name] = bbox
        print(f'Processed {i + 1} / {files_count} images.\r', end='') 
    
    print(f'\n{files_count} records loaded.')
    return datasets


def preprocess_images_and_labels(mat_file_path, image_folder, target_size=(32, 32)):
    """
    Load image paths, process images, and extract bounding boxes from .mat file.
    """
    # Parse .mat file
    dataset = mat_to_dataset(mat_file_path)
    images, labels = [], []
    for image_name, bbox in dataset.items():
        # Load image
        image_path = os.path.join(image_folder, image_name)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read {image_path}")
            continue

        # Process each bounding box in the image
        for i in range(len(bbox['label'])):
            x = max(0, int(bbox['left'][i]))  # Ensure x is non-negative
            y = max(0, int(bbox['top'][i]))   # Ensure y is non-negative
            w = max(1, int(bbox['width'][i])) # Ensure width is at least 1
            h = max(1, int(bbox['height'][i]))# Ensure height is at least 1
            
            # Clip the coordinates to fit within the image dimensions
            x_end = min(x + w, image.shape[1])  # Ensure x_end is within width
            y_end = min(y + h, image.shape[0])  # Ensure y_end is within height
            
            # Check if the cropped area is valid
            if x_end <= x or y_end <= y:
                print(f"Skipping invalid bounding box: x={x}, y={y}, w={w}, h={h}")
                continue

            cropped = image[y:y_end, x:x_end]  # Crop the image
            if cropped.size == 0:  # Check if cropped image is empty
                print(f"Empty cropped image: x={x}, y={y}, w={w}, h={h}")
                continue

            resized = cv2.resize(cropped, target_size)
            images.append(resized)

            # Handle label: Map "10" to "0"
            label = int(bbox['label'][i])
            if label == 10:
                label = 0
            labels.append(label)

    images = np.array(images).astype('float32') / 255.0  # Normalize
    labels = np.array(labels).astype('int')
    return images, labels


def split_and_save_data(images, labels, output_folder):
    """
    Split data into train/validation/test sets and save them into separate folders.
    """
    # Split the data
    X_train, X_temp, y_train, y_temp = train_test_split(images, labels, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Save data
    def save_images_and_labels(subset, subset_name):
        subset_folder = os.path.join(output_folder, subset_name)
        os.makedirs(subset_folder, exist_ok=True)
        for i, (image, label) in enumerate(zip(*subset)):
            label_folder = os.path.join(subset_folder, str(label))
            os.makedirs(label_folder, exist_ok=True)
            image_path = os.path.join(label_folder, f"{i}.png")
            cv2.imwrite(image_path, (image * 255).astype(np.uint8))

    # Save the training, validation, and test sets
    save_images_and_labels((X_train, y_train), "train")
    save_images_and_labels((X_val, y_val), "val")
    save_images_and_labels((X_test, y_test), "test")

    print(f"Data saved in {output_folder}")

def main():
    """
    Main function to process and save SVHN data.
    """
    mat_file_path = "/Users/maximilianstumpf/Desktop/archive/train_digitStruct.mat"
    image_folder = "/Users/maximilianstumpf/Desktop/archive/train/train"
    output_folder = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Processed'

    # Preprocess images and labels
    images, labels = preprocess_images_and_labels(mat_file_path, image_folder)

    # Split data and save
    split_and_save_data(images, labels, output_folder)

if __name__ == "__main__":
    main()

  name = ''.join(chr(int(n)) for n in name_array)  # Properly decode the name


Processed 33402 / 33402 images.
33402 records loaded.
Data saved in /Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Processed
