In [None]:
import os
import h5py
import numpy as np
from tensorflow.keras import utils
import pylab as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from PIL import Image

Load the images and labels from the dataset file.

In [None]:
# Read the class labels and the full image stack into in-memory NumPy arrays;
# the HDF5 file handle is released as soon as the `with` block exits.
with h5py.File('Galaxy10.h5', 'r') as F:
    labels = np.array(F['ans'])
    images = np.array(F['images'])

In [None]:
# Reached only if the HDF5 read in the previous cell did not raise.
print('Data loaded successfully')

Galaxy10 dataset (21785 images)<br>
├── Class 0: Disk, Face-on, No Spiral<br>
├── Class 1: Smooth, Completely round<br>
├── Class 2: Smooth, in-between round<br>
├── Class 3: Smooth, Cigar shaped<br>
├── Class 4: Disk, Edge-on, Rounded Bulge<br>
├── Class 5: Disk, Edge-on, Boxy Bulge<br>
├── Class 6: Disk, Edge-on, No Bulge<br>
├── Class 7: Disk, Face-on, Tight Spiral<br>
├── Class 8: Disk, Face-on, Medium Spiral<br>
└── Class 9: Disk, Face-on, Loose Spiral

In [None]:
# Folder-friendly names of the ten Galaxy10 classes. The position of a name in
# this list is its integer class id, so the order must not be changed.
class_names = [
    'Disk-Face_on-No_Spiral',        # 0
    'Smooth-Completely-round',       # 1
    'Smooth-in_between-round',       # 2
    'Smooth-Cigar_shaped',           # 3
    'Disk-Edge_on-Rounded_Bulge',    # 4
    'Disk-Edge_on-Boxy_Bulge',       # 5
    'Disk-Edge_on-No_Bulge',         # 6
    'Disk-Face_on-Tight_Spiral',     # 7
    'Disk-Face_on-Medium_Spiral',    # 8
    'Disk-Face_on-Loose_Spiral',     # 9
]

Select 10 of the images to inspect

In [None]:
# Switch matplotlib to interactive mode so figures can be redrawn in place.
plt.ion()
# Placeholder for the image handle that the preview loop assigns.
img = None

Draw random indices within the dataset's index range and display the corresponding images.

In [None]:
# Pick the indices to preview. Sampling WITHOUT replacement guarantees that
# the same galaxy is never shown twice (np.random.randint could repeat an
# index); the size is capped so a dataset with fewer than 10 images still works.
n_preview = min(10, labels.shape[0])
samples = np.random.choice(labels.shape[0], size=n_preview, replace=False)
print('Images to display:', samples)
for i in samples:
    img = plt.imshow(images[i])
    plt.title('Image {}: Class {}'.format(i, labels[i]))
    plt.draw()
    plt.pause(2.)    # keep each image on screen for two seconds
plt.close('all')     # close every figure once the preview is finished

Convert the integer labels to one-hot (categorical) vectors over the 10 classes.

In [None]:
# One-hot encode the class labels: every label becomes a vector of length 10.
labels_cat = utils.to_categorical(labels, num_classes=10)

In [None]:
# Show the encoded form of the first label, then the shapes of both arrays.
print('Categorical label:', labels_cat[0])
print(
    'Shape of data structure labels {} and images {}'.format(
        labels_cat.shape, images.shape
    )
)

Brief data exploration: number of images per class.

In [None]:
# Summing the one-hot rows column-wise gives the number of images per class.
print('Dataset:', labels_cat.sum(axis=0))

Split the dataset into training and testing sets (plain random split, class ratios not enforced)

In [None]:
# Plain (non-stratified) random 80/20 split of the sample indices.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# exact same partition instead of a different one on every execution.
train_idx, test_idx = train_test_split(
    np.arange(labels_cat.shape[0]), test_size=0.2, random_state=42)
train_labels = labels_cat[train_idx]
test_labels = labels_cat[test_idx]
# Column sums of the one-hot rows = number of images per class on each side.
print('Train:', np.sum(train_labels, axis=0))
print('Test:', np.sum(test_labels, axis=0))

Stratified partitions: split the dataset while preserving the class ratios

In [None]:
# Splitter for a single stratified 80/20 partition. The seed is pinned because
# the indices it yields are later used to select the images saved to disk; an
# unseeded splitter would produce a different partition on every run.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
# Run the stratified splitter and report, for each side of the split, the
# selected indices, how many there are, and the per-class image counts.
for train_idx, test_idx in sss.split(labels, labels):
    train_labels, test_labels = labels_cat[train_idx], labels_cat[test_idx]
    print("TRAIN:", train_idx, "number of objects:", train_idx.shape)
    print(train_labels.sum(axis=0))
    print("TEST:", test_idx, "number of objects:", test_idx.shape)
    print(test_labels.sum(axis=0))

In [None]:
# Build the final partitions from the stratified indices. The label arrays are
# rebound here to the raw class ids taken from `labels` (not the one-hot rows
# of `labels_cat`).
train_images, test_images = images[train_idx], images[test_idx]
train_labels, test_labels = labels[train_idx], labels[test_idx]

In [None]:
# Name of the root folder of the directory tree in which the images are saved.
dataset_dirname = 'Galaxy10'
# Remember the working directory the notebook started in.
main = os.getcwd()

In [None]:
# Create the dataset root folder. exist_ok=True keeps the cell re-runnable when
# the folder is already there, while genuine failures (permission denied, a
# regular file with the same name, ...) now raise here instead of being
# swallowed by a catch-all OSError handler and resurfacing later.
os.makedirs(dataset_dirname, exist_ok=True)

Main dataset folder

In [None]:
# Enter the dataset root through an absolute path built from the recorded
# starting directory. A relative chdir would fail (or descend into a nested
# folder) if this cell were executed again while already inside the dataset.
os.chdir(os.path.join(main, dataset_dirname))

Train partition

In [None]:
# Write the training partition as <dataset>/train/<class>/galaxy10_img<i>.jpg,
# where <i> is the position of the image inside the training partition.
# The target is addressed by absolute path, so this cell neither depends on
# nor changes the kernel's current working directory.
train_dir = os.path.join(main, dataset_dirname, 'train')
# Deliberately NOT exist_ok: failing on an existing folder prevents images
# from an earlier, possibly different split from being mixed with this one.
os.mkdir(train_dir)
for cls_int, cls in enumerate(class_names):   # list position == class id
    cls_dir = os.path.join(train_dir, cls)
    os.mkdir(cls_dir)
    print('Train - Class: ', cls)
    # Vectorised selection of the instances of this class (ascending order),
    # instead of testing every training label one by one in Python.
    for i in np.flatnonzero(train_labels == cls_int):
        img = Image.fromarray(train_images[i])
        img.save(os.path.join(cls_dir, 'galaxy10_img{}.jpg'.format(i)))

Validation partition (written from the test split created above)

In [None]:
# Write the validation partition (the test split) as
# <dataset>/val/<class>/galaxy10_img<i>.jpg, where <i> is the position of the
# image inside the test partition.
# The target is addressed by absolute path, so this cell neither depends on
# nor changes the kernel's current working directory.
val_dir = os.path.join(main, dataset_dirname, 'val')
# Deliberately NOT exist_ok: failing on an existing folder prevents images
# from an earlier, possibly different split from being mixed with this one.
os.mkdir(val_dir)
for cls_int, cls in enumerate(class_names):   # list position == class id
    cls_dir = os.path.join(val_dir, cls)
    os.mkdir(cls_dir)
    print('Validation - Class: ', cls)
    # Vectorised selection of the instances of this class (ascending order),
    # instead of testing every test label one by one in Python.
    for i in np.flatnonzero(test_labels == cls_int):
        img = Image.fromarray(test_images[i])
        img.save(os.path.join(cls_dir, 'galaxy10_img{}.jpg'.format(i)))

In [None]:
# Return to the working directory that was recorded in `main` before the
# dataset directory tree was created.
os.chdir(main)