In [92]:
 
########################## first part: prepare data ###########################
from random import shuffle
import glob
 
shuffle_data = True  # shuffle the addresses before selecting the subset

hdf5_path = 'datasets/data_test.h5'  # file path for the created .hdf5 file
cat_dog_train_path = 'datasets/test/*.jpg'  # glob pattern of the source images
keep_fraction = 0.8  # share of the (shuffled) images that is kept below


def label_for_path(path):
    """Return the class label implied by an image path.

    Returns 1 if the path contains 'komodo', otherwise 0 if it contains
    'jalakbali', otherwise 2 (any other image). 'komodo' is checked first,
    so a path containing both substrings is labelled 1.
    """
    if 'komodo' in path:
        return 1
    if 'jalakbali' in path:
        return 0
    return 2


# get all the image paths
addrs = glob.glob(cat_dog_train_path)

# label the data as 0=jalakbali, 1=komodo, 2=anything else
labels = [label_for_path(addr) for addr in addrs]

# shuffle data
if not addrs:
    # With no matches, zip(*pairs) would yield nothing and the two-name
    # unpacking below would raise ValueError, so skip the shuffle entirely.
    print('Warning: no images matched {}'.format(cat_dog_train_path))
elif shuffle_data:
    # zip() binds each path to its label so both are shuffled in the same order
    paired = list(zip(addrs, labels))
    shuffle(paired)
    # zip(*paired) splits the pairs back into a tuple of paths and a tuple of labels
    addrs, labels = zip(*paired)

# Keep the first `keep_fraction` (80%) of the samples as the test set.
# NOTE(review): the remaining 20% is not used anywhere in this cell; the
# original comment described an 80/20 train/test split - confirm that dropping
# those samples is intended.
n_keep = int(keep_fraction * len(addrs))
test_addrs = addrs[:n_keep]
test_labels = labels[:n_keep]


In [93]:

 ##################### second part: create the h5py object #####################
import numpy as np
import h5py
 
# one 128x128 3-channel image per selected path
test_shape = (len(test_addrs), 128, 128, 3)

# Open the HDF5 file for writing and create the datasets. The handle `f`
# is deliberately left open here: the image-writing cell fills "test_set_x"
# and closes the file afterwards.
f = h5py.File(hdf5_path, mode='w')

# PIL.Image: the pixels range is 0-255, dtype is uint.
# matplotlib: the pixels range is 0-1, dtype is float.
# Images are stored as uint8, i.e. in the 0-255 range.
f.create_dataset("test_set_x", test_shape, np.uint8)

# Class names, indexed by label value (0, 1, 2).
# h5py cannot store NumPy's fixed-width unicode ('<U') dtype, which is what a
# list of Python 3 str turns into, so the names are written as byte strings.
animal = ['jalakbali', 'komodo', 'bukan binatang langka']
f.create_dataset("animal", data=np.array(animal, dtype='S'))

# The file object behaves like a dictionary of datasets; "test_set_y" is the
# key under which the labels are stored.
f.create_dataset("test_set_y", (len(test_addrs),), np.uint8)
f["test_set_y"][...] = test_labels
 

In [94]:
######################## third part: write the images #########################
import cv2
 
# Loop over the test image paths and write each image into "test_set_x".
# try/finally guarantees the HDF5 file is closed even if an image fails to
# load, so the file is not left open (and locked) by a half-finished run.
try:
    test_set_x = f["test_set_x"]  # look the dataset up once, not per image
    total = len(test_addrs)

    for i, addr in enumerate(test_addrs):

        # progress report every 1000 images (skipping the very first one)
        if i % 1000 == 0 and i > 1:
            print ('Test data: {}/{}'.format(i, total) )

        img = cv2.imread(addr)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # instead of raising; fail here with the offending path rather
            # than with an opaque error inside cvtColor.
            raise IOError('Could not read image: {}'.format(addr))

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # cv2 load images as BGR, convert it to RGB
        img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_CUBIC) # resize to (128,128)
        test_set_x[i, ...] = img
finally:
    f.close()