In [1]:
# Prepare data and save it as hdf5 file to be trained on

In [2]:
from glob import glob
from random import shuffle
import cv2
import numpy as np
import h5py

In [3]:
DATADIR = "Dataset/"
DATA_FILE = 'dataset.h5'
IMG_SIZE = 400
IMG_CHANNELS=3
TRAIN_TEST_RATIO = 0.2

In [4]:
# Grab folder names

labels = glob(DATADIR+'*/')
print(labels)

['Dataset/consul/', 'Dataset/nestlé/', 'Dataset/brastemp/', 'Dataset/pampers/', 'Dataset/pantene/', 'Dataset/dell/', 'Dataset/gillette/']


In [5]:
# addrs : list of tuples (img_addr, label)

addrs = []

for (index, path) in enumerate(labels):
    img_addrs = glob(path+"*")
    for img_addr in img_addrs:
        addrs.append((img_addr, index))
        
shuffle(addrs)

In [6]:
# Test and train addrs

division = int(len(addrs) * TRAIN_TEST_RATIO)
train_addrs = addrs[division:]
test_addrs = addrs[0:division]

In [7]:
# Create hdf5 file

train_shape = (len(train_addrs), IMG_SIZE, IMG_SIZE, IMG_CHANNELS)
test_shape = (len(test_addrs), IMG_SIZE, IMG_SIZE, IMG_CHANNELS)

hdf5_file = h5py.File(DATA_FILE, mode='w')
hdf5_file.create_dataset('x_train', train_shape, np.uint8, compression="gzip")
hdf5_file.create_dataset('x_test', test_shape, np.uint8, compression="gzip")
hdf5_file.create_dataset('y_train', (len(train_addrs),), np.uint8 )
hdf5_file.create_dataset('y_test', (len(test_addrs),), np.uint8 )

<HDF5 dataset "y_test": shape (427,), type "|u1">

In [8]:
# Read images and save them

# Train images
for i in range(len(train_addrs)):
    if i%200 == 0 and i > 1 :
        print(f"Train: Done {i} of {len(train_addrs)}")
    
    addr = train_addrs[i][0]
    label = train_addrs[i][1]
    img = cv2.imread(addr)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_CUBIC )
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    hdf5_file['x_train'][i, ...] = img
    hdf5_file['y_train'][i] = label
    
# Test images
for i in range(len(test_addrs)):
    if i%200 == 0 and i > 1 :
        print(f"Test: Done {i} of {len(test_addrs)}")
    
    addr = test_addrs[i][0]
    label = test_addrs[i][1]
    img = cv2.imread(addr)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_CUBIC )
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    hdf5_file['x_test'][i, ...] = img
    hdf5_file['y_test'][i] = label
    
hdf5_file.close()

Train: Done 200 of 1711
Train: Done 400 of 1711
Train: Done 600 of 1711
Train: Done 800 of 1711
Train: Done 1000 of 1711
Train: Done 1200 of 1711
Train: Done 1400 of 1711
Train: Done 1600 of 1711
Test: Done 200 of 427
Test: Done 400 of 427
