# Dataset loading time test - do not launch now, may cause inconsistency in datasets
Comparison of loading datasets from public libraries and from personal Google Drive. Time and size compared.

## Conclusion
It is definitely a good idea to preprocess the data, save it as NumPy N-D array and load it before training. The best solution is to store the dataset in NPZ format. Its size is multiple times smaller and loading times are still very low. In comparison to time the training takes they are negligible.

## Launch first

In [0]:
import time
import numpy
import tensorflow
from sklearn import datasets
import matplotlib.image

drive_path = '/content/drive/My Drive/VUT FIT/IBT/datasets/'
mnist_name = 'mnist_32_32'
lfw_name = 'lfw_64_48'
ffhq_name = 'ffhq_128_128'

## MNIST
* Loading from library - 0.358 s (cached - 0.318 s)
* Processing - 50.736 s
* Loading from NPY - 0.138 s (size 234 MB)
* Loading from NPZ - 0.771 s (size 16 MB)

### Library

In [0]:
start_time = time.time()

(mnist_loaded, _), (_, _) = tensorflow.keras.datasets.mnist.load_data()

end_time = time.time()
print(end_time - start_time)

In [0]:
start_time = time.time()

mnist_processed = numpy.empty((len(mnist_loaded), 32, 32))
for image_index in range(len(mnist_loaded)):
	for row_index in range(len(mnist_loaded[0])):
		mnist_processed[image_index][row_index+2][0] = 0
		mnist_processed[image_index][row_index+2][1] = 0
		mnist_processed[image_index][row_index+2][-1] = 0
		mnist_processed[image_index][row_index+2][-2] = 0
		for column_index in range(len(mnist_loaded[0][0])):
			mnist_processed[image_index][row_index+2][column_index+2] = mnist_loaded[image_index][row_index][column_index]
	mnist_processed[image_index][0] = numpy.zeros((1, 32))
	mnist_processed[image_index][1] = numpy.zeros((1, 32))
	mnist_processed[image_index][-1] = numpy.zeros((1, 32))
	mnist_processed[image_index][-2] = numpy.zeros((1, 32))
mnist_processed = numpy.expand_dims(mnist_processed, axis=-1)
mnist_processed = mnist_processed.astype('float32') / 255.0

end_time = time.time()
print(end_time - start_time)

### NPY

In [0]:
filename = drive_path + mnist_name + '/' + mnist_name + '.npy'
numpy.save(filename, mnist_processed)

In [0]:
start_time = time.time()

filename = drive_path + mnist_name + '/' + mnist_name + '.npy'
mnist_reloaded = numpy.load(filename)

end_time = time.time()
print(end_time - start_time)

### NPZ

In [0]:
filename = drive_path + mnist_name + '/' + mnist_name + '.npz'
numpy.savez_compressed(filename, mnist_processed)

In [0]:
start_time = time.time()

filename = drive_path + mnist_name + '/' + mnist_name + '.npz'
mnist_reloaded = numpy.load(filename)
mnist_reloaded = mnist_reloaded['arr_0']

end_time = time.time()
print(end_time - start_time)

0.771331545999999


## LFW
* Loading from library - 84.656 s (cached - 2.510 s)
* Processing - 72.729 s
* Loading from NPY - 0.533 s (size 930 MB)
* Loading from NPZ - 3.803 s (size 156 MB)

### PNG

In [0]:
start_time = time.time()

lfw_loaded = sklearn.datasets.fetch_lfw_people(color = True)

end_time = time.time()
print(end_time - start_time)

In [0]:
start_time = time.time()

lfw_processed = numpy.empty((len(lfw_loaded.images), 64, 48, 3))
for image_index in range(len(lfw_loaded.images)):
	lfw_processed[image_index] = numpy.empty((64, 48, 3))
	for row_index in range(len(lfw_loaded.images[image_index])):
		lfw_processed[image_index][row_index+1][0] = lfw_loaded.images[image_index][row_index][0]
		for column_index in range(len(lfw_loaded.images[image_index][0])):
			lfw_processed[image_index][row_index+1][column_index+1] = lfw_loaded.images[image_index][row_index][column_index]
	lfw_processed[image_index][0] = lfw_processed[image_index][1]
	lfw_processed[image_index][-1] = lfw_processed[image_index][-2]
X = lfw_processed.astype('float32') / 255.0

end_time = time.time()
print(end_time - start_time)

### NPY

In [0]:
filename = drive_path + lfw_name + '/' + lfw_name + '.npy'
numpy.save(filename, lfw_processed)

In [0]:
start_time = time.time()

filename = drive_path + lfw_name + '/' + lfw_name + '.npy'
lfw_reloaded = numpy.load(filename)

end_time = time.time()
print(end_time - start_time)

### NPZ

In [0]:
filename = drive_path + lfw_name + '/' + lfw_name + '.npz'
numpy.savez_compressed(filename, lfw_processed)

In [0]:
start_time = time.time()

filename = drive_path + lfw_name + '/' + lfw_name + '.npz'
lfw_reloaded = numpy.load(filename)
lfw_reloaded = lfw_reloaded['arr_0']

end_time = time.time()
print(end_time - start_time)


## FFHQ
1000 images
* Loading from PNG - 409.333 s (size 29 MB)
* Loading from NPY - 0.212 s (size 375 MB)
* Loading from NPZ - 1.520 s (size 68 MB)

### Library

In [0]:
start_time = time.time()

ffhq_size = 1000
ffhq_processed = numpy.empty((ffhq_size, 128, 128, 3))
 
for i in range(ffhq_size):
  filename = drive_path + ffhq_name + '-png/%05d.png' % (i)
  ffhq_processed[i] = matplotlib.image.imread(filename)

end_time = time.time()
print(end_time - start_time)

1583965168.2069967
1583965577.539739
409.3327422142029


### NPY

In [0]:
filename = drive_path + ffhq_name + '/' + ffhq_name + '.npy'
numpy.save(filename, ffhq_processed)

In [0]:
start_time = time.time()

filename = drive_path + ffhq_name + '/' + ffhq_name + '.npy'
ffhq_reloaded = numpy.load(filename)

end_time = time.time()
print(end_time - start_time)

### NPZ

In [0]:
filename = drive_path + ffhq_name + '/' + ffhq_name + '.npz'
numpy.savez_compressed(filename, ffhq_processed)

In [0]:
start_time = time.time()

filename = drive_path + ffhq_name + '/' + ffhq_name + '.npz'
ffhq_reloaded = numpy.load(filename)
ffhq_reloaded = ffhq_reloaded['arr_0']

end_time = time.time()
print(end_time - start_time)