# Dataset Implementation Testing
## Objective
Benchmark several ways of implementing our input data pipeline to find the most efficient one.

In [1]:
import pathlib
import tensorflow as tf
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from util import timeit

# Sentinel handed to tf.data (as `num_parallel_calls` / `buffer_size` below)
# so that tf.data picks the value itself instead of us hard-coding one.
AUTOTUNE = tf.data.experimental.AUTOTUNE

def load_image(
    filename,
    label,
):
    """Read the file at `filename` and decode it into a 3-channel image tensor.

    `label` is passed through unchanged so the function can be mapped over a
    dataset of `(filename, label)` pairs.

    Returns:
        The tuple `(image_tensor, label)`.
    """
    decoded = tf.image.decode_image(
        tf.io.read_file(filename),
        channels=3,
        # Keep expand_animations=False: without it the decoded tensor carries
        # no shape information and later image ops fail, see
        # https://stackoverflow.com/questions/44942729/tensorflowvalueerror-images-contains-no-shape
        expand_animations=False,
    )
    return decoded, label

def preprocessing_default(
    img_tensor,
    label,
):
    """Resize an image to 299x299 and divide its values by 255.

    `label` is passed through unchanged.

    Returns:
        The tuple `(preprocessed_image, label)`.
    """
    resized = tf.image.resize(img_tensor, [299, 299])
    return resized / 255.0, label

def load_image_dataset_from_hdf(
    hdf_path,
    hdf_key,
    label_col,
    test_size=0.05,
    val_size=0.1,
    preprocessing=preprocessing_default,
    batch_size=64,
):
    """Build a batched `tf.data.Dataset` of `(image, label)` pairs from an HDF table.

    The table is read with `pandas.read_hdf`; its `'filename'` column supplies
    the image paths and `label_col` supplies the labels. Each file is loaded
    with `load_image`, passed through `preprocessing`, and the result is
    batched.

    Args:
        hdf_path: Path of the HDF file to read.
        hdf_key: Key of the table inside the HDF file.
        label_col: Name of the column holding the labels.
        test_size: Currently unused; kept so existing callers keep working.
        val_size: Currently unused; kept so existing callers keep working.
        preprocessing: Function mapping `(image, label)` to `(image, label)`,
            applied to every element after loading.
        batch_size: Number of elements per batch (default 64, the value that
            was previously hard-coded).

    Returns:
        The batched dataset. No shuffling, caching or prefetching is applied
        here; callers add those stages themselves.
    """
    df = pd.read_hdf(hdf_path, hdf_key)

    ds = tf.data.Dataset.from_tensor_slices(
        (
            df['filename'].values,
            df[label_col].values,
        ),
    )

    # File reading and decoding runs in parallel.
    ds = ds.map(load_image, num_parallel_calls=AUTOTUNE)

    # Preprocessing is deliberately left sequential, as in the original
    # benchmark; parallelising it is a separate variant to measure.
    ds = ds.map(preprocessing)

    return ds.batch(batch_size)


In [2]:
# Where the cleaned HDF5 store lives and which table inside it to read.
data_dir = '/media/data/gbif'
hdf_filename = 'clean_data.h5'
hdf_key = 'media_merged_filtered-by-species_350pt'
hdf_path = os.path.join(data_dir, hdf_filename)

# Column of the table that is used as the label.
label_col = 'acceptedScientificName'

# Benchmark settings handed to `timeit`. `batch_size` is not passed to
# `load_image_dataset_from_hdf`, which batches by 64 on its own; keep the
# two values in sync.
batch_size = 64
num_batch = 100

In [3]:
# Baseline pipeline: load -> preprocess -> batch, with no cache or prefetch.
ds = load_image_dataset_from_hdf(
    hdf_path=hdf_path,
    hdf_key=hdf_key,
    label_col=label_col,
)

In [4]:
# Time the baseline pipeline.
timeit(ds, batch_size, num_batch)

..........
100 batches: 24.269120454788208 s
263.70960 Images/s
Total time: 24.623489141464233s


In [5]:
# Variant: baseline followed by a prefetch stage.
dsp = ds.prefetch(buffer_size=AUTOTUNE)

timeit(dsp, batch_size, num_batch)

..........
100 batches: 24.22631311416626 s
264.17557 Images/s
Total time: 24.470304250717163s


In [6]:
# Variant: baseline followed by an on-disk cache.
dsc = ds.cache(filename='./cache.tf-data')

timeit(dsc, batch_size, num_batch)

..........
100 batches: 28.69098401069641 s
223.06659 Images/s
Total time: 28.96743893623352s


In [7]:
# Variant: prefetch first, then an on-disk cache.
# This pipeline gets its own cache file: sharing './cache.tf-data' with the
# cache-only variant would let the two pipelines reuse (or collide on) each
# other's cached data and skew the comparison.
dspc = dsp.cache(filename='./cache-prefetch.tf-data')

timeit(dspc, batch_size, num_batch)

..........
100 batches: 25.319931030273438 s
252.76530 Images/s
Total time: 25.62583041191101s


In [8]:
# Variant: on-disk cache first (the `dsc` pipeline), then prefetch.
dscp = dsc.prefetch(buffer_size=AUTOTUNE)

timeit(dscp, batch_size, num_batch)

..........
100 batches: 36.639801025390625 s
174.67344 Images/s
Total time: 36.9444694519043s
