In [40]:
import tensorflow as tf
import time 

In [41]:
class FileDataset(tf.data.Dataset):
    """Toy dataset that yields sample indices with artificial latency.

    Calling ``FileDataset(num_samples)`` does not create a ``FileDataset``
    instance: ``__new__`` returns the dataset built by
    ``tf.data.Dataset.from_generator`` around ``read_files_in_batches``.
    """

    @staticmethod
    def read_files_in_batches(num_samples):
        """Yield ``num_samples`` one-element tuples ``(0,), (1,), ...``.

        Sleeps 0.10 s once before the first sample and 0.01 s before each
        sample. Declared as a static method because it uses neither an
        instance nor the class.
        """
        # Open files: one-off delay each time the generator is started.
        time.sleep(0.10)
        for sample_idx in range(num_samples):
            # Per-sample delay (presumably stands in for reading one record).
            time.sleep(0.01)
            yield (sample_idx,)

    def __new__(cls, num_samples=3):
        """Return a generator-backed dataset of ``num_samples`` elements.

        Each element is declared as an int64 tensor of shape ``(1,)``;
        ``num_samples`` is forwarded to the generator through ``args``.
        """
        return tf.data.Dataset.from_generator(
            cls.read_files_in_batches,
            output_signature=tf.TensorSpec(shape=(1,), dtype=tf.int64),
            args=(num_samples,),
        )

In [42]:
def benchmark(dataset, num_epochs=2):
    """Iterate over ``dataset`` ``num_epochs`` times, sleeping 0.05 s per element.

    The elements themselves are ignored; the sleep stands in for the work a
    consumer would do on each one. Returns ``None``.
    """
    step_seconds = 0.05  # fixed delay applied once per element
    for _epoch in range(num_epochs):
        for _sample in dataset:
            time.sleep(step_seconds)

In [43]:
%%timeit
# Baseline: default benchmark (2 epochs) over the default FileDataset (3 samples),
# with no extra pipeline steps.
benchmark(FileDataset())

683 ms ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
%%timeit
# Same benchmark with prefetch(tf.data.AUTOTUNE) appended to the pipeline.
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

674 ms ± 6.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
# Next: use cache() so the files are not re-read on every epoch.
# Start from a small range dataset (0..4) and print each element.
dataset = tf.data.Dataset.range(5)
for d in dataset:
    print(d.numpy())


0
1
2
3
4


In [46]:
# Square every element and print the result.
# NOTE: this rebinds `dataset`, so re-running the cell applies the squaring
# again to whatever `dataset` currently refers to.
dataset = dataset.map(lambda x: x**2)
for d in dataset:
    print(d.numpy())

0
1
4
9
16


In [47]:
# Append cache() to the pipeline and print each element as a NumPy value.
# NOTE: `dataset` is rebound again, so this cell is also order-dependent.
dataset = dataset.cache()
for d in dataset.as_numpy_iterator():
    print(d)

0
1
4
9
16


In [51]:
# Collect every element of `dataset` into a Python list of NumPy values.
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [53]:
"""Like previously when you are using cache  function  then it does not execute again map function it simply fetch data from cache"""

'Like previously when you are using cache  function  then it does not execute again map function it simply fetch data from cache'

In [54]:
def mapped_function(s):
    """Return ``s`` unchanged after issuing a 0.1 s sleep through ``tf.py_function``.

    Intended for ``Dataset.map``: it adds a fixed per-element delay without
    altering the element.
    """
    def _pause():
        time.sleep(0.1)

    # No inputs and no outputs: the py_function is used only for its delay.
    tf.py_function(func=_pause, inp=[], Tout=())
    return s


In [57]:
%%timeit
# Five epochs through the pipeline with mapped_function applied and no cache().
benchmark(FileDataset().map(mapped_function), 5)

2.86 s ± 21.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [58]:
%%timeit
# Same five epochs with cache() placed after the map step.
benchmark(FileDataset().map(mapped_function).cache(), 5)

1.34 s ± 4.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
