# Envlogger and TFDS

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a href="https://colab.research.google.com/github/google-research/rlds/blob/main/rlds/examples/tfds_rlu_atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Run In Google Colab"/></a>
  </td>
</table>

In [None]:
#@title Install Pip packages
!pip install rlds
!pip install envlogger[tfds]
!apt-get install libgmp-dev
!pip install numpy

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libgmpxx4ldbl
Suggested packages:
  gmp-doc libgmp10-doc libmpfr-dev
The following NEW packages will be installed:
  libgmp-dev libgmpxx4ldbl
0 upgraded, 2 newly installed, 0 to remove and 37 not upgraded.
Need to get 325 kB of archives.
After this operation, 1,667 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libgmpxx4ldbl amd64 2:6.1.2+dfsg-2 [8,964 B]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libgmp-dev amd64 2:6.1.2+dfsg-2 [316 kB]
Fetched 325 kB in 1s (302 kB/s)
Selecting previously unselected package libgmpxx4ldbl:amd64.
(Reading database ... 155229 files and directories currently installed.)
Preparing to unpack .../libgmpxx4ldbl_2%3a6.1.2+dfsg-2_amd64.deb ...
Unpacking libgmpxx4ldbl:amd64 (2:6.1.2+dfsg-2) ...
Selecting previously unselected package libgmp-dev:amd6

In [None]:
#@title Imports
import os
import rlds
import tensorflow_datasets as tfds
import envlogger
from envlogger.backends import rlds_utils
from envlogger.backends import tfds_backend_writer
from envlogger.testing import catch_env
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import time
from typing import Optional, List

In [None]:
#@title Auxiliary function to get dataset directories

# A directory is treated as a TFDS dataset directory when it contains this file.
_METADATA_FILENAME='features.json'

def get_ds_paths(pattern: str) -> List[str]:
  """Returns the paths of tfds datasets under a (set of) directories.

  We assume that a sub-directory with features.json file contains the dataset
  files. Every directory matched by `pattern` is walked recursively, so the
  matched directory itself is also returned when it contains features.json.

  Args:
    pattern: Root directory to search for dataset paths or a glob that matches
      a set of directories, e.g. /some/path or /some/path/prefix*. See
      tf.io.gfile.glob for the supported patterns.

  Returns:
    A sorted list, without duplicates, of the paths that contain the
    environment logs. The list is empty when nothing matches.

  Raises:
    ValueError: if the specified pattern matches a non-directory.
  """
  # A set removes duplicates when several matched roots overlap.
  paths = set()
  for root_dir in tf.io.gfile.glob(pattern):
    if not tf.io.gfile.isdir(root_dir):
      raise ValueError(f'{root_dir} is not a directory.')
    print(f'root: {root_dir}')
    for path, _, files in tf.io.gfile.walk(root_dir):
      if _METADATA_FILENAME in files:
        print(f'path: {path}')
        paths.add(path)
  # Sort so that callers combining the datasets sequentially get the same
  # order on every run (set iteration order is arbitrary).
  return sorted(paths)

# Generate a dataset

In this example, we use the local TFDS backend. 

In order to generate the dataset, use the parameters below to configure:

1. `generate_data_dir`: where the dataset will be created.
1. `num_episodes`: how many episodes to generate.
1. `max_episodes_per_shard`: maximum number of episodes to include per file (episodes will be stored in multiple files and then read as a single dataset).

In [None]:
# Directory where the generated dataset is written.
generate_data_dir='/tmp/tensorflow_datasets/catch/' # @param
# Number of episodes to record.
num_episodes= 20 # @param
# Maximum number of episodes stored in each file (shard).
max_episodes_per_shard = 1000 # @param

In [None]:
# Create the output directory; exist_ok=True keeps this cell safe to re-run.
os.makedirs(generate_data_dir, exist_ok=True)

In [None]:
def record_data(data_dir, num_episodes, max_episodes_per_shard):
  """Records random-agent episodes of the Catch environment with Envlogger.

  The environment is wrapped in an EnvLogger that uses the TFDS backend
  writer, so every step taken is written under `data_dir` in the 'train' split.

  Args:
    data_dir: Directory passed to the TFDS backend writer as data directory.
    num_episodes: Number of episodes to play and log.
    max_episodes_per_shard: Passed to the backend writer as the maximum number
      of episodes per file.
  """
  catch = catch_env.Catch()

  def add_timestamp(unused_timestep, unused_action, unused_env):
    # Per-step metadata; the key matches `step_metadata_info` in the config.
    return {'timestamp_ns': time.time_ns()}

  observation_info = tfds.features.Tensor(
      shape=(10, 5),
      dtype=tf.float32,
      encoding=tfds.features.Encoding.ZLIB)
  ds_config = tfds.rlds.rlds_base.DatasetConfig(
      name='catch_example',
      observation_info=observation_info,
      action_info=tf.int64,
      reward_info=tf.float64,
      discount_info=tf.float64,
      step_metadata_info={'timestamp_ns': tf.int64})

  with envlogger.EnvLogger(
      catch,
      backend=tfds_backend_writer.TFDSBackendWriter(
          data_directory=data_dir,
          split_name='train',
          max_episodes_per_file=max_episodes_per_shard,
          ds_config=ds_config),
      step_fn=add_timestamp) as logged_env:
    print('Done wrapping environment with EnvironmentLogger.')

    print(f'Training a random agent for {num_episodes} episodes...')
    for episode_index in range(num_episodes):
      print(f'episode {episode_index}')
      timestep = logged_env.reset()
      while not timestep.last():
        # The upper bound is exclusive: actions are drawn from {0, 1, 2}.
        timestep = logged_env.step(np.random.randint(low=0, high=3))
    print(f'Done training a random agent for {num_episodes} episodes.')

# Record `num_episodes` random-agent episodes into `generate_data_dir`.
record_data(generate_data_dir, num_episodes, max_episodes_per_shard)

Done wrapping environment with EnvironmentLogger.
Training a random agent for 20 episodes...
episode 0
episode 1
episode 2
episode 3
episode 4
episode 5
episode 6
episode 7
episode 8
episode 9
episode 10
episode 11
episode 12
episode 13
episode 14
episode 15
episode 16
episode 17
episode 18
episode 19
Done training a random agent for 20 episodes.


# Recover a dataset

If the process that generates a dataset doesn't finish properly, the last shard may be left incomplete. Envlogger provides functionality to recover this last shard.

In [None]:
# Directory of the dataset whose last shard should be recovered.
recover_dataset_path = '/tmp/tensorflow_datasets/catch/' # @param


In [None]:
# Build the dataset builder from the files in the directory, then hand it to
# Envlogger's shard recovery; the returned builder replaces the original one.
builder = tfds.core.builder_from_directory(recover_dataset_path)
builder = rlds_utils.maybe_recover_last_shard(builder)

Recovering the last shard may take some time: if the last shard was not finalized correctly, the full shard needs to be read. Another option is to skip the data
from the last shard and just call `rename_shards` to make sure that all the shard files have the correct name. (Before exiting, Envlogger renames all the shard files to comply with TFDS; if this step didn't finish properly, the filenames would be incorrect.)

In [None]:
# Alternative to recovering the last shard: only rename the shard files, using
# the split information and dataset name read from the builder.
builder = tfds.core.builder_from_directory(recover_dataset_path)
rlds_utils.rename_shards(recover_dataset_path, builder.info.splits, builder.name)

# Load one dataset

Loading one dataset generated with the TFDS backend uses just regular TFDS functionality.



In [None]:
# Directory of the dataset to load.
load_dataset_path = '/tmp/tensorflow_datasets/catch/' # @param


In [None]:
# Build the dataset from the files in the directory and read all of its splits.
loaded_dataset = tfds.core.builder_from_directory(load_dataset_path).as_dataset(split='all')

# Each element is one episode: a dict whose 'steps' entry is a nested dataset.
for e in loaded_dataset:
  print(e)


{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: t

# Load a dataset from multiple directories

Although TFDS doesn't support direct loading of one dataset from multiple directories, it is possible to do so with `tf.data` functions as long as the data has the same shape. Below we provide several examples of how to combine data from multiple runs of the environment.


In [None]:
# Root directory under which the two extra datasets are generated.
# NOTE: this is the same directory as `generate_data_dir`, so the root itself
# also holds the dataset generated earlier in this notebook.
multiple_dataset_path = '/tmp/tensorflow_datasets/catch' # @param
subdir_A = 'subdir_A' # @param
subdir_B = 'subdir_B' # @param

# Full paths of the two dataset directories.
dir_A = os.path.join(multiple_dataset_path, subdir_A)
dir_B = os.path.join(multiple_dataset_path, subdir_B)

In [None]:
# Generate one dataset per sub-directory, reusing the generation parameters
# (`num_episodes`, `max_episodes_per_shard`) defined earlier in the notebook.
os.makedirs(dir_A, exist_ok=True)
os.makedirs(dir_B, exist_ok=True)
record_data(dir_A, num_episodes, max_episodes_per_shard)
record_data(dir_B, num_episodes, max_episodes_per_shard)

Done wrapping environment with EnvironmentLogger.
Training a random agent for 20 episodes...
episode 0
episode 1
episode 2
episode 3
episode 4
episode 5
episode 6
episode 7
episode 8
episode 9
episode 10
episode 11
episode 12
episode 13
episode 14
episode 15
episode 16
episode 17
episode 18
episode 19
Done training a random agent for 20 episodes.
Done wrapping environment with EnvironmentLogger.
Training a random agent for 20 episodes...
episode 0
episode 1
episode 2
episode 3
episode 4
episode 5
episode 6
episode 7
episode 8
episode 9
episode 10
episode 11
episode 12
episode 13
episode 14
episode 15
episode 16
episode 17
episode 18
episode 19
Done training a random agent for 20 episodes.


## Option 1: Interleave. 

This should allow for better shuffling. The best parameters (`cycle_length`, `block_length`, and so on) would have to be investigated (the underlying datasets also use interleaving to shuffle the reads from the files underneath).

In [None]:
def load_tfds_dataset_interleave(
    pattern: str,
    cycle_length: int = tf.data.AUTOTUNE,
    block_length: Optional[int] = None,
    num_parallel_calls: int = tf.data.AUTOTUNE,
    deterministic: bool = False,
) -> tf.data.Dataset:
  """Loads every dataset found under `pattern` and interleaves their elements.

  Args:
    pattern: Directory or glob forwarded to `get_ds_paths` to locate the
      dataset directories.
    cycle_length: Forwarded to `tf.data.Dataset.interleave`.
    block_length: Forwarded to `tf.data.Dataset.interleave`.
    num_parallel_calls: Forwarded to `tf.data.Dataset.interleave`.
    deterministic: Forwarded to `tf.data.Dataset.interleave`.

  Returns:
    A dataset that interleaves the elements of all the datasets found.
  """
  per_dir_datasets = []
  for ds_path in get_ds_paths(pattern):
    builder = tfds.core.builder_from_directory(ds_path)
    per_dir_datasets.append(builder.as_dataset(split='all'))

  # A dataset whose elements are themselves datasets (one per directory).
  dataset_of_datasets = tf.data.Dataset.from_tensor_slices(per_dir_datasets)
  return dataset_of_datasets.interleave(
      lambda dataset: dataset,
      cycle_length=cycle_length,
      block_length=block_length,
      num_parallel_calls=num_parallel_calls,
      deterministic=deterministic)


# Combine every dataset found under the root directory by interleaving them.
ds = load_tfds_dataset_interleave(multiple_dataset_path)

# Print the first five elements of the combined dataset.
for e in ds.take(5):
  print(e)


root: /tmp/tensorflow_datasets/catch
path: /tmp/tensorflow_datasets/catch
path: /tmp/tensorflow_datasets/catch/subdir_B
path: /tmp/tensorflow_datasets/catch/subdir_A
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, t

## Option 2: Flat Map

This is the simplest option to read the datasets sequentially.

In [None]:
def load_tfds_dataset_flat_map(pattern:str) -> tf.data.Dataset:
  """Loads every dataset found under `pattern` and flattens them into one.

  Args:
    pattern: Directory or glob forwarded to `get_ds_paths` to locate the
      dataset directories.

  Returns:
    A dataset obtained by flat-mapping over the datasets found, in the order
    in which `get_ds_paths` returns their paths.
  """
  per_dir_datasets = []
  for ds_path in get_ds_paths(pattern):
    builder = tfds.core.builder_from_directory(ds_path)
    per_dir_datasets.append(builder.as_dataset(split='all'))

  # A dataset whose elements are themselves datasets (one per directory).
  dataset_of_datasets = tf.data.Dataset.from_tensor_slices(per_dir_datasets)
  return dataset_of_datasets.flat_map(lambda dataset: dataset)


# Combine every dataset found under the root directory with flat_map.
ds = load_tfds_dataset_flat_map(multiple_dataset_path)

# Print the first five elements of the combined dataset.
for e in ds.take(5):
  print(e)

root: /tmp/tensorflow_datasets/catch
path: /tmp/tensorflow_datasets/catch
path: /tmp/tensorflow_datasets/catch/subdir_B
path: /tmp/tensorflow_datasets/catch/subdir_A
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, t

## Option 3: Concatenate
Another option to read the datasets sequentially. 

In [None]:
def load_tfds_dataset_concat(pattern:str) -> tf.data.Dataset:
  """Loads every dataset found under `pattern` and concatenates them.

  Args:
    pattern: Directory or glob forwarded to `get_ds_paths` to locate the
      dataset directories.

  Returns:
    The first dataset found followed by the remaining ones, in the order in
    which `get_ds_paths` returns their paths.

  Raises:
    IndexError: if no dataset directory is found.
  """
  per_dir_datasets = []
  for ds_path in get_ds_paths(pattern):
    builder = tfds.core.builder_from_directory(ds_path)
    per_dir_datasets.append(builder.as_dataset(split='all'))

  combined = per_dir_datasets[0]
  for next_dataset in per_dir_datasets[1:]:
    combined = combined.concatenate(next_dataset)
  return combined

# Combine every dataset found under the root directory by concatenation.
ds = load_tfds_dataset_concat(multiple_dataset_path)

# Print the first five elements of the combined dataset.
for e in ds.take(5):
  print(e)

root: /tmp/tensorflow_datasets/catch
path: /tmp/tensorflow_datasets/catch
path: /tmp/tensorflow_datasets/catch/subdir_B
path: /tmp/tensorflow_datasets/catch/subdir_A
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, types: {action: tf.int64, discount: tf.float64, is_first: tf.bool, is_last: tf.bool, is_terminal: tf.bool, observation: tf.float32, reward: tf.float64, timestamp_ns: tf.int64}>}
{'steps': <_VariantDataset shapes: {action: (), discount: (), is_first: (), is_last: (), is_terminal: (), observation: (10, 5), reward: (), timestamp_ns: ()}, t