In [1]:
import tensorflow as tf
import numpy as np
import IPython.display as display

In [2]:
tf.__version__

'2.9.0'

### tf.train.Example

#### Data Types for tf.train.Example

In [3]:
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns a int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [4]:
print(_bytes_feature(b'test_string'))
print(_bytes_feature(u'test_bytes'.encode('utf-8')))

print(_float_feature(np.exp(1)))

print(_int64_feature(True))
print(_int64_feature(1))

Metal device set to: Apple M1
bytes_list {
  value: "test_string"
}

bytes_list {
  value: "test_bytes"
}

float_list {
  value: 2.7182817
}

int64_list {
  value: 1
}

int64_list {
  value: 1
}



2022-08-11 08:49:45.984244: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-11 08:49:45.984714: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
feature = _float_feature(np.exp(1))
feature1 = _float_feature(np.exp(1.1))
print(feature, '\n', type(feature))
print(feature.SerializeToString())
print(feature1.SerializeToString())

float_list {
  value: 2.7182817
}
 
 <class 'tensorflow.core.example.feature_pb2.Feature'>
b'\x12\x06\n\x04T\xf8-@'
b'\x12\x06\n\x04BD@@'


#### Creating a tf.train.

In [6]:
n_observations = int(1e4)

feature0 = np.random.choice([False, True], n_observations)

feature1 = np.random.randint(0, 5, n_observations)

strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1] # you can't make random string list if you use list type

feature3 = np.random.randn(n_observations)

In [7]:
def serialize_example(feature0, feature1, feature2, feature3):
    feature = {
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    }
    
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [8]:
example_observation = []

serialized_example = serialize_example(False, 4, b'goat', 0.9876)
serialized_example

b'\nR\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?'

In [9]:
example_proto = tf.train.Example.FromString(serialized_example)
example_proto

features {
  feature {
    key: "feature0"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature1"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "feature2"
    value {
      bytes_list {
        value: "goat"
      }
    }
  }
  feature {
    key: "feature3"
    value {
      float_list {
        value: 0.9876
      }
    }
  }
}

### TFRecords format details

In [10]:
# uint64 length
# uint32 masked_crc32_of_length
# byte   data[length]
# uint32 masked_crc32_of_data

In [11]:
# masked_crc = ((crc >> 15) | (crc << 17)) + 0xa282ead8ul

### TFRecord files using tf.data

#### Writing a TFRecord file

In [12]:
tf.data.Dataset.from_tensor_slices(feature1)

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [13]:
features_dataset = tf.data.Dataset.from_tensor_slices((feature0, feature1, feature2, feature3))
features_dataset

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.bool, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>

In [14]:
for a, b, c, d in features_dataset.take(1):
    print(a, '\n', b, '\n', c, '\n', d)
    print(a.numpy(),  b.numpy(), c.numpy(), d.numpy())
print(feature0[0], feature1[0], feature2[0], feature3[0])

tf.Tensor(True, shape=(), dtype=bool) 
 tf.Tensor(4, shape=(), dtype=int64) 
 tf.Tensor(b'goat', shape=(), dtype=string) 
 tf.Tensor(1.039732709347987, shape=(), dtype=float64)
True 4 b'goat' 1.039732709347987
True 4 b'goat' 1.039732709347987


In [15]:
def tf_serialize_example(f0, f1, f2, f3):
    tf_string = tf.py_function(serialize_example, 
                              (f0, f1, f2, f3),
                              tf.string)
    return tf.reshape(tf_string, ())

In [16]:
tf_serialize_example(a, b, c, d)

2022-08-11 08:50:00.367405: W tensorflow/core/framework/op_kernel.cc:1733] INVALID_ARGUMENT: TypeError: <tf.Tensor: shape=(), dtype=bool, numpy=True> has type <class 'tensorflow.python.framework.ops.EagerTensor'>, but expected one of: (<class 'int'>,)
Traceback (most recent call last):

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    ret = self._func(*args)

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "/var/folders/99

InvalidArgumentError: TypeError: <tf.Tensor: shape=(), dtype=bool, numpy=True> has type <class 'tensorflow.python.framework.ops.EagerTensor'>, but expected one of: (<class 'int'>,)
Traceback (most recent call last):

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    ret = self._func(*args)

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "/var/folders/99/s34zzfr57xs9_1q57tnxdmrr0000gn/T/ipykernel_24837/1663365771.py", line 3, in serialize_example
    'feature0': _int64_feature(feature0),

  File "/var/folders/99/s34zzfr57xs9_1q57tnxdmrr0000gn/T/ipykernel_24837/2824841238.py", line 16, in _int64_feature
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/google/protobuf/internal/python_message.py", line 542, in init
    copy.extend(field_value)

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/google/protobuf/internal/containers.py", line 143, in extend
    new_values = [self._type_checker.CheckValue(elem) for elem in elem_seq_iter]

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/google/protobuf/internal/containers.py", line 143, in <listcomp>
    new_values = [self._type_checker.CheckValue(elem) for elem in elem_seq_iter]

  File "/Users/rainyseason/miniforge3/envs/tf29/lib/python3.10/site-packages/google/protobuf/internal/type_checkers.py", line 155, in CheckValue
    raise TypeError(message)

TypeError: <tf.Tensor: shape=(), dtype=bool, numpy=True> has type <class 'tensorflow.python.framework.ops.EagerTensor'>, but expected one of: (<class 'int'>,)

 [Op:EagerPyFunc]

In [None]:
serialize_example(False, 1, b'dog', -0.410451315502222)

In [None]:
tf_serialize_example(False, 1, b'dog', -0.410451315502222)

In [None]:
serialized_features_dataset = features_dataset.map(tf_serialize_example)
serialized_features_dataset

In [None]:
def generator():
    for features in features_dataset:
        yield serialize_example(*features)

In [None]:
serialized_features_dataset = tf.data.Dataset.from_generator(generator,
                                                            output_types=tf.string,
                                                            output_shapes=())

In [None]:
serialized_features_dataset

And write them to a TFRecord file:

In [None]:
filename = 'test.tfrecord'
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(serialized_features_dataset)

#### Reading a TFRecord file

In [None]:
filenames = [filename]
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset