## dataset api
### tensorflow 2.0 api
* https://www.tensorflow.org/api_docs/python/tf/data/Dataset

In [1]:
import tensorflow as tf
tf.__version__


'2.3.0'

In [2]:
import numpy as np
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
for element in dataset:
    print(element)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)


In [3]:
dataset=tf.data.TextLineDataset(["file1.txt","file2.txt"])
dataset=tf.data.TFRecordDataset(["file1.tfrecords","file2.tfrecords"])
# dataset=tf.data.Dataset.list_files("/path/*.txt")

In [4]:
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
dataset = dataset.map(lambda x: x*2)
list(dataset.as_numpy_iterator())


[2, 4, 6]

In [11]:
import collections
a = 1 # Integer element
b = 2.0 # Float element
c = (1, 2) # Tuple element with 2 components
d = {"a": (2, 2), "b": 3} # Dict element with 3 components
Point = collections.namedtuple("Point", ["x", "y"]) # doctest: +SKIP
e = Point(1, 2) # Named tuple # doctest: +SKIP
f = tf.data.Dataset.range(10) # Dataset element

In [13]:
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
dataset.element_spec


TensorSpec(shape=(), dtype=tf.int32, name=None)

In [18]:
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
for element in dataset.as_numpy_iterator():
    print(element)




1
2
3


In [16]:
dataset = tf.data.Dataset.range(100)
def dataset_fn(ds):
    return ds.filter(lambda x: x < 5)
dataset = dataset.apply(dataset_fn)
list(dataset.as_numpy_iterator())


[0, 1, 2, 3, 4]

In [19]:
dataset = tf.data.Dataset.from_tensor_slices({'a': ([1, 2], [3, 4]),
                                              'b': [5, 6]})
list(dataset.as_numpy_iterator()) == [{'a': (1, 3), 'b': 5},
                                      {'a': (2, 4), 'b': 6}]


True

In [22]:
dataset = tf.data.Dataset.range(8)
dataset = dataset.batch(3)
list(dataset.as_numpy_iterator())


[array([0, 1, 2]), array([3, 4, 5]), array([6, 7])]

In [24]:
dataset = tf.data.Dataset.range(8)
#drop_remainder A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements; the default behavior is not to drop the smaller batch.
dataset = dataset.batch(3, drop_remainder=True)
list(dataset.as_numpy_iterator())


[array([0, 1, 2]), array([3, 4, 5])]

In [26]:
dataset = tf.data.Dataset.range(5)
dataset = dataset.map(lambda x: x**2)
dataset = dataset.cache()
# The first time reading through the data will generate the data using
# `range` and `map`.
list(dataset.as_numpy_iterator())

# Subsequent iterations read from the cache.
list(dataset.as_numpy_iterator())


[0, 1, 4, 9, 16]

In [28]:
dataset = tf.data.Dataset.range(5)
dataset = dataset.cache("./file")  # doctest: +SKIP
list(dataset.as_numpy_iterator())  # doctest: +SKIP
# [0,1,2,3,4]
dataset = tf.data.Dataset.range(10)
dataset = dataset.cache("./file")  # Same file! # doctest: +SKIP
list(dataset.as_numpy_iterator())  # doctest: +SKIP
# [0,1,2,3,4]
# If you wish to randomize the iteration order, make sure to call shuffle after calling cache.

[0, 1, 2, 3, 4]

In [31]:
a = tf.data.Dataset.range(1, 4)  # ==> [ 1, 2, 3 ]
b = tf.data.Dataset.range(4, 8)  # ==> [ 4, 5, 6, 7 ]
ds = a.concatenate(b)
list(ds.as_numpy_iterator())

[1, 2, 3, 4, 5, 6, 7]

In [33]:
# The input dataset and dataset to be concatenated should have the same
# nested structures and output types.
c = tf.data.Dataset.zip((a, b))
a.concatenate(c)

TypeError: Two datasets to concatenate have different types <dtype: 'int64'> and (tf.int64, tf.int64)

In [35]:
a = tf.data.Dataset.range(1, 5)  
b = tf.data.Dataset.range(4, 8)
c=tf.data.Dataset.zip((a,b))
c

<ZipDataset shapes: ((), ()), types: (tf.int64, tf.int64)>

In [37]:
d = tf.data.Dataset.from_tensor_slices(["a", "b", "c"])
a.concatenate(d)

TypeError: Two datasets to concatenate have different types <dtype: 'int64'> and <dtype: 'string'>

In [39]:
dataset=tf.data.Dataset.from_tensor_slices([1,2,3])
dataset=dataset.enumerate(start=5)
for element in dataset.as_numpy_iterator():
    print(element)

(5, 1)
(6, 2)
(7, 3)


In [41]:
# The nested structure of the input dataset determines the structure of
# elements in the resulting dataset.
dataset = tf.data.Dataset.from_tensor_slices([(7, 8), (9, 10)])
dataset = dataset.enumerate()
for element in dataset.as_numpy_iterator():
    print(element)

(0, array([7, 8], dtype=int32))
(1, array([ 9, 10], dtype=int32))


In [43]:
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
dataset = dataset.filter(lambda x: x < 3)
list(dataset.as_numpy_iterator())

# `tf.math.equal(x, y)` is required for equality comparison
def filter_fn(x):
    return tf.math.equal(x, 1)
dataset = dataset.filter(filter_fn)
list(dataset.as_numpy_iterator())


[1]

In [55]:
dataset = tf.data.Dataset.from_tensor_slices(
               [[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# from tensorflow.data import *
dataset = dataset.flat_map(lambda x:tf.data.Dataset.from_tensor_slices(x))
list(dataset.as_numpy_iterator())


[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [56]:
dataset = Dataset.range(1, 6)  # ==> [ 1, 2, 3, 4, 5 ]
dataset = dataset.map(lambda x: x + 1)
list(dataset.as_numpy_iterator())


[2, 3, 4, 5, 6]

In [57]:
dataset = Dataset.range(5)
# `map_func` takes a single argument of type `tf.Tensor` with the same
# shape and dtype.
result = dataset.map(lambda x: x + 1)

In [59]:
# Each element is a tuple containing two `tf.Tensor` objects.
elements = [(1, "foo"), (2, "bar"), (3, "baz")]
dataset = tf.data.Dataset.from_generator(
    lambda: elements, (tf.int32, tf.string))
# `map_func` takes two arguments of type `tf.Tensor`. This function
# projects out just the first component.
result = dataset.map(lambda x_int, y_str: x_int)
list(result.as_numpy_iterator())


[1, 2, 3]

In [61]:
# Each element is a dictionary mapping strings to `tf.Tensor` objects.
elements =  ([{"a": 1, "b": "foo"},
              {"a": 2, "b": "bar"},
              {"a": 3, "b": "baz"}])
dataset = tf.data.Dataset.from_generator(
    lambda: elements, {"a": tf.int32, "b": tf.string})
# `map_func` takes a single argument of type `dict` with the same keys
# as the elements.
result = dataset.map(lambda d: str(d["a"]) + d["b"])

In [64]:
print(result)

<MapDataset shapes: <unknown>, types: tf.string>


###  Python primitives, lists, and NumPy arrays are implicitly converted to `tf.Tensor`.

In [65]:

dataset = tf.data.Dataset.range(3)
# `map_func` returns two `tf.Tensor` objects.
def g(x):
    return tf.constant(37.0), tf.constant(["Foo", "Bar", "Baz"])
result = dataset.map(g)
result.element_spec

# Python primitives, lists, and NumPy arrays are implicitly converted to
# `tf.Tensor`.
def h(x):
    return 37.0, ["Foo", "Bar"], np.array([1.0, 2.0], dtype=np.float64)
result = dataset.map(h)
result.element_spec

# `map_func` can return nested structures.
def i(x):
    return (37.0, [42, 16]), "foo"
result = dataset.map(i)
result.element_spec




((TensorSpec(shape=(), dtype=tf.float32, name=None),
  TensorSpec(shape=(2,), dtype=tf.int32, name=None)),
 TensorSpec(shape=(), dtype=tf.string, name=None))

In [67]:
d=tf.data.Dataset.from_tensor_slices(['hello','world'])
def upper_case_fn(t:tf.Tensor):
    return t.numpy().decode('utf-8').upper()
d=d.map(lambda x:tf.py_function(func=upper_case_fn,
                               inp=[x],Tout=tf.string))
list(d.as_numpy_iterator())

[b'HELLO', b'WORLD']

In [None]:
# Performance can often be improved by setting num_parallel_calls 
# so that map will use multiple threads to process elements. 
# If deterministic order isn't required, it can also improve performance 
# to set deterministic=False.
dataset = Dataset.range(1, 6)  # ==> [ 1, 2, 3, 4, 5 ]
dataset = dataset.map(lambda x: x + 1,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False)

## dataset.padded_batch

### tf2 例子

In [68]:
A = (tf.data.Dataset.range(1, 5, output_type=tf.int32)
     .map(lambda x: tf.fill([x], x)))
# Pad to the smallest per-batch size that fits all elements.
B = A.padded_batch(2)
for element in B.as_numpy_iterator():
      print(element)



Cause: could not parse the source code:

     .map(lambda x: tf.fill([x], x)))

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

     .map(lambda x: tf.fill([x], x)))

This error may be avoided by creating the lambda in a standalone statement.

[[1 0]
 [2 2]]
[[3 3 3 0]
 [4 4 4 4]]


In [75]:
print(list(A.as_numpy_iterator()))

# Pad to a fixed size.
C = A.padded_batch(2, padded_shapes=5)
for element in C.as_numpy_iterator():
      print(element)

[array([1], dtype=int32), array([2, 2], dtype=int32), array([3, 3, 3], dtype=int32), array([4, 4, 4, 4], dtype=int32)]
[[1 0 0 0 0]
 [2 2 0 0 0]]
[[3 3 3 0 0]
 [4 4 4 4 0]]


In [76]:

# Pad with a custom value.
D = A.padded_batch(2, padded_shapes=5, padding_values=-1)
for element in D.as_numpy_iterator():
      print(element)

[[ 1 -1 -1 -1 -1]
 [ 2  2 -1 -1 -1]]
[[ 3  3  3 -1 -1]
 [ 4  4  4  4 -1]]


In [77]:

# Components of nested elements can be padded independently.
elements = [([1, 2, 3], [10]),
            ([4, 5], [11, 12])]
dataset = tf.data.Dataset.from_generator(
    lambda: iter(elements), (tf.int32, tf.int32))
# Pad the first component of the tuple to length 4, and the second
# component to the smallest size that fits.
dataset = dataset.padded_batch(2,
    padded_shapes=([4], [None]),
    padding_values=(-1, 100))
list(dataset.as_numpy_iterator())


# Pad with a single value and multiple components.
E = tf.data.Dataset.zip((A, A)).padded_batch(2, padding_values=-1)
for element in E.as_numpy_iterator():
    print(element)


(array([[ 1, -1],
       [ 2,  2]], dtype=int32), array([[ 1, -1],
       [ 2,  2]], dtype=int32))
(array([[ 3,  3,  3, -1],
       [ 4,  4,  4,  4]], dtype=int32), array([[ 3,  3,  3, -1],
       [ 4,  4,  4,  4]], dtype=int32))



### tf1.x例子
* https://blog.csdn.net/z2539329562/article/details/89791783

In [None]:
import tensorflow as tf
# 
# tf1.x需要手动开启eager模式
tf.enable_eager_execution()
tf.__version__
print(tf.executing_eagerly())

In [4]:

tf.reset_default_graph()
 
x = [[1, 0, 0],
     [2, 3, 0],
     [4, 5, 6],
     [7, 8, 0],
     [9, 0, 0],
     [0, 1, 0]]
x_new = [np.array(i) for i in x]
print(x_new)

[array([1, 0, 0]), array([2, 3, 0]), array([4, 5, 6]), array([7, 8, 0]), array([9, 0, 0]), array([0, 1, 0])]


In [5]:
#tf.TensorShape([])     表示长度为单个数字
#tf.TensorShape([None]) 表示长度未知的向量
padded_shapes=(
        tf.TensorShape([None])
        )
 
#   padded_shapes=(
#        tf.TensorShape([None]),
#        )
#TypeError: Expected int64, got TensorShape([Dimension(None)]) of type 'TensorShape' instead.
# 注意，在tf.TensorShape([None])后面不能添加 ",",因为这里递归嵌套，会认为","后面还有一维数据，
# 只是数据格式为 None。
 
dataset = tf.data.Dataset.from_tensor_slices(x)
iterator1=dataset.make_one_shot_iterator()

dataset = dataset.padded_batch(2, padded_shapes=padded_shapes)
iterator = dataset.make_one_shot_iterator()
sess = tf.Session()
try:
    while True:
        print(sess.run(iterator.get_next()))
        print("="*10)
#         print(sess.run(iterator1.get_next()))
except tf.errors.OutOfRangeError:
    print("end")

RuntimeError: The Session graph is empty.  Add operations to the graph before calling run().

In [None]:
dataset = tf.data.Dataset.range(100)
dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))

# dataset = dataset.padded_batch(4, padded_shapes=[None])
dataset = dataset.batch(2)
 
 

In [None]:
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
 
print(sess.run(next_element)) 
# print(sess.run(next_element))  
# print(sess.run(next_element))  
# print(sess.run(next_element))  
# print(sess.run(next_element))  

In [None]:

tf.reset_default_graph()
 
x = [[1, 0, 0],
     [2, 3, 0],
     [4, 5, 6],
     [7, 8, 0],
     [9, 0, 0],
     [0, 1, 0]]
 
 
 
#tf.TensorShape([])     表示长度为单个数字
#tf.TensorShape([None]) 表示长度未知的向量
padded_shapes=(
        tf.TensorShape([]),
        tf.TensorShape([]),
        tf.TensorShape([])
        )
 
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.map(lambda x: [x[0], x[1], x[2]])
dataset = dataset.padded_batch(2, padded_shapes=padded_shapes)
iterator = dataset.make_one_shot_iterator()
sess = tf.Session()
try:
    while True:
        elem1, elem2, elem3 = iterator.get_next()
        print("elem:", sess.run(elem1))
except tf.errors.OutOfRangeError:
    print("end")