# 目录
## 1. 导入模块
## 2. dataset从内存中读取数据
  - `tf.data.Dataset.from_tensor_slices`
  - `make_one_shot_iterator`
  - `get_next`
  
## 3. dataset从文件（CSV）读取数据
  - ### 3.1 生成CSV文件
  - ### 3.2 读取CSV文件
    - `tf.data.Dataset.list_files`
    - `interleave` 和 `tf.data.TextLineDataset` 和 `skip`
    - `map` 和 `tf.io.decode_csv`
    - `make_one_shot_iterator`
    - `get_next`

## 4. dataset从tf_records读取数据
  - ### 4.1 制作mnist数据tf_records
    - `np.array_split`
    - `tf.python_io.TFRecordWriter`
    - `tf.train.Example` 和 `tf.train.Features` 和 `tf.train.Feature` 和 `tf.train.BytesList` 和 `tf.train.Int64List`
    - `SerializeToString`
  - ### 4.2 读取tf_records数据
    - `tf.data.Dataset.list_files`
    - `interleave` 和 `tf.data.TFRecordDataset`
    - `map` 和 `tf.io.parse_single_example` `tf.io.FixedLenFeature` `tf.io.decode_raw`
    - `shuffle`
    - `repeat`
    - `batch`
    - `make_one_shot_iterator`
    - `get_next`
    
## 5. initializable_iterator动态初始化数据
  - `placeholder`
  - `tf.data.Dataset.list_files`
  - `interleave` 和 `tf.data.TextLineDataset` 和 `skip`
  - `map` 和 `tf.io.decode_csv`
  - `make_initializable_iterator`
  - `get_next`
  - `initializer`

## 1. 导入模块

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from tensorflow import keras
import tensorflow as tf
import sys
import os
import time
import datetime

# Print the name and version of every library used below, to record the
# environment the notebook was run with.
for module in [np, pd, mpl, sklearn, keras, tf]:
    print(module.__name__, module.__version__)

numpy 1.17.2
pandas 0.25.1
matplotlib 3.1.1
sklearn 0.21.3
tensorflow.python.keras.api._v1.keras 2.2.4-tf
tensorflow 1.15.0


## 2. dataset从内存中读取数据

In [2]:
# Build a dataset whose elements are the individual entries of a NumPy array.
array_data = np.array([1.0, 2.4, 6.9])
dataset = tf.data.Dataset.from_tensor_slices(array_data)

# One-shot iterator: no separate initialization op is run before use.
# get_next() returns a tensor; every sess.run() of it yields the next element.
dataset_iter = dataset.make_one_shot_iterator()
item = dataset_iter.get_next()

with tf.Session() as sess:
    # Exactly three evaluations, one per entry of array_data.
    for _ in range(3):
        value = sess.run(item)
        print(value)

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
1.0
2.4
6.9


## 3. dataset从文件（CSV）读取数据

  - ### 3.1 生成CSV文件

In [3]:
# Directory that holds the two small demo CSV files.
output_dir = "tf1_dataset"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a single
# call and does not fail if the directory appears between check and creation.
os.makedirs(output_dir, exist_ok=True)

csv_path01 = os.path.join(output_dir, "data_01.csv")
csv_path02 = os.path.join(output_dir, "data_02.csv")

# Each file holds one header row followed by three records (age, sex, salary).
# The last record is deliberately written without a trailing newline, exactly
# as in the original files.
with open(csv_path01, "wt") as f:
    f.write("age,sex,salary\n")
    f.write("23,men,2334.8\n")
    f.write("24,men,5433.7\n")
    f.write("25,women,7334.8")

with open(csv_path02, "wt") as f:
    f.write("age,sex,salary\n")
    f.write("26,men,2334.8\n")
    f.write("27,men,5433.7\n")
    f.write("28,women,7334.8")

In [4]:
# Read the first generated CSV back with pandas; the bare expression on the
# last line makes the notebook display the resulting DataFrame.
df_data1 = pd.read_csv(csv_path01)
df_data1

Unnamed: 0,age,sex,salary
0,23,men,2334.8
1,24,men,5433.7
2,25,women,7334.8


In [5]:
# Read the second generated CSV back with pandas; the bare expression on the
# last line makes the notebook display the resulting DataFrame.
df_data2 = pd.read_csv(csv_path02)
df_data2

Unnamed: 0,age,sex,salary
0,26,men,2334.8
1,27,men,5433.7
2,28,women,7334.8


  - ### 3.2 读取CSV文件

In [6]:
# Naive alternative, kept for reference only:
# csv_dataset = tf.data.TextLineDataset([csv_path01, csv_path02]).skip(1)
# It is wrong here: skip(1) drops only the very first line, i.e. the header of
# the first file; the header row of the second file would remain in the data.

# Dataset of file names
filename_dataset = tf.data.Dataset.list_files([csv_path01, csv_path02])

# For every file name, read that file's lines (dropping its own header row)
# and interleave the per-file line datasets into a single dataset.
csv_dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1)
)

# 解析
def parse_csv(line):
    """Parse one CSV text line into an ``(age, sex, salary)`` tuple of tensors.

    Args:
        line: Scalar string tensor holding a single ``age,sex,salary`` record.

    Returns:
        Tuple ``(age, sex, salary)`` of tensors typed int64, string and
        float32 respectively, matching the per-column defaults below.
    """
    # One default per column; it is used when the field is empty and its
    # dtype is the dtype of the parsed column.
    record_defaults = [
        tf.constant(0, dtype=tf.int64),
        tf.constant("gg", dtype=tf.string),
        tf.constant(0.0, dtype=tf.float32),
    ]
    # decode_csv returns a list with one tensor per column.
    age, sex, salary = tf.io.decode_csv(line, record_defaults)
    return age, sex, salary

# Apply parse_csv to every element; map does not change the number of elements.
dataset = csv_dataset.map(parse_csv)

# Create the iterator
dataset_iter = dataset.make_one_shot_iterator()

# Tensors that yield the three fields of the next record when evaluated
age, sex, salary = dataset_iter.get_next()

with tf.Session() as sess:
    # Six iterations: three data rows in each of the two CSV files.
    for _ in range(6):
        # Fetch all three fields in one run() call so they belong to the same record.
        age_value, sex_value, salary_value = sess.run([age, sex, salary])
        print("name: age, type: {}, value: {}".format(type(age_value), age_value))
        print("name: sex, type: {}, value: {}".format(type(sex_value), sex_value))
        print("name: salary, type: {}, value: {}".format(type(salary_value), salary_value))

name: age, type: <class 'numpy.int64'>, value: 23
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 2334.800048828125
name: age, type: <class 'numpy.int64'>, value: 26
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 2334.800048828125
name: age, type: <class 'numpy.int64'>, value: 24
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 5433.7001953125
name: age, type: <class 'numpy.int64'>, value: 27
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 5433.7001953125
name: age, type: <class 'numpy.int64'>, value: 25
name: sex, type: <class 'bytes'>, value: b'women'
name: salary, type: <class 'numpy.float32'>, value: 7334.7998046875
name: age, type: <class 'numpy.int64'>, value: 28
name: sex, type: <class 'bytes'>, value: b'women'
name: salary, type: <class 'numpy.float32'>, value: 7334.7998046875

## 4. dataset从tf_records读取数据

  - ### 4.1 制作mnist数据tf_records

In [7]:
from tensorflow.examples.tutorials.mnist import input_data

# Directory for the MNIST archives and the TFRecord shards generated from them.
output_dir = "tf1_data"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a single
# call and does not fail if the directory appears between check and creation.
os.makedirs(output_dir, exist_ok=True)

# Load MNIST with uint8 pixels and integer (non one-hot) labels. The data
# directory is derived from output_dir instead of repeating the "tf1_data"
# literal, so the two cannot drift apart.
mnist = input_data.read_data_sets(os.path.join(output_dir, "MNIST_data"), dtype=tf.uint8, one_hot=False)

def mnist_to_tfrecords(images, labels, save_dir, name_prefix, n_parts):
    """Write (image, label) pairs into ``n_parts`` TFRecord shard files.

    Row indices ``0 .. len(images) - 1`` are split into ``n_parts`` consecutive,
    near-equal groups with ``np.array_split``. Each group is written to its own
    file named ``<name_prefix>_<shard>-of-<n_parts>.tfrecords``, where both
    numbers are zero-padded to two digits and ``shard`` starts at 0.

    Args:
        images: Indexable collection supporting ``len()`` whose items expose
            ``tobytes()`` (e.g. rows of a NumPy array). Only the raw bytes are
            stored, so the reader has to know the dtype and shape.
        labels: Indexable collection aligned with ``images``; each item is
            stored as a single int64 value.
        save_dir: Directory the shard files are written into. This function
            does not create it.
        name_prefix: File-name prefix of every shard, e.g. ``"train"``.
        n_parts: Number of shard files to produce.

    Returns:
        List of the shard file paths, in shard order.
    """
    path_format = os.path.join(save_dir, "{}_{:02d}-of-{:02d}.tfrecords")
    all_filenames = []

    index_groups = np.array_split(np.arange(len(images)), n_parts)
    for file_index, row_indices in enumerate(index_groups):
        filename_fullpath = path_format.format(name_prefix, file_index, n_parts)
        all_filenames.append(filename_fullpath)

        with tf.python_io.TFRecordWriter(filename_fullpath) as writer:
            for row_index in row_indices:
                # tobytes() replaces ndarray.tostring(), a deprecated alias that
                # produced the same bytes and no longer exists in NumPy 2.0.
                image_bytes = images[row_index].tobytes()
                example = tf.train.Example(
                    features=tf.train.Features(
                        feature={
                            "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_bytes])),
                            "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[labels[row_index]])),
                        }
                    )
                )
                writer.write(example.SerializeToString())
    return all_filenames

# Write the MNIST training split as 10 TFRecord shards and keep their paths.
train_filenames = mnist_to_tfrecords(mnist.train.images, mnist.train.labels, output_dir, "train", 10)

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting tf1_data/MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting tf1_data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting tf1_data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting tf1_data/MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


In [8]:
# Display the generated shard paths.
train_filenames

['tf1_data/train_00-of-10.tfrecords',
 'tf1_data/train_01-of-10.tfrecords',
 'tf1_data/train_02-of-10.tfrecords',
 'tf1_data/train_03-of-10.tfrecords',
 'tf1_data/train_04-of-10.tfrecords',
 'tf1_data/train_05-of-10.tfrecords',
 'tf1_data/train_06-of-10.tfrecords',
 'tf1_data/train_07-of-10.tfrecords',
 'tf1_data/train_08-of-10.tfrecords',
 'tf1_data/train_09-of-10.tfrecords']

  - ### 4.2 读取tf_records数据

In [9]:
# Dataset of TFRecord shard file names
filename_dataset = tf.data.Dataset.list_files(train_filenames)

# Read the records of each file and interleave them into a single dataset;
# cycle_length is the number of files consumed from at the same time.
raw_dataset = filename_dataset.interleave(
    lambda filename: tf.data.TFRecordDataset(filename),
    cycle_length = 3
)

# 定义解析方法
def parse_serialized_example(serialized_example):
    """Decode one serialized ``tf.train.Example`` into an ``(image, label)`` pair.

    Args:
        serialized_example: Scalar string tensor containing a serialized
            Example with an ``"image"`` bytes feature and a ``"label"`` int64
            feature.

    Returns:
        Tuple ``(image, label)``: ``image`` is a uint8 tensor of shape
        ``[784]`` and ``label`` is a scalar int64 tensor.
    """
    # Both features are declared as length-1 vectors.
    feature_spec = {
        "image": tf.io.FixedLenFeature([1], tf.string),
        "label": tf.io.FixedLenFeature([1], tf.int64),
    }
    parsed = tf.io.parse_single_example(serialized_example, features=feature_spec)

    # Reinterpret the stored byte string as uint8 values, then flatten the
    # result to the 784 values of one image.
    pixels = tf.reshape(tf.io.decode_raw(parsed["image"], tf.uint8), shape=[784])

    # Take element 0 of the length-1 label vector to get a scalar.
    return pixels, parsed["label"][0]

# map: parse every serialized record, one element in -> one element out
dataset = raw_dataset.map(
    parse_serialized_example,
    num_parallel_calls=2)

epochs = 100
batch_size = 32

# shuffle: randomize element order using a buffer of 10000 elements
dataset = dataset.shuffle(10000)

# repeat: iterate over the data `epochs` times
dataset = dataset.repeat(epochs) # passing None instead would repeat indefinitely


# 使用map 做数据增强，元素一一对应处理
'''
def enhance_data(image):
    
#    图片处理过程, resize固定大小, 翻转，裁剪，变色，等等
    
    return image

dataset = dataset.map(enhance_data)
'''
# batch: group consecutive elements into batches of batch_size
dataset = dataset.batch(batch_size=batch_size)

# Create the iterator
dataset_iter = dataset.make_one_shot_iterator()

# Tensors that yield the next batch when evaluated
images, labels = dataset_iter.get_next()

with tf.Session() as sess:
    # Pull ten batches and report their shapes.
    for _ in range(10):
        images_v, label_v = sess.run([images, labels])

        print("images: {}, labels: {}".format(images_v.shape, label_v.shape))


images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)
images: (32, 784), labels: (32,)


## 5. initializable_iterator动态初始化数据

In [10]:
# Placeholder for the file names; its value is fed when the iterator is initialized.
filenames_placeholder = tf.placeholder(tf.string)   # differs from section 3.2: the file names are not fixed when the graph is built

# Dataset of file names
filename_dataset = tf.data.Dataset.list_files(filenames_placeholder)

# For every file name, read that file's lines (dropping its own header row)
# and interleave the per-file line datasets into a single dataset.
csv_dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1)
)

# 解析
def parse_csv(line):
    """Parse one CSV text line into an ``(age, sex, salary)`` tuple of tensors.

    Args:
        line: Scalar string tensor holding a single ``age,sex,salary`` record.

    Returns:
        Tuple ``(age, sex, salary)`` of tensors typed int64, string and
        float32 respectively, matching the per-column defaults below.
    """
    # One default per column; it is used when the field is empty and its
    # dtype is the dtype of the parsed column.
    record_defaults = [
        tf.constant(0, dtype=tf.int64),
        tf.constant("gg", dtype=tf.string),
        tf.constant(0.0, dtype=tf.float32),
    ]
    # decode_csv returns a list with one tensor per column.
    age, sex, salary = tf.io.decode_csv(line, record_defaults)
    return age, sex, salary

# Apply parse_csv to every element; map does not change the number of elements.
dataset = csv_dataset.map(parse_csv)

# Create an initializable iterator: its initializer op must be run before the
# first get_next() evaluation.
dataset_iter = dataset.make_initializable_iterator()

# Tensors that yield the three fields of the next record when evaluated
age, sex, salary = dataset_iter.get_next()

with tf.Session() as sess:

    sess.run(dataset_iter.initializer, feed_dict = {filenames_placeholder: [csv_path01, csv_path02]}) # differs from section 3.2: the file names are supplied here, at initialization time

    # Six iterations: three data rows in each of the two CSV files.
    for _ in range(6):
        # Fetch all three fields in one run() call so they belong to the same record.
        age_value, sex_value, salary_value = sess.run([age, sex, salary])
        print("name: age, type: {}, value: {}".format(type(age_value), age_value))
        print("name: sex, type: {}, value: {}".format(type(sex_value), sex_value))
        print("name: salary, type: {}, value: {}".format(type(salary_value), salary_value))

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.
name: age, type: <class 'numpy.int64'>, value: 23
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 2334.800048828125
name: age, type: <class 'numpy.int64'>, value: 26
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 2334.800048828125
name: age, type: <class 'numpy.int64'>, value: 24
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 5433.7001953125
name: age, type: <class 'numpy.int64'>, value: 27
name: sex, type: <class 'bytes'>, value: b'men'
name: salary, type: <class 'numpy.float32'>, value: 5433.7001953125
name: age, type: <class 'numpy.int64'>, value: 25
name: sex, type: <class 'b