# 目录
## 1. 导入模块
## 2. 制作tf_records的基础API及步骤
  - `tf.train.Example`
  - `tf.train.Features`
  - `tf.train.Feature`
  - `tf.train.BytesList`
  - `tf.train.FloatList`
  - `tf.train.Int64List`
  - `SerializeToString`
  - `tf.io.TFRecordWriter`
  
## 3. 读取tfrecords的API（步骤）
  - `tf.data.TFRecordDataset`
  - `tf.io.VarLenFeature`
  - `tf.io.FixedLenFeature`
  - `tf.io.parse_single_example`
  - `tf.sparse.to_dense`
  
## 4. 压缩tfrecord
### 4.1 制作压缩文件
  - `tf.train.Example`
  - `tf.train.Features`
  - `tf.train.Feature`
  - `tf.train.BytesList`
  - `tf.train.FloatList`
  - `tf.train.Int64List`
  - `SerializeToString`
  - `tf.io.TFRecordOptions` 参数 `compression_type="GZIP"`
  - `tf.io.TFRecordWriter` 多传一个参数`options`

### 4.2 读取压缩文件
  - `tf.data.TFRecordDataset` 多传一个参数 `compression_type="GZIP"`
  - `tf.io.VarLenFeature`
  - `tf.io.FixedLenFeature`
  - `tf.io.parse_single_example`
  - `tf.sparse.to_dense`

## 1. 导入模块

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from tensorflow import keras
import tensorflow as tf
import sys
import os
import time
import datetime

# Record the environment: print the name and version of every library
# imported above.
for lib in (np, pd, mpl, sklearn, keras, tf):
    print(lib.__name__, lib.__version__)

numpy 1.18.1
pandas 0.25.3
matplotlib 3.1.2
sklearn 0.22.1
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf
tensorflow 2.1.0


## 2. 制作tf_records的基础API及步骤
>数据结构<br/>
`tf.train.Example` <br/>
`tf.train.Features` ---> {"key": `tf.train.Feature`}<br/>
`tf.train.Feature` ---> `tf.train.BytesList` / `tf.train.FloatList` / `tf.train.Int64List`

In [3]:
# Build one tf.train.Example from the bottom up and write it to a TFRecord
# file. Nesting: Example -> Features -> {key: Feature} -> typed value list.

# Step 1: typed value lists
# (tf.train.BytesList / tf.train.FloatList / tf.train.Int64List).
# BytesList holds bytes, so the strings are UTF-8 encoded first.
favorite_book = [name.encode("utf-8") for name in ["machine learning", "deep learning"]]
favorite_book_byteslist = tf.train.BytesList(value=favorite_book)
print(favorite_book_byteslist)

# The printed values come back float32-rounded (e.g. 45.4 prints as
# 45.400001525878906).
hours_floatlist = tf.train.FloatList(value=[45.4, 45.8, 78.45, 78.0])
print(hours_floatlist)

age_int64list = tf.train.Int64List(value=[32])
print(age_int64list)

# Step 2: wrap each typed list in a tf.train.Feature, keyed by feature name.
feature = {
    "favorite_book": tf.train.Feature(bytes_list=favorite_book_byteslist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list)
}

# Step 3: tf.train.Features wraps the {name: Feature} mapping.
features = tf.train.Features(feature=feature)

# Step 4: tf.train.Example wraps the Features message.
example = tf.train.Example(features=features)

# The same Example can be built in a single nested expression:
#
# example = tf.train.Example(
#     features=tf.train.Features(
#         feature={
#             "favorite_book": tf.train.Feature(bytes_list=tf.train.BytesList(value=favorite_book)),
#             "hours": tf.train.Feature(float_list=tf.train.FloatList(value=[45.4, 45.8, 78.45, 78.0])),
#             "age": tf.train.Feature(int64_list=tf.train.Int64List(value=[32]))
#         }
#     )
# )

# Step 5: serialize the Example; this is the payload written per record.
serialized_example = example.SerializeToString()

# Step 6: write to file.
output_dir = "tfrecords_basic"
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(output_dir, exist_ok=True)

filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)

# Write the same serialized record three times so the file holds 3 records.
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for _ in range(3):
        writer.write(serialized_example)

value: "machine learning"
value: "deep learning"

value: 45.400001525878906
value: 45.79999923706055
value: 78.44999694824219
value: 78.0

value: 32



## 3. 读取tfrecords的API（步骤）
> 读取方法跟csv的差不多<br/>
1. `tf.data.TFRecordDataset` 读取文件名，返回一个dataset <br/>
2. 遍历 dataset，遍历的元素 使用`tf.io.parse_single_example`

In [12]:
# Read the TFRecord file back: build a dataset from the file name, then parse
# every serialized record against a feature spec.
dataset = tf.data.TFRecordDataset([filename_fullpath])

# Feature spec used for parsing; keys must match those used when writing.
expected_features = {
    # Variable-length value list; parsed into a SparseTensor.
    "favorite_book": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    # Fixed shape; [] means a single scalar value, parsed into a dense Tensor.
    "age": tf.io.FixedLenFeature([], dtype=tf.int64)
}

# Loop-local names are deliberately distinct from the writer cell's
# `serialized_example`, `example` and `favorite_book`, so running this cell
# does not silently rebind those notebook-level variables to tensors.
for serialized_record in dataset:
    parsed_example = tf.io.parse_single_example(serialized_record, expected_features)

    print(parsed_example)

    # VarLenFeature values are SparseTensors; densify them before reading.
    favorite_book_dense = tf.sparse.to_dense(parsed_example["favorite_book"], default_value=b"")
    print(favorite_book_dense.numpy())

    hours = tf.sparse.to_dense(parsed_example["hours"], default_value=0.0)
    print(hours.numpy())

    # FixedLenFeature values are already dense.
    age = parsed_example["age"]
    print(age.numpy())

{'favorite_book': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f5390093198>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f53900939e8>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=32>}
[b'machine learning' b'deep learning']
[45.4  45.8  78.45 78.  ]
32
{'favorite_book': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f53900931d0>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f539008dd30>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=32>}
[b'machine learning' b'deep learning']
[45.4  45.8  78.45 78.  ]
32
{'favorite_book': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f5390093198>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f539008d2b0>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=32>}
[b'machine learning' b'deep learning']
[45.4  45.8  78.45 78.  ]
32


## 4. 压缩tfrecord

### 4.1 制作压缩文件
>`options = tf.io.TFRecordOptions(compression_type="GZIP")`<br/>
`with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:`

写操作多了一个`options`参数，其它都一样

In [15]:
# Write the same record as before, this time to a GZIP-compressed TFRecord.
# The only difference from the uncompressed case is the `options` argument
# passed to the writer.
favorite_book = [title.encode("utf-8") for title in ("machine learning", "deep learning")]

# Map each feature name to its typed tf.train.Feature.
feature_map = {
    "favorite_book": tf.train.Feature(bytes_list=tf.train.BytesList(value=favorite_book)),
    "hours": tf.train.Feature(float_list=tf.train.FloatList(value=[45.4, 45.8, 78.45, 78.0])),
    "age": tf.train.Feature(int64_list=tf.train.Int64List(value=[32])),
}
example = tf.train.Example(features=tf.train.Features(feature=feature_map))
serialized_example = example.SerializeToString()

# NOTE(review): the suffix is ".zip" although the stream is GZIP-compressed;
# the name is kept unchanged here.
filename_fullpath_zip = filename_fullpath + ".zip"
options = tf.io.TFRecordOptions(compression_type="GZIP")

# Three identical records, mirroring the uncompressed file.
with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    for _ in range(3):
        writer.write(serialized_example)

### 4.2 读取压缩文件
>`tf.data.TFRecordDataset([filename_fullpath_zip], compression_type="GZIP")`<br/>
多了一个`compression_type="GZIP"` 参数，其它都一样

In [17]:
# Read the GZIP-compressed TFRecord. The only difference from the
# uncompressed case is the `compression_type` argument.
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip], compression_type="GZIP")

# Feature spec used for parsing; keys must match those used when writing.
expected_features = {
    # Variable-length value list; parsed into a SparseTensor.
    "favorite_book": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    # Fixed shape; [] means a single scalar value, parsed into a dense Tensor.
    "age": tf.io.FixedLenFeature([], dtype=tf.int64)
}

# Loop-local names are deliberately distinct from the writer cell's
# `serialized_example`, `example` and `favorite_book`, so running this cell
# does not silently rebind those notebook-level variables to tensors.
for serialized_record in dataset_zip:
    parsed_example = tf.io.parse_single_example(serialized_record, expected_features)

    print(parsed_example)

    # VarLenFeature values are SparseTensors; densify them before reading.
    favorite_book_dense = tf.sparse.to_dense(parsed_example["favorite_book"], default_value=b"")
    print(favorite_book_dense.numpy())

    hours = tf.sparse.to_dense(parsed_example["hours"], default_value=0.0)
    print(hours.numpy())

    # FixedLenFeature values are already dense.
    age = parsed_example["age"]
    print(age.numpy())

{'favorite_book': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f5334632c50>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f53346320f0>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=32>}
[b'machine learning' b'deep learning']
[45.4  45.8  78.45 78.  ]
32
{'favorite_book': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f5334632d30>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f5334632748>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=32>}
[b'machine learning' b'deep learning']
[45.4  45.8  78.45 78.  ]
32
{'favorite_book': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f5334632860>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f5334632eb8>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=32>}
[b'machine learning' b'deep learning']
[45.4  45.8  78.45 78.  ]
32
