In [1]:
import matplotlib as mpl #画图用的库
import matplotlib.pyplot as plt
#下面这一句是为了可以在notebook中画图
%matplotlib inline
import numpy as np
import sklearn   #机器学习算法库
import pandas as pd #处理数据的库   
import os
import sys
import time
import tensorflow as tf
 
from tensorflow import keras   #使用tensorflow中的keras
#import keras #单纯的使用keras
 
# Record the environment this notebook was executed with: TensorFlow first,
# then the interpreter, then one "name version" line per imported library.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, sklearn, pd, tf, keras):
    print("{} {}".format(lib.__name__, lib.__version__))

2.0.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.0
sklearn 0.21.3
pandas 0.25.3
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
# TFRecord is a file format defined by TensorFlow itself.
# The original notes describe it as Protocol-Buffer-encoded binary data that is
# simple and fast to load, well suited to large training sets, and splittable
# across several TFRecord files when the data set is big.
#
# Structure used in this notebook:
#   * each record is a tf.train.Example
#   * an Example carries a tf.train.Features message, which behaves like a
#     dict {"key": tf.train.Feature}
#   * every tf.train.Feature wraps exactly one typed list:
#     tf.train.BytesList, tf.train.FloatList or tf.train.Int64List

# Book titles must be bytes for a BytesList, so encode them as UTF-8 first.
favorite_books = [title.encode("UTF-8") for title in ("machine learning", "cc150")]

# One typed list per feature.
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
hours_floatList = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
age_int64list = tf.train.Int64List(value=[27])

# Show each typed list in its protobuf text form.
for typed_list in (favorite_books_bytelist, hours_floatList, age_int64list):
    print(typed_list)

# Combine the three typed lists into one Features message keyed by
# "favorite_books", "hours" and "age".
features = tf.train.Features(
    feature=dict(
        favorite_books=tf.train.Feature(bytes_list=favorite_books_bytelist),
        hours=tf.train.Feature(float_list=hours_floatList),
        age=tf.train.Feature(int64_list=age_int64list),
    )
)

print(features)

value: "machine learning"
value: "cc150"

value: 15.5
value: 9.5
value: 7.0
value: 8.0

value: 27

feature {
  key: "age"
  value {
    int64_list {
      value: 27
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}



In [3]:
# Wrap the Features message in a tf.train.Example, the record type stored in
# TFRecord files.
example = tf.train.Example(features=features)

# SerializeToString() turns the message into a bytes object (printed below).
# NOTE: this step only serializes; it does not compress. Compression is a
# separate, optional setting on the TFRecord writer (tf.io.TFRecordOptions).
serialized_example = example.SerializeToString()

# Print the readable text form first, then the serialized bytes.
for shown in (example, serialized_example):
    print(shown)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 27
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x1b'


In [4]:
# Write the serialized Example to a concrete TFRecord file, then read it back.

output_dir = "tfrecord_basic"
# makedirs(exist_ok=True) creates the directory when it is missing and is a
# no-op when it already exists, avoiding the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs(output_dir, exist_ok=True)
filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):  # store the same serialized Example three times
        writer.write(serialized_example)
# After this cell runs, tfrecord_basic/test.tfrecords exists on disk.

# Read the file back with tf.data. Each dataset element is printed as a
# scalar string tensor holding one serialized Example.
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)

tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x1b', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x1b', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x1b', shape=(), dtype=string)


In [5]:
# Parse the serialized records back into per-feature tensors.

# Parsing schema: "favorite_books" and "hours" are declared variable-length,
# "age" is declared as a fixed-length scalar int64.
expected_features = dict(
    favorite_books=tf.io.VarLenFeature(dtype=tf.string),
    hours=tf.io.VarLenFeature(dtype=tf.float32),
    age=tf.io.FixedLenFeature([], dtype=tf.int64),
)

# Re-open the TFRecord file with tf.data and decode one record at a time.
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    # The printed dict shows "favorite_books" and "hours" as SparseTensor
    # objects and "age" as a scalar int64 tensor.
    print(example)

    # Densify the sparse string feature, then decode each title from UTF-8.
    books_sparse = example["favorite_books"]
    books = tf.sparse.to_dense(books_sparse, default_value=b"")
    for book in books:
        title_bytes = book.numpy()
        print(title_bytes.decode("UTF-8"))

    # "hours" is intentionally left sparse and is not printed here. It can be
    # densified the same way with
    #     tf.sparse.to_dense(example["hours"], default_value=0.0)
    # (an earlier draft passed the undefined name `float32` as default_value,
    # which raises NameError).

    age_tensor = example["age"]
    print(age_tensor.numpy())

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29db014d68>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29db014a90>, 'age': <tf.Tensor: id=46, shape=(), dtype=int64, numpy=27>}
machine learning
cc150
27
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d0131588>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d013a5c0>, 'age': <tf.Tensor: id=65, shape=(), dtype=int64, numpy=27>}
machine learning
cc150
27
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29db014eb8>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d01316a0>, 'age': <tf.Tensor: id=84, shape=(), dtype=int64, numpy=27>}
machine learning
cc150
27


In [6]:
# Store the same records in a compressed TFRecord file and read them back.

# NOTE(review): the path ends in ".zip" but the data is written with GZIP
# compression, so the file is a gzip stream rather than a ZIP archive. The
# name is kept unchanged here so the file produced on disk stays the same.
filename_fullpath_zip = "".join([filename_fullpath, ".zip"])
# Writer options selecting GZIP as the TFRecord compression type.
options = tf.io.TFRecordOptions(compression_type="GZIP")

with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    for i in range(3):  # store the same serialized Example three times
        writer.write(serialized_example)
# tfrecord_basic/test.tfrecords.zip now exists; the original author reports it
# is roughly half the size of the uncompressed file.


# Reading requires the same compression type that was used for writing.
dataset_zip = tf.data.TFRecordDataset(
    [filename_fullpath_zip],
    compression_type="GZIP",
)
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    # The printed dict shows "favorite_books" and "hours" as SparseTensor
    # objects and "age" as a scalar int64 tensor.
    print(example)

    # Densify the sparse string feature, then decode each title from UTF-8.
    books_sparse = example["favorite_books"]
    books = tf.sparse.to_dense(books_sparse, default_value=b"")
    for book in books:
        title_bytes = book.numpy()
        print(title_bytes.decode("UTF-8"))

    # "hours" is intentionally left sparse and is not printed here. It can be
    # densified the same way with
    #     tf.sparse.to_dense(example["hours"], default_value=0.0)
    # (an earlier draft passed the undefined name `float32` as default_value,
    # which raises NameError).

    age_tensor = example["age"]
    print(age_tensor.numpy())



{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d01444a8>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d0144710>, 'age': <tf.Tensor: id=120, shape=(), dtype=int64, numpy=27>}
machine learning
cc150
27
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29db045390>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d013a5c0>, 'age': <tf.Tensor: id=139, shape=(), dtype=int64, numpy=27>}
machine learning
cc150
27
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d0144710>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f29d0144390>, 'age': <tf.Tensor: id=158, shape=(), dtype=int64, numpy=27>}
machine learning
cc150
27
