In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the TensorFlow version and the interpreter version first,
# then the name and version of every library this notebook relies on.
print(tf.__version__)
print(sys.version_info)
libraries = (mpl, np, pd, sklearn, tf, keras)
for library in libraries:
    print(library.__name__, library.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=7, releaselevel='final', serial=0)
matplotlib 3.2.2
numpy 1.19.0
pandas 1.0.5
sklearn 0.23.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [6]:
# TFRecord file format
# -> tf.train.Example
#    -> tf.train.Features -> {"key": tf.train.Feature}
#       -> tf.train.Feature -> tf.train.BytesList/FloatList/Int64List

# Bytes feature: every title is UTF-8 encoded before being put in the BytesList.
favorite_books = [title.encode('utf-8') for title in ("machine learning", "cc150")]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

# Float feature: a mix of int and float literals is accepted.
hours_floatlist = tf.train.FloatList(value=[20, 2, 3.4, 5.5, 8, 8])
print(hours_floatlist)

# Integer feature holding a single value.
age_int64list = tf.train.Int64List(value=[32])
print(age_int64list)

# Wrap each typed list in a tf.train.Feature and collect them under string keys.
feature_map = {
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
}
features = tf.train.Features(feature=feature_map)

print(features)

value: "machine learning"
value: "cc150"

value: 20.0
value: 2.0
value: 3.4000000953674316
value: 5.5
value: 8.0
value: 8.0

value: 32

feature {
  key: "age"
  value {
    int64_list {
      value: 32
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 20.0
      value: 2.0
      value: 3.4000000953674316
      value: 5.5
      value: 8.0
      value: 8.0
    }
  }
}



In [8]:
# Wrap the Features message in a tf.train.Example and show its text form.
example = tf.train.Example(features=features)
print(example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 32
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 20.0
        value: 2.0
        value: 3.4000000953674316
        value: 5.5
        value: 8.0
        value: 8.0
      }
    }
  }
}



In [9]:
# Serialize the Example message to a bytes string; this is the payload
# that gets written to the TFRecord files below.
serialized_example = example.SerializeToString()
print(serialized_example)

b'\nd\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01 \n%\n\x05hours\x12\x1c\x12\x1a\n\x18\x00\x00\xa0A\x00\x00\x00@\x9a\x99Y@\x00\x00\xb0@\x00\x00\x00A\x00\x00\x00A'


In [10]:
# Generate a TFRecord file containing three copies of the serialized Example.
output_dir = 'tfrecord_basci'
# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
filename = "test.tfrecord"
filename_fullpath = os.path.join(output_dir, filename)
# The context manager closes the writer when the block exits.
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [11]:
# Read the TFRecord file back. Each dataset element is printed as-is,
# i.e. as a tensor that still holds the serialized (unparsed) record.
tfrecord_paths = [filename_fullpath]
dataset = tf.data.TFRecordDataset(tfrecord_paths)
for raw_record in dataset:
    print(raw_record)

tf.Tensor(b'\nd\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01 \n%\n\x05hours\x12\x1c\x12\x1a\n\x18\x00\x00\xa0A\x00\x00\x00@\x9a\x99Y@\x00\x00\xb0@\x00\x00\x00A\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\nd\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01 \n%\n\x05hours\x12\x1c\x12\x1a\n\x18\x00\x00\xa0A\x00\x00\x00@\x9a\x99Y@\x00\x00\xb0@\x00\x00\x00A\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\nd\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01 \n%\n\x05hours\x12\x1c\x12\x1a\n\x18\x00\x00\xa0A\x00\x00\x00@\x9a\x99Y@\x00\x00\xb0@\x00\x00\x00A\x00\x00\x00A', shape=(), dtype=string)


In [24]:
# Parse the TFRecord file: declare the expected type of every feature,
# then decode each serialized record and print its book titles.
expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    # Use a dedicated name for the parsed result so the tf.train.Example
    # bound to `example` earlier in the notebook is not overwritten;
    # otherwise re-running the SerializeToString() cell afterwards fails.
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor, expected_features
    )
    # The variable-length feature comes back sparse; convert it to a dense
    # tensor (padding with empty bytes) before iterating.
    books = tf.sparse.to_dense(parsed_example["favorite_books"], default_value=b"")
    for book in books:
        print(book.numpy())

b'machine learning'
b'cc150'
b'machine learning'
b'cc150'
b'machine learning'
b'cc150'


In [28]:
# Generate a compressed TFRecord file with three copies of the serialized Example.
# NOTE: the ".zip" suffix is only part of the file name; the compression
# actually requested below is GZIP.
filename_fullpath_zip = f"{filename_fullpath}.zip"
option = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(filename_fullpath_zip, options=option) as writer:
    for i in range(3):
        writer.write(serialized_example)


In [29]:
# Read the compressed TFRecord file: the dataset must be told the
# compression type ('GZIP') that was used when the file was written.
expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}
dataset = tf.data.TFRecordDataset([filename_fullpath_zip], compression_type='GZIP')
for serialized_example_tensor in dataset:
    # Use a dedicated name for the parsed result so the tf.train.Example
    # bound to `example` earlier in the notebook is not overwritten;
    # otherwise re-running the SerializeToString() cell afterwards fails.
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor, expected_features
    )
    # The variable-length feature comes back sparse; convert it to a dense
    # tensor (padding with empty bytes) before iterating.
    books = tf.sparse.to_dense(parsed_example["favorite_books"], default_value=b"")
    for book in books:
        print(book.numpy())

b'machine learning'
b'cc150'
b'machine learning'
b'cc150'
b'machine learning'
b'cc150'
