<a href="https://colab.research.google.com/github/chaudharynidhi/Tensorflow_DataPreProcessing/blob/master/chapter_13_Q9%2C_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [None]:
# Load the Fashion MNIST train and test splits through the Keras datasets helper.
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
# Hold out the first 5,000 samples of the full training split for validation;
# everything after them is used for training.
X_train, y_train = X_train_full[5000:], y_train_full[5000:]
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]

In [None]:
# Wrap each split in a tf.data.Dataset of (image, label) pairs.
# Only the training split is shuffled, with a buffer as large as the training set.
train_shuffled = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train))
valid = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
test = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [None]:
def create_example(image, label):
  """Pack one (image, label) pair into an Example protobuf.

  The image tensor is serialized with tf.io.serialize_tensor and stored as a
  single bytes value under the "image" key; the label is stored as a single
  int64 value under the "label" key.

  Args:
    image: tensor holding the image to store.
    label: class label to store alongside the image.

  Returns:
    An Example whose features are "image" (bytes) and "label" (int64).
  """
  serialized_image = tf.io.serialize_tensor(image)
  # .numpy() pulls the raw serialized bytes out of the string tensor.
  image_feature = Feature(bytes_list=BytesList(value=[serialized_image.numpy()]))
  label_feature = Feature(int64_list=Int64List(value=[label]))
  feature_map = {
      "image": image_feature,
      "label": label_feature,
  }
  return Example(features=Features(feature=feature_map))

In [None]:
# Short aliases for the tf.train protobuf classes; create_example refers to
# these names, so this cell must run before create_example is called.
Example = tf.train.Example
Features = tf.train.Features
Feature = tf.train.Feature
BytesList = tf.train.BytesList
Int64List = tf.train.Int64List

In [None]:
# Sanity check: build and print the Example for a single training sample.
for image, label in train_shuffled.take(1):
  print(create_example(image, label))

features {
  feature {
    key: "image"
    value {
      bytes_list {
        value: "\010\004\022\010\022\002\010\034\022\002\010\034\"\220\006\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\00

In [None]:
from contextlib import ExitStack

def write_tfrecords(dataset, name, n_shards = 10):
  """Write a dataset of (image, label) pairs to sharded TFRecord files.

  Elements are distributed round-robin: the element with enumeration index i
  is written to shard i % n_shards.

  Args:
    dataset: dataset yielding (image, label) pairs; it is iterated once via
      dataset.enumerate().
    name: prefix of the output files. Each file is named
      "<name>.tfrecord-<index>-of-<n_shards>", with both numbers zero-padded
      to five digits.
    n_shards: number of output files; must be at least 1.

  Returns:
    The list of shard file paths, in shard order.

  Raises:
    ValueError: if n_shards is less than 1.
  """
  # Fail fast with a clear message: with no shards there would be no writers,
  # and the modulo below would fail on the first element instead.
  if n_shards < 1:
    raise ValueError("n_shards must be at least 1, got {}".format(n_shards))

  paths = ["{}.tfrecord-{:05d}-of-{:05d}".format(name, index, n_shards) for index in range(n_shards)]

  # ExitStack keeps all writers open at once and closes every one of them when
  # the block exits, including when writing fails part-way through.
  with ExitStack() as stack:
    writers = [stack.enter_context(tf.io.TFRecordWriter(path)) for path in paths]
    for index, (image,label) in dataset.enumerate():
      shard = index % n_shards
      example = create_example(image, label)
      writers[shard].write(example.SerializeToString())

  return paths

In [None]:
# Write each split to its own set of TFRecord shards (write_tfrecords' default
# shard count) and keep the returned file paths for the input pipelines.
train_filepaths = write_tfrecords(train_shuffled, "my_train_tf_record")
valid_filepaths = write_tfrecords(valid, "my_valid_tf_record")
test_filepaths = write_tfrecords(test, "my_test_tf_record")

In [None]:
def preprocess(tfrecord):
  """Decode one serialized Example into an (image, label) pair.

  Args:
    tfrecord: a serialized Example with a bytes "image" feature and an
      int64 "label" feature.

  Returns:
    A tuple (image, label): the image is parsed as a uint8 tensor and reshaped
    to [28, 28]; the label is the parsed int64 value.
  """
  # Both features are scalars; a missing image defaults to "" and a missing
  # label defaults to -1.
  schema = {
      "image": tf.io.FixedLenFeature([], tf.string, default_value=""),
      "label": tf.io.FixedLenFeature([], tf.int64, default_value=-1),
  }
  parsed = tf.io.parse_single_example(tfrecord, schema)
  pixels = tf.io.parse_tensor(parsed["image"], out_type=tf.uint8)
  # Reshaping pins the static shape so downstream layers see 28x28 images.
  return tf.reshape(pixels, shape=[28, 28]), parsed['label']

def mnist_dataset(filepaths, n_read_threads=5, shuffle_buffer_size=None, n_parse_threads=5, batch_size=32, cache=True):
  """Build a batched input pipeline over TFRecord files.

  Pipeline order: read -> (cache) -> (shuffle) -> parse -> batch -> prefetch.

  Args:
    filepaths: TFRecord file paths to read.
    n_read_threads: value passed as num_parallel_reads to the reader.
    shuffle_buffer_size: shuffle buffer size; shuffling is skipped when this
      is None or 0.
    n_parse_threads: value passed as num_parallel_calls when mapping preprocess.
    batch_size: number of parsed samples per batch.
    cache: whether to cache the raw records right after reading.

  Returns:
    A dataset of (image, label) batches that prefetches one batch.
  """
  records = tf.data.TFRecordDataset(filepaths, num_parallel_reads=n_read_threads)
  # Caching happens before shuffling and parsing, so it stores the raw
  # serialized records.
  records = records.cache() if cache else records
  if shuffle_buffer_size:
    records = records.shuffle(shuffle_buffer_size)
  batches = records.map(preprocess, num_parallel_calls=n_parse_threads).batch(batch_size)
  return batches.prefetch(1)

In [None]:
# Build the batched input pipelines from the TFRecord shards.
# Only the training pipeline shuffles; validation and test keep file order.
train_set = mnist_dataset(train_filepaths, shuffle_buffer_size=60000)
valid_set = mnist_dataset(valid_filepaths)
test_set = mnist_dataset(test_filepaths)

In [None]:
class Standardization(keras.layers.Layer):
  """Keras layer that standardizes inputs with statistics fitted by adapt().

  adapt() must be called before the layer is used, because call() reads the
  attributes that adapt() sets.
  """

  def adapt(self, data_sample):
    """Compute per-position statistics over the first axis of data_sample.

    Sets:
      mean_: mean over axis 0, with that axis kept as size 1.
      variance_: standard deviation over axis 0, with that axis kept as size 1.
    """
    reduce_kwargs = dict(axis=0, keepdims=True)
    self.mean_ = np.mean(data_sample, **reduce_kwargs)
    # NOTE: despite its name, this attribute stores the standard deviation
    # (np.std), not the variance; the name is kept so existing readers of the
    # attribute keep working.
    self.variance_ = np.std(data_sample, **reduce_kwargs)

  def call(self, inputs):
    """Return (inputs - mean_) / (variance_ + epsilon)."""
    centered = inputs - self.mean_
    # epsilon guards against division by zero where the spread is 0.
    scale = self.variance_ + keras.backend.epsilon()
    return centered / scale

In [None]:
# Standardization defines no __init__, so input_shape is handled by the base
# Keras Layer constructor.
standardization = Standardization(input_shape=[28,28])

In [None]:
# train_set is already batched, so take(1000) takes 1000 batches (not 1000
# images); the labels are dropped and only the images are kept.
sample_image_train = train_set.take(1000).map(lambda image, label: image)
# Stitch the batches back into one array along the batch axis, as float32.
sample_images = np.concatenate(list(sample_image_train.as_numpy_iterator()), axis=0).astype(np.float32)

# Fit the standardization statistics on the sample before the layer is used.
standardization.adapt(sample_images)

# Standardize -> flatten -> 100-unit ReLU hidden layer -> 10-way softmax output.
model = keras.Sequential([
                          standardization,
                          keras.layers.Flatten(),
                          keras.layers.Dense(100, activation='relu'),
                          keras.layers.Dense(10, activation='softmax')
])

# Sparse loss: the labels are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer = 'nadam', metrics = ['accuracy'])

In [None]:
# Train for 10 epochs on the TFRecord-backed pipeline, using valid_set as validation data.
model.fit(train_set, epochs=10, validation_data=valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0be52036d0>