<a href="https://colab.research.google.com/github/chaudharynidhi/Tensorflow_DataPreProcessing/blob/master/chapter_13_Q9%2C_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [None]:
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
X_train, y_train = X_train_full[5000:], y_train_full[5000:]
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]

In [None]:
train_shuffled = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train))
valid = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
test = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [None]:
def create_example(image, label):
  image = tf.io.serialize_tensor(image)

  return Example(features = Features(
      feature={
          "image": Feature(bytes_list = BytesList(value=[image.numpy()])),
          "label": Feature(int64_list=Int64List(value=[label])),
      }
  ))

In [None]:
Example = tf.train.Example
Features = tf.train.Features
Feature = tf.train.Feature
BytesList = tf.train.BytesList
Int64List = tf.train.Int64List

In [None]:
for image, label in train_shuffled.take(1):
  print(create_example(image, label))

features {
  feature {
    key: "image"
    value {
      bytes_list {
        value: "\010\004\022\010\022\002\010\034\022\002\010\034\"\220\006\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\00

In [None]:
from contextlib import ExitStack

def write_tfrecords(dataset, name, n_shards = 10):
  paths = ["{}.tfrecord-{:05d}-of-{:05d}".format(name, index, n_shards) for index in range(n_shards)]

  with ExitStack() as stack:
    writers = [stack.enter_context(tf.io.TFRecordWriter(path)) for path in paths]
    for index, (image,label) in dataset.enumerate():
      shard = index % n_shards
      example = create_example(image, label)
      writers[shard].write(example.SerializeToString())

  return paths

In [None]:
train_filepaths = write_tfrecords(train_shuffled, "my_train_tf_record")
valid_filepaths = write_tfrecords(valid, "my_valid_tf_record")
test_filepaths = write_tfrecords(test, "my_test_tf_record")

In [None]:
def preprocess(tfrecord):
  feature_descriptions = {
      "image": tf.io.FixedLenFeature([], tf.string, default_value=""),
      "label": tf.io.FixedLenFeature([], tf.int64, default_value = -1)
  }
  example = tf.io.parse_single_example(tfrecord, feature_descriptions)
  image = tf.io.parse_tensor(example["image"], out_type=tf.uint8)
  image = tf.reshape(image, shape=[28,28])
  return image, example['label']

def mnist_dataset(filepaths, n_read_threads=5, shuffle_buffer_size = None, n_parse_threads = 5, batch_size=32, cache=True):
  dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=n_read_threads)
  if cache:
    dataset = dataset.cache()
  if shuffle_buffer_size:
    dataset = dataset.shuffle(shuffle_buffer_size)
  dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
  dataset = dataset.batch(batch_size)
  return dataset.prefetch(1)

In [None]:
train_set = mnist_dataset(train_filepaths, shuffle_buffer_size=60000)
valid_set = mnist_dataset(valid_filepaths)
test_set = mnist_dataset(test_filepaths)

In [None]:
class Standardization(keras.layers.Layer):
  def adapt(self,data_sample):
    self.mean_ = np.mean(data_sample, axis=0, keepdims=True)
    self.variance_ = np.std(data_sample, axis=0, keepdims=True)
  def call(self, inputs):
    return (inputs-self.mean_)/(self.variance_+keras.backend.epsilon())

In [None]:
standardization = Standardization(input_shape=[28,28])

In [None]:
sample_image_train = train_set.take(1000).map(lambda image, label: image)
sample_images = np.concatenate(list(sample_image_train.as_numpy_iterator()), axis=0).astype(np.float32)

standardization.adapt(sample_images)

model = keras.Sequential([
                          standardization,
                          keras.layers.Flatten(),
                          keras.layers.Dense(100, activation='relu'),
                          keras.layers.Dense(10, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer = 'nadam', metrics = ['accuracy'])

In [None]:
model.fit(train_set, epochs=10, validation_data=valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0be52036d0>

In [3]:
from pathlib import Path

In [4]:
DOWNLOAD_ROOT = 'http://ai.stanford.edu/~amaas/data/sentiment/'
FILENAME = 'aclImdb_v1.tar.gz'
filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)
filepath

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


'/root/.keras/datasets/aclImdb_v1.tar.gz'

In [5]:
path = Path(filepath).parent / 'aclImdb'
path

PosixPath('/root/.keras/datasets/aclImdb')

In [6]:
def review_paths(dirpath):
  return [str(path) for path in dirpath.glob('*.txt')]

train_pos = review_paths(path / 'train' / 'pos')
train_neg = review_paths(path / 'train' / 'neg')
test_pos = review_paths(path / 'test' / 'pos')
test_neg = review_paths(path / 'test'/ 'neg')

In [7]:
test_pos = test_pos[:5000]
test_neg = test_neg[:5000]
valid_neg = test_neg[5000:]
valid_pos = test_pos[5000:]

In [8]:
def imdb_dataset(filepath_positive, filepath_negative, n_parallel_reads = 5):
  pos_data = tf.data.TextLineDataset(filepath_positive, num_parallel_reads=n_parallel_reads)
  neg_data = tf.data.TextLineDataset(filepath_negative, num_parallel_reads=n_parallel_reads)
  pos_data = pos_data.map(lambda review: (review, 1))
  neg_data = neg_data.map(lambda review: (review,0))

  return tf.data.Dataset.concatenate(pos_data, neg_data)

In [9]:
batch_size = 32

train_data = imdb_dataset(train_pos, train_neg).shuffle(25000).batch(batch_size).prefetch(1)
valid_data = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)
test_data = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)

In [10]:
def preprocess(dataset, n_words = 50):
  shape = tf.shape(dataset)*tf.constant([1,0]) + tf.constant(n_words)*tf.constant([0,1])

  Z = tf.strings.substr(dataset, 0, 300)
  Z = tf.strings.lower(Z)
  Z = tf.strings.regex_replace(Z, b'<br\\s*/?>', b' ')
  Z = tf.strings.regex_replace(Z, b'[^a-z]', b' ')
  Z = tf.strings.split(Z)

  return Z.to_tensor(shape = shape, default_value = b'none')

In [11]:
from collections import Counter

def get_vocabulary(data_sample, max_size = 1000):
  preprocessed_reviews = preprocess(data_sample).numpy()
  counter = Counter()
  for words in preprocessed_reviews:
    for word in words:
      if word != b'none':
        counter[word]+=1
  return [b'none'] + [word for word , count in counter.most_common(max_size)]

In [12]:
class TextVectorization(keras.layers.Layer):
  def __init__(self, max_vocabulary_size=1000, n_oov_buckets=100, dtype=tf.string, **kwargs):
    super().__init__(dtype=dtype, **kwargs)
    self.max_vocabulary_size=max_vocabulary_size
    self.n_oov_buckets = n_oov_buckets
  
  def adapt(self, data_sample):
    print(1)
    self.vocab = get_vocabulary(data_sample, self.max_vocabulary_size)
    print(2)
    words = tf.constant(self.vocab)
    print(3)
    word_ids = tf.range(len(self.vocab), dtype=tf.int64)
    print(4)
    vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
    print(5)
    self.table = tf.lookup.StaticVocabularyTable(vocab_init, self.n_oov_buckets)
  
  def call(self, inputs):
    preprocessed_data = preprocess(inputs)
    return self.table.lookup(preprocessed_data)

In [13]:
max_vocabulary_size = 1000
n_oov_buckets = 100

sample_review_batches = train_data.map(lambda review, label: review)
#print(list(sample_review_batches.as_numpy_iterator())[:20])
sample_review = np.concatenate(list(sample_review_batches.as_numpy_iterator()), axis=0)
#print(sample_review)
text_vectorization = TextVectorization(max_vocabulary_size, n_oov_buckets, input_shape=[])
text_vectorization.adapt(sample_review)

1
2
3
4
5


In [14]:
class BagofWords(keras.layers.Layer):
  def __init__(self, n_tokens, dtype=tf.int32, **kwargs):
    super().__init__(dtype=dtype, **kwargs)
    self.n_tokens = n_tokens
  
  def call(self,inputs):
    one_hot = tf.one_hot(inputs, self.n_tokens)
    return tf.reduce_sum(one_hot,  axis=1)[:, 1:]

In [15]:
n_tokens = max_vocabulary_size+n_oov_buckets+1
bag_words = BagofWords(n_tokens)

In [16]:
model = keras.models.Sequential([
    text_vectorization,
    bag_words,
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit(train_data, epochs=5, validation_data=valid_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd2a8c54f10>

In [17]:
def compute_mean_embedding(inputs):
  not_pad = tf.math.count_nonzero(inputs, axis=-1)
  n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True)
  sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))
  return tf.reduce_mean(inputs, axis=1) * sqrt_n_words

In [18]:
embedding_size = 20

model = keras.models.Sequential([
                                 text_vectorization,
                                 keras.layers.Embedding(input_dim=n_tokens,
                                                        output_dim=embedding_size,
                                                        mask_zero=True),
                                 keras.layers.Lambda(compute_mean_embedding),
                                 keras.layers.Dense(100, activation='relu'),
                                 keras.layers.Dense(1, activation='sigmoid'),
])

In [23]:
model.compile(loss='binary_crossentropy', optimizer='Nadam', metrics=['accuracy'])
model.fit(train_data, epochs=5, validation_data=valid_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd2a08cbd10>

In [21]:
import tensorflow_datasets  as tfds

datasets = tfds.load('imdb_reviews')
train_set, test_set = datasets['train'], datasets['test']

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteU76I18/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteU76I18/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteU76I18/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [22]:
for example in train_set.take(1):
  print(example['text'])
  print(example['label'])

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int64)
