<a href="https://colab.research.google.com/github/gimquokka/ML/blob/master/TF_Tutorial_Text_classification_with_TF_hub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification with TF hub: Movie reviews

## Import essensial libraries

In [19]:
import numpy as np
import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds

print('Version: ', tf.__version__)
print('Eager mode: ', tf.executing_eagerly())
print('Hub version: ', hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.2.0
Eager mode:  True
Hub version:  0.8.0
GPU is available


## Download the IMDB detaset

In [0]:
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews",
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True)

## Explore the data

In [21]:
# It is much better than using list()
train_len = tf.data.experimental.cardinality(train_data).numpy()
validation_len = tf.data.experimental.cardinality(validation_data).numpy()

print('length of train_data: ', tf.data.experimental.cardinality(train_data).numpy())
print('length of validation_data: ', tf.data.experimental.cardinality(validation_data).numpy())
print('length of test_data: ', tf.data.experimental.cardinality(test_data).numpy())

print('training/validation ratio: ', train_len/(train_len+validation_len))

length of train_data:  15000
length of validation_data:  10000
length of test_data:  25000
training/validation ratio:  0.6


In [22]:
'''
# work but, It's not a good Idea... 
# Use too many resoure!

print(len(list(train_data)))
print(len(list(validation_data)))
print(len(list(test_data)))
'''

"\n# work but, It's not a good Idea... \n# Use too many resoure!\n\nprint(len(list(train_data)))\nprint(len(list(validation_data)))\nprint(len(list(test_data)))\n"

In [23]:
# Whoo... Take a long time...
# Convert tf dataset to numpy

train_comments = np.array([], dtype='bytes')
train_labels = np.array([])

for comments, labels in train_data.take(3):
  # print(type(comments.numpy()))
  train_comments = np.append(train_comments, comments.numpy())
  train_labels = np.append(train_labels, labels.numpy())

print(train_labels.shape)

(3,)


In [0]:
train_comments_batch, train_labels_batch = next(iter(train_data.batch(3)))

In [25]:
print('train_comments_batch \n', train_comments_batch.numpy())
print()
print('train_labels_batch \n', train_labels_batch.numpy())

train_comments_batch 
 [b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
 b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish.

## Build the model

In [26]:
embedding = 'https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1'

hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable = True, input_shape = [], name = 'words_embedding')
hub_layer(train_comments_batch)

<tf.Tensor: shape=(3, 128), dtype=float32, numpy=
array([[ 9.01966274e-01, -4.83913347e-03,  1.17907055e-01,
         3.81319046e-01,  6.57222793e-02, -3.01581532e-01,
         8.90584365e-02, -2.69034863e-01, -8.51345584e-02,
         1.08877886e-02, -6.66372627e-02, -3.73063087e-01,
        -2.76447266e-01, -1.87254980e-01,  5.67507632e-02,
         9.09779966e-02, -6.24961555e-02, -3.28687276e-03,
        -3.08512092e-01,  3.78482223e-01,  7.62880966e-02,
         1.43733576e-01, -1.12897493e-01,  9.59761534e-03,
        -2.38938913e-01,  2.93743908e-02,  7.28663057e-02,
        -2.48727947e-02, -8.16893280e-02,  6.68320432e-02,
        -5.62225394e-02,  2.47078985e-01,  1.17681175e-01,
         3.17581035e-02,  2.65932620e-01, -1.37706831e-01,
        -1.50708258e-01, -1.63614675e-01, -1.51269153e-01,
         2.34616160e-01, -9.12236273e-02, -4.22684886e-02,
        -1.01224177e-01, -2.12229744e-01,  6.74503446e-02,
         1.85163647e-01,  3.62982228e-02, -3.50210071e-01,
      

In [27]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(10, activation='relu', name ='first_dense_layer'))
model.add(tf.keras.layers.Dense(1))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
words_embedding (KerasLayer) (None, 128)               124642688 
_________________________________________________________________
first_dense_layer (Dense)    (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 124,643,989
Trainable params: 124,643,989
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [0]:
# Visualize result with Tensorboard
%load_ext tensorboard

import datetime
!rm -rf ./logs/
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


history = model.fit(train_data.shuffle(10000).batch(500),
                    epochs=5,
                    validation_data=validation_data.batch(500),
                    callbacks=[tensorboard_callback],
                    verbose=1)
%tensorboard --logdir logs/fit

In [31]:
results = model.evaluate(test_data.batch(300), verbose=1)

for name, val in zip(model.metri, results):
  print('%s: %.3f' % (name, val))


loss: 0.398
accuracy: 0.853


# Test Zone

In [32]:
tf.config.experimental.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]