In [34]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

* Hiển thị danh sách các data build-in có sẵn từ tensorflow datasets.

In [4]:
# ", ".join(tfds.list_builders())

In [5]:
imdb_train, ds_info = tfds.load(name="imdb_reviews", split='train',
                                with_info=True, as_supervised=True)

imdb_test = tfds.load(name="imdb_reviews", split='test', as_supervised=True)

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/manhcuong/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Size...: 100%|██████████| 80/80 [00:29<00:00,  2.70 MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:29<00:00, 29.65s/ url]


[1mDataset imdb_reviews downloaded and prepared to /home/manhcuong/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [6]:
print(ds_info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='/home/manhcuong/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <Spl

In [7]:
for example, label in imdb_train.take(1):
    print(f"{label} - {example}")

0 - b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."


# Normalization and vectorization

In [10]:
tokenizer = tfds.deprecated.text.Tokenizer()
vocabulary_set = set()
MAX_TOKENS = 0

for example, label in imdb_train:
    some_tokens = tokenizer.tokenize(example.numpy())
    
    if MAX_TOKENS < len(some_tokens):
        MAX_TOKENS = len(some_tokens)
        
    vocabulary_set.update(some_tokens)

In [26]:
imdb_encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set, tokenizer=tokenizer)

In [27]:
imdb_encoder

<TokenTextEncoder vocab_size=93931>

In [29]:
for example, label in imdb_train.take(1):
    print(example)
    encoded = imdb_encoder.encode(example.numpy())
    print()
    print(imdb_encoder.decode(encoded))

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)

This was an absolutely terrible movie Don t be lured in by Christopher Walken or Michael Ironside Both are great actors but this must simply be their worst role in history Even their great acting could not redeem this movie s ridiculous storyline This

In [28]:
# imdb_encoder.save_to_file("./data/reviews_vocab")
# enc = tfds.deprecated.text.TokenTextEncoder.load_from_file("./data/reviews_vocab")
# enc.decode(enc.encode("Good case. Excellent value."))

# output: Good case Excellent value

93929

In [40]:
from tensorflow.keras.preprocessing import sequence

def encode_pad_transform(sample):
    encoded = imdb_encoder.encode(sample.numpy())
    pad = sequence.pad_sequences([encoded], padding='post', maxlen=150)
    
    return np.array(pad[0], dtype=np.int64)


def encode_tf_fn(sample, label):
    encoded = tf.py_function(encode_pad_transform, inp=[sample], Tout=(tf.int64))
    encoded.set_shape([None])
    
    label.set_shape([])
    return encoded, label

In [41]:
subset = imdb_train.take(10)
tst = subset.map(encode_tf_fn)

for review, label in tst.take(1):
    print(f"{label} - {review}")
    print()
    print(imdb_encoder.decode(review))

0 - [38962 91890 53999 42581 18019  5137 16516  5567 52044 15060 36587  8118
 53284 12687 27765 89131 83077 17859 67977 59931 18583  5982 21194 10065
 38003 52044 12410 25670 23011 36587 37703  3349 12410 59931 89358 12156
  7466  1897 21194  5137 53617  3123   366 38962  5137  3152 53999 91064
 25594 57285  3332 24513 66448 20400 16689  4393 70804 38257 17738 73835
 19013 46912 70804 72398 12410 80208 25430 93867 93702 88918 30597 15103
 69547 76160 52809 87773 91075 12922 41603 12687 91890 56352  5982 27780
 16689 64748 12303 36587 27780  5137 93514 91890 66894 80017 22677 33794
   657  1331 73140 35793 93514 17192 67977 44687 13486 21194  4666 62588
 53617 13486 53284 12687 53617  9092 93346  1331 12156 75541 41886 30749
 53455     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0]

This was an absolutely terrible movie Don t be lured in by Christopher Walken or 

In [44]:
encoded_train = imdb_train.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
encode_test = imdb_test.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [45]:
def build_model_lstm(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True, batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    return model

In [46]:
vocab_size = imdb_encoder.vocab_size
embedding_dim = 64
rnn_units = 64
BATCH_SIZE = 100

model = build_model_lstm(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (100, None, 64)           6011584   
_________________________________________________________________
lstm (LSTM)                  (100, 64)                 33024     
_________________________________________________________________
dense (Dense)                (100, 1)                  65        
Total params: 6,044,673
Trainable params: 6,044,673
Non-trainable params: 0
_________________________________________________________________


In [47]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 'Precision', 'Recall'])

encoded_train_batched = encoded_train.batch(BATCH_SIZE)
model.fit(encoded_train_batched, epochs=10)

Epoch 1/10
