In [16]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import (
    Flatten,
    Dense,
    Embedding,
    GlobalAveragePooling1D,
)
import numpy as np

In [3]:
# Load the IMDB reviews dataset as (text, label) pairs together with its
# metadata object, then pick out the two splits used below.
imdb, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

train_data = imdb['train']
test_data = imdb['test']

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/king/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Size...: 100%|██████████| 80/80 [00:19<00:00,  4.00 MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:19<00:00, 19.99s/ url]
                                                                        

Dataset imdb_reviews downloaded and prepared to /Users/king/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [7]:
# Materialise both splits into plain Python lists: one list of review texts
# and one list of labels per split.
train_sentences = []
train_labels = []
test_sentences = []
test_labels = []

# Each example is a (text, label) pair of scalar tensors. For the string
# tensor, .numpy() yields a `bytes` object, so it has to be decoded to get the
# actual review text. Calling str() on bytes would instead produce its repr
# ("b'...'" including the prefix, the quotes and backslash escape sequences),
# and that noise would end up in the tokenizer's vocabulary.
for text, label in train_data:
    train_sentences.append(text.numpy().decode('utf-8'))
    train_labels.append(label.numpy())

for text, label in test_data:
    test_sentences.append(text.numpy().decode('utf-8'))
    test_labels.append(label.numpy())

In [11]:
# Turn the per-example label lists into NumPy arrays before they are handed
# to model.fit() further down.
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [12]:
# Text-preprocessing / model hyperparameters.
vocab_size = 10000      # keep only the most frequent words when encoding
embedding_dim = 16      # size of each learned word vector
max_length = 120        # every review is padded/truncated to this many tokens
trunc_type = 'post'     # reviews longer than max_length lose tokens at the end
padding_type = 'post'   # reviews shorter than max_length get zeros at the end
oov_tok = '<OOV>'       # placeholder token for words outside the vocabulary

# The legacy Keras Tokenizer is used here; newer code would typically use a
# TextVectorization layer instead.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Fit the vocabulary on the training reviews only, so no information from the
# test split leaks into the word index.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# `padding` and `truncating` are separate options of pad_sequences: the former
# chooses where zeros are added, the latter which end of an over-long sequence
# is dropped. Previously trunc_type was passed as `padding=`, so truncation
# silently used the default ('pre') rather than the configured value.
sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences, maxlen=max_length,
                       padding=padding_type, truncating=trunc_type)

# Encode the test reviews with the tokenizer fitted on the training data.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length,
                            padding=padding_type, truncating=trunc_type)

In [18]:
# Binary sentiment classifier: token embeddings averaged over the sequence,
# followed by a small dense head with a sigmoid output.
model = tf.keras.Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=max_length))
model.add(GlobalAveragePooling1D())  # Flatten() can be used here instead
model.add(Dense(units=6, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Train on the padded training sequences, evaluating on the test split after
# every epoch.
model.fit(
    x=padded,
    y=train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x14b2ed700>

In [19]:
# Inspect the learned embeddings: take the first weight array of the model's
# first layer.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)  # expected to be (vocab_size, embedding_dim)

# Flip the tokenizer's word -> index mapping into index -> word so that an
# integer id can be turned back into its word.
reverse_word_index = {index: word for word, index in word_index.items()}

(10000, 16)


In [20]:
import io
# Write embeddings to disk so we can visualise them on
# https://projector.tensorflow.org/
#   vecs.tsv - one line per word, the embedding values separated by tabs
#   meta.tsv - one line per word, in the same order as vecs.tsv
# Opening both files in a `with` block guarantees they are flushed and closed
# even if the loop raises part-way through (e.g. a missing index lookup).
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    # Starts at 1, so row 0 of the weight matrix is never written out
    # (presumably because the tokenizer assigns no word to index 0 - confirm).
    for word_num in range(1, vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")