In [1]:
import io

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds

print(tf.__version__)

2.0.0


In [2]:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [3]:
train_data, test_data = imdb['train'], imdb['test']
training_sentences = []
training_labels = []
testing_sentences = []
testing_labels = []

for sent, lab in train_data:
    training_sentences.append(str(sent.numpy()))
    training_labels.append(lab.numpy())
for sent, lab in test_data:
    testing_sentences.append(str(sent.numpy()))
    testing_labels.append(lab.numpy())

In [4]:
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [5]:
VOCAB_SIZE = 10000
EMBEDDING_DIM = 16
MAX_LEN = 120
TRUNC_TYPE = 'post'
OOV_TOK = '<OOV>'

In [6]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=MAX_LEN, truncating=TRUNC_TYPE)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=MAX_LEN)

In [7]:
reverse_word_index = {v: k for (k, v) in word_index.items()}

In [8]:
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [9]:
print(decode_review(padded[1]))
print(training_sentences[1])

b oh yeah jenna jameson did it again yeah baby this movie rocks it was one of the 1st movies i saw of her and i have to say i feel in love with her she was great in this move br br her performance was outstanding and what i liked the most was the scenery and the wardrobe it was amazing you can tell that they put a lot into the movie the girls cloth were amazing br br i hope this comment helps and u can buy the movie the storyline is awesome is very unique and i'm sure u are going to like it jenna amazed us once more and no wonder the movie won so many
b"Oh yeah! Jenna Jameson did it again! Yeah Baby! This movie rocks. It was one of the 1st movies i saw of her. And i have to say i feel in love with her, she was great in this move.<br /><br />Her performance was outstanding and what i liked the most was the scenery and the wardrobe it was amazing you can tell that they put a lot into the movie the girls cloth were amazing.<br /><br />I hope this comment helps and u can buy the movie, the

In [10]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy', optimzer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [11]:
EPOCHS = 10

In [12]:
model.fit(padded, 
          training_labels_final, 
          epochs=EPOCHS, 
          validation_data=(testing_padded, testing_labels_final))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x646968210>

In [13]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [14]:
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, VOCAB_SIZE):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')
out_v.close()
out_m.close()

In [15]:
!head -5 vecs.tsv

-0.04745014	0.0026025411	0.05596583	-0.059168953	-0.06283121	0.020616163	-0.008357188	-0.046138056	0.05781578	-0.057713524	-0.053424075	0.03596576	0.0271413	-0.03916733	-0.040071506	0.005358046
-0.006396133	-0.021437377	-0.04599807	0.06988949	-0.047293503	-0.032911733	0.0002758297	-0.018675473	0.11079027	-0.10474132	-0.054565985	0.10754361	0.01289633	0.0105396295	-0.045670807	0.07156635
0.051415134	-0.07951907	-0.06899786	0.006266695	-0.0070972154	0.06113209	-0.004937467	-0.040026344	-0.036127765	-0.044795144	-0.06647143	0.020407505	0.01597408	0.00060057343	-0.019641155	0.051893827
-0.0067786705	-0.00855262	-0.05882022	-0.010855984	0.031774197	0.09475773	-0.08572399	-0.000541135	0.053499445	-0.06679244	0.0016534656	0.073197715	0.00010626367	0.079385936	-0.043730047	-0.007807167
-0.042936057	0.0073939394	0.036284443	-0.01556435	-0.046386458	-0.0025273438	-0.04802028	0.03071915	-0.056327276	-0.05552082	0.005739535	0.1064435	0.025701087	-0.06717295	-0.047799245	0.034542147


In [16]:
!head -5 meta.tsv

<OOV>
the
and
a
of


In [17]:
sent = 'I really think this is just the best. swear to jesus.'
seq = tokenizer.texts_to_sequences(sent)
seq

[[11],
 [],
 [1430],
 [968],
 [4],
 [1537],
 [1537],
 [4738],
 [],
 [790],
 [2015],
 [11],
 [2922],
 [2191],
 [],
 [790],
 [2015],
 [11],
 [579],
 [],
 [11],
 [579],
 [],
 [1382],
 [1221],
 [579],
 [790],
 [],
 [790],
 [2015],
 [968],
 [],
 [59],
 [968],
 [579],
 [790],
 [],
 [],
 [579],
 [2012],
 [968],
 [4],
 [1430],
 [],
 [790],
 [1005],
 [],
 [1382],
 [968],
 [579],
 [1221],
 [579],
 []]