In [1]:
# Import Needed Libraries
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the dataset
imdb,info = tfds.load('imdb_reviews',with_info = True,as_supervised = True)

In [3]:
# Get the train and test data
train_data = imdb['train']
test_data = imdb['test']
print(type(train_data))

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>


In [4]:
# Change the tensors in our data to numpy arrays and futher to string in case of text
train_sentences = []
train_labels = []
test_sentences = []
test_labels = []
for s,l in train_data:
    train_sentences.append(str(s.numpy()))
    train_labels.append(l.numpy())
for s,l in test_data:
    test_sentences.append(str(s.numpy()))
    test_labels.append(l.numpy())

In [5]:
# See an example
train_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [6]:
# Change labels to numpy array to be able to feed it to the model
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [7]:
# Hyper-params and tokenizer Initialization 
vocab_size = 10000
trunc_type = 'post'
padd_type = 'post'
embed_dims = 32
max_len = 120
oov_tok = '<OOV>'

tokenizer = Tokenizer(num_words = vocab_size,oov_token = oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index =  tokenizer.word_index

In [8]:
# Change data to sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [9]:
# Make all the examples of same length : truncate or pad if neccesary
train_padded = pad_sequences(train_sequences,maxlen =max_len,truncating = trunc_type, padding = padd_type)
test_padded = pad_sequences(test_sequences,maxlen =max_len,truncating = trunc_type, padding = padd_type)

In [10]:
# Define the model
model = tf.keras.models.Sequential([tf.keras.layers.Embedding(vocab_size,embed_dims,input_length = max_len),
                                   tf.keras.layers.GlobalAveragePooling1D(),
                                   tf.keras.layers.Dense(32,activation='relu'),
                                   tf.keras.layers.Dense(1,activation='sigmoid')])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 32)           320000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 321,089
Trainable params: 321,089
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Compile the model
model.compile(loss= 'binary_crossentropy',optimizer='adam',metrics = ['accuracy'])

In [12]:
# Train
history = model.fit(train_padded,train_labels,epochs=10,validation_data = (test_padded,test_labels) )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Let's get the embedding matrix(weights of the embedding layer)
all_layers = model.layers
needed_layer = all_layers[0]
weight = needed_layer.get_weights()[0]
print(weight.shape) # Shape will be dimension as of number of features(size of dictionary) and embedding dimensions

(10000, 32)


In [25]:
print(weight[0])

[ 0.0973591   0.00079878  0.03082024  0.0144355   0.01587866 -0.02463027
 -0.04811297 -0.04403329 -0.0025988  -0.06331483 -0.01936501 -0.11508726
 -0.01364971 -0.04665851  0.02233699 -0.01682792  0.03840815 -0.02457084
  0.02571667 -0.01190272  0.01988457  0.16575535 -0.06886932  0.01201383
  0.05539486 -0.02649903  0.07671788  0.02958187 -0.01838448  0.028827
  0.02292221  0.00339327]


In [23]:
reverse_word_index = dict([(index,word) for (word,index) in word_index.items()])

In [30]:
# We can project the data at - projector.tensorflow.org
import io
out_v = io.open('vecs.tsv','w',encoding='utf-8')
out_m = io.open('meta.tsv','w',encoding='utf-8')
for word in range(1,vocab_size):
    curr_word = reverse_word_index[word]
    embeddings = weight[word]
    out_m.write(curr_word+"\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()