In [2]:
import tensorflow_datasets as tfds
imdb, info= tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [18]:
import numpy as np
#25000 for training and 25000 for testing
train_data, test_data= imdb['train'], imdb['test']

training_sentences= []
training_labels=[]
testing_sentences= []
testing_labels= []

#label=1 represents positive review and label=1 ==> -ve review
#each loops over 25000 iterables that contain sentences and labels as TENSORS
for s,l in train_data:
    #here s and l are tensors so before appending into lists we convert them into 
    #numpyBYTES and numpyINT respectively
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())
for s,l in test_data:
    testing_sentences.append(str(s.numpy()))
    testing_labels.append(l.numpy())

training_labels_final= np.array(training_labels)
testing_labels_final= np.array(testing_labels)

In [32]:
#PREPROCESSING DATA I.E. converting text into sequence of numbers using Tokenizer
vocab_size=10000    #assuming max unique words 10000. if there are more than 10000 words they will be ignored
embedding_dim=16    #basically vector (word representation) will be in 16 dimensions
max_length=120      #max width of padded matrix (i.e. truncated width of each sentence)
trunc_type='post'
oov_tok= "<OOV>"   #token given to words in sequence which are not encountered before while fitting

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#TOKENIZER FITTING ON TRAINING TEXTS ONLY ###################
tokenizer= Tokenizer(num_words= vocab_size, oov_token= oov_tok)
tokenizer.fit_on_texts(training_sentences)
#word index returns dictionary with key->word, value->token given to that word
word_index= tokenizer.word_index
#getting encoded code for sentences
sequences= tokenizer.texts_to_sequences(training_sentences)


#input size feeded to NN should be uniform. For that PADDING is done!
#list of sentences have been padded out into sentences

#matrix width = length of longest sentence. you can overwrite that with 'maxlen' parameter
#eg:- if you want your sentences to have only 5 words then maxlen=5. it truncates from beginning (pre)
#you can change that to 'post' to truncate the words from last
padded= pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
print(padded.shape)

testing_sequences= tokenizer.texts_to_sequences(testing_sentences)
testing_padded= pad_sequences(testing_sequences, maxlen=max_length)

(25000, 120)


In [20]:
reverse_word_index= dict([(value,key) for (key,value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

#decode review this is what is fed in NN
print(decode_review(padded[1]))
print(training_sentences[1])

? ? ? ? ? ? ? b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the <OOV> and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development was constant constantly slow and boring things seemed to happen but with no explanation of what was causing them or why i admit i may have missed part of the film but i watched the majority of it and everything just seemed to happen of its own <OOV> without any real concern for anything else i cant recommend this film at all '
b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of 

In [21]:
#DEFINING THE MODEL
#here LSTM and GRU are type RNN. Try using them one by one in place of Flatten layer
import tensorflow as tf
model= tf.keras.Sequential([ 
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    #tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(6, activation= 'relu'),
    tf.keras.layers.Dense(1, activation= 'sigmoid')])

model.compile(loss='binary_crossentropy', optimizer= 'adam', metrics= ['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [22]:
num_epochs=10
model.fit(padded, training_labels_final, epochs= num_epochs, validation_data=(testing_padded, testing_labels_final))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2aa8b76b38>

In [25]:
#what we'll do to feel this in embeding projector
e=model.layers[0]   #taking output of my Embedding layer
weights= e.get_weights()[0]
print(weights.shape)  #shape= (vocab_size, embedding_size)

[[ 0.01769426 -0.01639816 -0.01922341 ... -0.02487399 -0.02925328
   0.01707141]
 [ 0.06581753 -0.05216534 -0.02537935 ... -0.05186357  0.00225915
  -0.02025491]
 [ 0.01910185 -0.12874885 -0.03501788 ... -0.06317361  0.03781614
  -0.07741185]
 ...
 [ 0.01552939 -0.18754904  0.0100338  ... -0.0508533  -0.0367876
   0.09208517]
 [ 0.00716479 -0.07919207  0.04087885 ...  0.02757098  0.05401395
   0.05520029]
 [ 0.06409617 -0.06542336  0.07535129 ...  0.13620397 -0.0413507
   0.02367678]]
(10000, 16)


In [31]:
#to get value of 16 dimensions for each word and write that in "out_v"
#"out_m" contains actual word associated with that 16D vector
import io

out_v= io.open('vecs.tsv', 'w', encoding= 'utf-8')
out_m= io.open('meta.tsv', 'w', encoding= 'utf-8')
for word_num in range(1, vocab_size):
    word= reverse_word_index[word_num]
    embeddings= weights[word_num]
    out_m.write(word + "\a")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

[ 0.06581753 -0.05216534 -0.02537935 -0.12395547 -0.05069927  0.0150993
  0.08432467 -0.04918864 -0.02570084 -0.05915948  0.03602956 -0.02155136
 -0.01719932 -0.05186357  0.00225915 -0.02025491]


In [None]:
""""now vecs.tsv and meta.tsv have been made into your directory on PC
now go to link- 'projector.tensorflow.org'
then from left panel load vector file (vector for each word) and 
meta file (actual word). then see clusters have been formed for +ve
and -ve reveiw words"""