In [1]:
import numpy as np
import tensorflow_datasets as tfds
#loading imdb reviews data. Needed only the first time.
imdb, info = tfds.load('imdb_reviews',with_info=True, as_supervised=True)

In [2]:
train_data, test_data=imdb['train'], imdb['test']

In [3]:
training_sentences=[]
training_labels=[]
testing_sentences=[]
testing_labels=[]

for s,l in train_data:
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())
for s,l in test_data:
    testing_sentences.append(str(s.numpy()))
    testing_labels.append(l.numpy())

In [4]:
training_labels_final=np.array(training_labels)
testing_labels_final=np.array(testing_labels)

In [5]:
vocab_size=10000
embedding_dim=16
trunc_type='post'
max_length=120
oov_token='<OOV>'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer=Tokenizer(num_words=vocab_size,oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index=tokenizer.word_index

vocabulary=len(list(word_index.items()))
print("Unique Words:"+str(vocabulary))
print('Word indices:'+str([(k,v) for k,v in list(word_index.items())[100:105]])+'\n')

sequences=tokenizer.texts_to_sequences(training_sentences)
print('\Sentence:'+training_sentences[0]+'\nSequence:'+str(sequences[0]))

padded=pad_sequences(sequences,maxlen=max_length,truncating=trunc_type)
print('\Padded Sequence:'+ str(padded[0]))

testing_sequences=tokenizer.texts_to_sequences(testing_sentences)
testing_padded=pad_sequences(testing_sequences,maxlen=max_length)

Unique Words:86539
Word indices:[('movies', 101), ('any', 102), ("it's", 103), ('after', 104), ('think', 105)]

\Sentence:b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
Sequence:[59, 12, 14, 35, 439, 400, 18, 174, 29, 1, 9, 33, 1378, 3401, 42, 496, 1, 197, 25, 88, 156, 19, 12, 211, 340, 29, 70, 248, 213, 9, 486, 62, 70, 88, 116, 99,

In [6]:
import tensorflow as tf

model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [7]:
padded.shape

(25000, 120)

In [8]:
testing_padded.shape

(25000, 120)

In [9]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.fit(padded, training_labels_final, epochs=10, validation_data=(testing_padded, testing_labels_final))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1346385d0>

In [11]:
embed_layer = model.layers[0]
weights = embed_layer.get_weights()[0]
print(weights.shape)

(10000, 16)


In [12]:
import io

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [13]:
new_sents = [
    'This movie was awesome. Really loved it.',
    'I almost fell asleep watching this.',
    'This movie is so stupid.',
    'I wouldn\'t watch this movie unless there\'s a gun pointed at me.',
    'This movie is so hilarious. Much love to the director!'
]
new_seq = tokenizer.texts_to_sequences(new_sents)
padded=pad_sequences(new_seq, maxlen=max_length,truncating=trunc_type)
output=model.predict(padded)
for i in range(0,len(new_sents)):
    print('Review:'+new_sents[i]+' '+'Sentiment:'+str(output[i])+'\n')

Review:This movie was awesome. Really loved it. Sentiment:[0.9884066]

Review:I almost fell asleep watching this. Sentiment:[0.13280706]

Review:This movie is so stupid. Sentiment:[0.10917035]

Review:I wouldn't watch this movie unless there's a gun pointed at me. Sentiment:[0.01397293]

Review:This movie is so hilarious. Much love to the director! Sentiment:[0.93647313]

