In [6]:
import tensorflow_datasets as tfds

In [7]:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [9]:
import numpy as np

train_data, test_data = imdb['train'], imdb['test']

In [26]:
train_sentences = []
test_sentences = []

train_labels = []
test_labels = []

for sentence, label in train_data:
    train_sentences.append(str(sentence.numpy()))
    train_labels.append(label.numpy())

for sentence, label in test_data:
    test_sentences.append(str(sentence.numpy()))
    test_labels.append(label.numpy())

In [27]:
train_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [28]:
train_labels[0]

0

In [29]:
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [31]:
vocab_size = 10000
embedding_dim = 64
max_length = 200

In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [42]:
tokenizer = Tokenizer(num_words= vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences) # Build dictionary based on frequency of words

In [43]:
train_sequences = tokenizer.texts_to_sequences(train_sentences) # Based on the dictionary, look up the word index in dictionary

In [46]:
# If sentences too long, cut off else add more 0
padded_train_sequences = pad_sequences(train_sequences, maxlen=max_length, truncating='post', padding='post')

In [47]:
padded_train_sequences.shape

(25000, 200)

In [48]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [49]:
padded_test_sequences = pad_sequences(test_sequences, maxlen=max_length, truncating='post', padding='post')

In [51]:
padded_test_sequences.shape

(25000, 200)

In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

In [53]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten

In [56]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])


In [57]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 64)           640000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                128010    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 768,021
Trainable params: 768,021
Non-trainable params: 0
_________________________________________________________________


In [58]:
model.fit(padded_train_sequences, train_labels, epochs=10, validation_data=(padded_test_sequences, test_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e1298ea8e0>

## Test model

In [79]:
results = {0: "negative", 1: "positive"}

In [80]:
test_input = [input()]
test_input_sequence = tokenizer.texts_to_sequences(test_input)
padded_test_input_sequence = pad_sequences(test_input_sequence, maxlen=max_length, truncating='post', padding='post')
if(model.predict(padded_test_input_sequence)[0][0] < 0.5):
    print(results[0])
else:
    print(results[1])

positive
