# Importing Libraries

In [22]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download & Explore Dataset

In [4]:
imdb, metadata = tfds.load('imdb_reviews',
                           with_info=True,
                           as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFV7E3I/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFV7E3I/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFV7E3I/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [5]:
metadata

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [9]:
print(metadata.name)

num_classes = metadata.features['label'].num_classes
print(num_classes)

imdb_reviews
2


In [13]:
class_names = metadata.features['label'].names
print(class_names)

['neg', 'pos']


In [11]:
imdb

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [10]:
train_data, test_data = imdb['train'], imdb['test']

In [14]:
len(train_data), len(test_data)

(25000, 25000)

In [19]:
train_sentences = []
train_labels = []

test_sentences = []
test_labels = []

for sent, label in train_data:
  train_sentences.append(sent.numpy().decode('utf8'))
  train_labels.append(label.numpy())

for sent, label in test_data:
  test_sentences.append(sent.numpy().decode('utf8'))
  test_labels.append(label.numpy())


train_labels_final = np.array(train_labels)
test_labels_final = np.array(test_labels)

In [20]:
len(train_labels_final), len(test_labels_final)

(25000, 25000)

# Tokenization & Sequence Padding

In [21]:
vocab_size = 10000
embedding_dim = 16
trunc_type = 'post'
oov_token = '<OOV>'
max_length = 120

In [25]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_sentences)

word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(train_sentences)
padded_seq = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded_seq = pad_sequences(test_sequences, maxlen=max_length)

In [29]:
len(word_index)

88583

In [27]:
padded_seq[0]

array([   0,    0,    0,   12,   14,   33,  425,  392,   18,   90,   28,
          1,    9,   32, 1366, 3585,   40,  486,    1,  197,   24,   85,
        154,   19,   12,  213,  329,   28,   66,  247,  215,    9,  477,
         58,   66,   85,  114,   98,   22, 5675,   12, 1322,  643,  767,
         12,   18,    7,   33,  400, 8170,  176, 2455,  416,    2,   89,
       1231,  137,   69,  146,   52,    2,    1, 7577,   69,  229,   66,
       2933,   16,    1, 2904,    1,    1, 1479, 4940,    3,   39, 3900,
        117, 1584,   17, 3585,   14,  162,   19,    4, 1231,  917, 7917,
          9,    4,   18,   13,   14, 4139,    5,   99,  145, 1214,   11,
        242,  683,   13,   48,   24,  100,   38,   12, 7181, 5515,   38,
       1366,    1,   50,  401,   11,   98, 1197,  867,  141,   10],
      dtype=int32)

In [28]:
test_padded_seq[0]

array([  11,  772, 1498,   12,  252,  235,   11,  217,    2,  366, 6454,
          3,   58,   93,   11,   90,  102,   11, 1498,  177,   12,  252,
         36,    6, 1126,    1,  674,    7, 4387,    1,    4,    1,  327,
          7,   36, 8300,  366,    5, 1403,    1,   13,   29,   60,   26,
          6,  867,  178,   17,    4, 1037,    5,   12,  227,    3,   79,
          4,  345,   32,  345, 5159,    5,   10,    6, 1314, 1143,    2,
       5619,    1,    3,    1,    5,   10,  173,  322,    7, 1293, 3938,
          4,  788, 1909,    5,    4,  250, 2673,  165,    3,    2,  352,
         30,  185,   24, 1154,  223,  599,    5,    2,  118,    2,  348,
       1382, 7675,   29,    1,  871,   37,    4,   20,   38,   12,    1,
          4,    1,  327,    7,    4,   20,  624,   56,   46,  214],
      dtype=int32)

In [30]:
reverse_word_index = dict([(v, k) for (k, v) in word_index.items()])
len(word_index), len(reverse_word_index)

(88583, 88583)

In [32]:
reverse_word_index[1], word_index['<OOV>']

('<OOV>', 1)

In [33]:
def decode_review(text):
  sentence = ' '.join([reverse_word_index.get(i, '?') for i in text])
  return sentence

In [34]:
train_sentences[2]

'Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Mountie telling the people of Dawson City, Yukon to elect themselves a marshal (yes a marshal!) and to enforce the law themselves, then gunfighters battling it out on the streets for control of the town? <br /><br />Nothing even remotely resembling that happened on the Canadian side of the border during the Klondike gold rush. Mr. Mann and company appear to have mistaken Dawson City for Deadwood, the Canadian North for the American Wild West.<br /><br />Canadian viewers be prepared for a Reefer Madness type of enjoyable howl with this ludicrous plot, or, to shake your head in disgust.'

In [35]:
decode_review(padded_seq[2])

'mann photographs the <OOV> rocky mountains in a superb fashion and jimmy stewart and walter brennan give enjoyable performances as they always seem to do br br but come on hollywood a <OOV> telling the people of dawson city <OOV> to <OOV> themselves a <OOV> yes a <OOV> and to <OOV> the law themselves then <OOV> battling it out on the streets for control of the town br br nothing even remotely resembling that happened on the canadian side of the border during the <OOV> gold rush mr mann and company appear to have mistaken dawson city for <OOV> the canadian north for the american wild west br br canadian viewers be prepared for a <OOV> madness type of enjoyable'

# Create & Train the Model

In [36]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,
                              embedding_dim,
                              input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [37]:
model.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)

In [38]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [39]:
num_epochs = 10

history = model.fit(x=padded_seq,
                    y=train_labels_final,
                    epochs=num_epochs,
                    validation_data=(test_padded_seq, test_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
model.evaluate(test_padded_seq, test_labels_final)



[0.8058513402938843, 0.8311200141906738]

# To visualize Embedding

In [41]:
e = model.layers[0]
e

<keras.layers.embeddings.Embedding at 0x7fc9ce733f50>

In [46]:
e.get_weights()[0]

array([[-0.01138299, -0.00030242,  0.0108454 , ...,  0.01391922,
        -0.0675709 , -0.01821198],
       [-0.02216356, -0.0008537 , -0.02369728, ..., -0.03723429,
        -0.11399993, -0.00278032],
       [-0.02517587,  0.04958963,  0.06942984, ..., -0.04436871,
        -0.08609332, -0.00736511],
       ...,
       [-0.05549402, -0.01041661,  0.02973922, ..., -0.00698989,
        -0.03028785, -0.08515079],
       [-0.09406087, -0.06975809,  0.07446248, ...,  0.09377393,
        -0.01708088, -0.11189012],
       [-0.11059123, -0.01811633,  0.06330667, ..., -0.04902273,
        -0.01390117, -0.13614036]], dtype=float32)

In [44]:
weights = e.get_weights()[0]
weights.shape

(10000, 16)

In [47]:
# Get vectors and metadata

import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [None]:
# To download these files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

# Test the Model

In [56]:
sentence = ["I really think this is amazing. honest.", 
            "It is a limited budget Indie action film that has the look, feel, and heartfelt acting of a high-quality big budget movie",
            "I watch a lot of movies and I like to give them all a chance just in case there is something interesting or exciting to warrant a viewing Unfortunately this movie has none of these features it is pointless and offers nothing in the way of story line,acting or direction The plot is non-existent with the actors just going through the motions and the dialogue is sooo boring its embarrassing. I wish the previous reviewers had posted earlier as this would have saved me 95 mins of my time"]

test_seq = tokenizer.texts_to_sequences(sentence)
print(test_seq)

[[11, 64, 102, 12, 7, 478, 1200], [10, 7, 4, 1761, 350, 2686, 204, 20, 13, 46, 2, 166, 233, 3, 5342, 114, 5, 4, 310, 487, 192, 350, 18], [11, 104, 4, 174, 5, 100, 3, 11, 38, 6, 200, 96, 30, 4, 578, 41, 9, 418, 48, 7, 140, 219, 40, 1124, 6, 8151, 4, 827, 470, 12, 18, 46, 599, 5, 132, 942, 10, 7, 1148, 3, 1580, 162, 9, 2, 94, 5, 63, 345, 114, 40, 456, 2, 112, 7, 697, 2965, 17, 2, 154, 41, 168, 141, 2, 7019, 3, 2, 413, 7, 1, 355, 92, 2268, 11, 655, 2, 958, 1987, 67, 5563, 906, 15, 12, 60, 26, 1892, 70, 5968, 7470, 5, 59, 56]]


In [57]:
pad_sent = pad_sequences(test_seq, maxlen=max_length, truncating=trunc_type)
print(pad_sent)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0   11   64  102   12    7  478 1200]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0

In [58]:
model.predict(pad_sent)

array([[9.8583376e-01],
       [9.9387133e-01],
       [1.7252867e-07]], dtype=float32)