## IMDB Sentimental Classification With DNN

In [1]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [2]:
import tensorflow_datasets as tfds

In [3]:
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/zephyros/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling imdb_reviews-train.tfrecord...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling imdb_reviews-test.tfrecord...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling imdb_reviews-unsupervised.tfrecord...:   0%|          | 0/50000 [00:00<?, ? examples/s]

[1mDataset imdb_reviews downloaded and prepared to /home/zephyros/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [5]:
import numpy as np
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [6]:
for s, l in train_data:
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())

In [7]:
for s, l in test_data:
    testing_sentences.append(str(s.numpy()))
    testing_labels.append(l.numpy())

In [8]:
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [9]:
vocab_size = 10000
embedding_dim = 16
max_length= 100
trunc_type = 'post'
oov_tok = '<OOV>'

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

In [14]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [15]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [18]:
def decode_view(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [19]:
print(decode_view(padded[0]))

b this was an absolutely terrible movie don't be <OOV> in by christopher walken or michael <OOV> both are great actors but this must simply be their worst role in history even their great acting could not redeem this movie's ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the <OOV> rebels were making their cases for <OOV> maria <OOV> <OOV> appeared phony and her pseudo love affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that


In [20]:
print(training_sentences[0])

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."


In [35]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 16)           160000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 6)                 9606      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 7         
Total params: 169,613
Trainable params: 169,613
Non-trainable params: 0
_________________________________________________________________


In [36]:
num_epochs = 10
history = model.fit(padded, training_labels_final, 
                    validation_data=(testing_padded, testing_labels_final),
                    epochs=num_epochs, verbose=2
                   )

Epoch 1/10
782/782 - 2s - loss: 0.5092 - accuracy: 0.7357 - val_loss: 0.4080 - val_accuracy: 0.8108
Epoch 2/10
782/782 - 2s - loss: 0.2681 - accuracy: 0.8948 - val_loss: 0.4267 - val_accuracy: 0.8099
Epoch 3/10
782/782 - 2s - loss: 0.1195 - accuracy: 0.9662 - val_loss: 0.5167 - val_accuracy: 0.7985
Epoch 4/10
782/782 - 2s - loss: 0.0330 - accuracy: 0.9955 - val_loss: 0.6235 - val_accuracy: 0.7913
Epoch 5/10
782/782 - 2s - loss: 0.0116 - accuracy: 0.9988 - val_loss: 0.6987 - val_accuracy: 0.7949
Epoch 6/10
782/782 - 2s - loss: 0.0049 - accuracy: 0.9995 - val_loss: 0.7739 - val_accuracy: 0.7944
Epoch 7/10
782/782 - 2s - loss: 0.0023 - accuracy: 0.9998 - val_loss: 0.8405 - val_accuracy: 0.7917
Epoch 8/10
782/782 - 2s - loss: 6.9031e-04 - accuracy: 1.0000 - val_loss: 0.8936 - val_accuracy: 0.7935
Epoch 9/10
782/782 - 2s - loss: 3.4621e-04 - accuracy: 1.0000 - val_loss: 0.9406 - val_accuracy: 0.7925
Epoch 10/10
782/782 - 2s - loss: 2.0536e-04 - accuracy: 1.0000 - val_loss: 0.9860 - val_accu

In [37]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [38]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')
out_m.close()
out_v.close()