**General Work Process**
1. Import dataset and preprocess
2. Train model
3. Test model

In [100]:
import io
import os
import re
import shutil
import string
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras import Sequential, layers, losses
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [4]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [8]:
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [9]:
# view train data files
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [10]:
# clean unnecessary empty folder
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [11]:
batch_size = 1024
seed = 10

train_data = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size, 
    validation_split=0.2,
    subset='training', 
    seed=seed)

val_data = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.2,
    subset='validation', 
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [20]:
# sample batch from train data
for text_batch, label_batch in train_data.take(1):
    
    # view the first 5 samples
    for i in range(5):
        print(label_batch[i].numpy(), text_batch.numpy()[i])

1 b"This film is more about how children make sense of the world around them, and how they (and we) use myth to make sense of it all. I think it's been misperceived, everyone going in expecting a stalkfest won't enjoy it but if you want a deeper story, it's here......."
0 b'God, I was bored out of my head as I watched this pilot. I had been expecting a lot from it, as I\'m a huge fan of James Cameron (and not just since "Titanic", I might add), and his name in the credits I thought would be a guarantee of quality (Then again, he also wrote the leaden Strange Days..). But the thing failed miserably at grabbing my attention at any point of its almost two hours of duration. In all that time, it barely went beyond its two line synopsis, and I would be very hard pressed to try to figure out any kind of coherent plot out of all the mess of strands that went nowhere. On top of that, I don\'t think the acrobatics outdid even those of any regular "A-Team" episode. As for Alba, yes, she is gorge

In [22]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_data.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_data.cache().prefetch(buffer_size=AUTOTUNE)

In [23]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
      lowercase = tf.strings.lower(input_data)
      stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
      return tf.strings.regex_replace(stripped_html,
                                      '[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
                    standardize=custom_standardization,
                    max_tokens=vocab_size,
                    output_mode='int',
                    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [24]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(32, activation='relu'),
  Dense(1)
])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [25]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2e49879908>

In [26]:
%load_ext tensorboard
%tensorboard --logdir logs

In [30]:
# get the trained word embeddings
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [33]:
vocab[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

In [34]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

## Test model

In [36]:
# view test data files
test_dir = os.path.join(dataset_dir, 'test')
os.listdir(test_dir)

['labeledBow.feat', 'neg', 'pos', 'urls_neg.txt', 'urls_pos.txt']

In [102]:
test_data = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test')

Found 25000 files belonging to 2 classes.


In [62]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [103]:
# sample batch from test data
for test_text_batch, test_label_batch in test_data.take(1):
    
    # view the first 5 samples
    for i in range(5):
        print(test_label_batch[i].numpy(), test_text_batch.numpy()[i])

0 b"An insult to both poker and cinema, this movie manages to make the most dynamic, brilliant, and fascinating figure in poker history into an utter bore. Still a fun film to make jokes about, from the lame gangster movie clich\xc3\xa9s of the first half to the incomprehensible nonsense of that second hour. Hilariously, Stu Ungar wins all three of his World Series titles without playing a single hand on screen. His infamous dealer abuse? 1 scene. His coke habit? 1 scene. His incredible memory? 0 scenes. They couldn't even get any real poker players. What did they cover? A lot of high angle shots from inside a house in the suburbs. Oh, and a montage of Stu waking up every day and shopping for meat which doesn't come anywhere close to making sense. Why do I care so much about this little Sopranos summer camp trying to cash in on the poker craze? Because I think there's still a great film to be made about Stu Ungar waiting for someone willing to do it right."
0 b'(SMALL SPOILERS) I just 

In [104]:
text_batch, label_batch = next(iter(test_data))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", test_data.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'This film biography of early rock and roll star Buddy Holly (1936-1959) is a tour de force for Gary Busey. The movie\'s highlights are Busey\'s stage performances where he plays guitar and sings Holly songs. He brings such energy to the performances that Holly\'s own filmed performances almost pale in comparison. Busey\'s infectious toothy grin lights up the screen, he creates a totally believable and winning personality and his Oscar nomination for best actor was well deserved.<br /><br />The film follows Holly\'s career from growing up in Lubbock, Texas, to stardom and New York and his untimely death in a plane crash. One thing I found interesting, if true, was Buddy\'s driving ambition--he had great plans to go beyond recording and performance to producing. As young as he was he was already establishing himself as a shrewd businessman and definitely wanted to take things to a higher level. We will never know if he would have ultimately catapulted his early success

In [91]:
# the vectorize function is not required to process the test data
# if the vectorize layer included in model

# test_ds = test_data.map(vectorize_text)

# # sample batch from test data
# for test_text_batch, test_label_batch in test_ds.take(1):
#     for i in range(1):
#         print(test_label_batch[i].numpy(), test_text_batch.numpy()[i])

In [106]:
loss, accuracy = model.evaluate(test_data)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.40294232964515686
Accuracy:  0.8024799823760986


In [109]:
export_model = tf.keras.Sequential([
  model,
  layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(test_data)
print(accuracy)

0.5178800225257874


In [None]:
text_batch, label_batch = next(iter(test_data))
first_review, first_label = text_batch[0], label_batch[0]

In [110]:
pred_label = export_model.predict(test_data)

In [111]:
pred_label

array([[0.61396766],
       [0.45945567],
       [0.31570596],
       ...,
       [0.13894525],
       [0.8771089 ],
       [0.35029382]], dtype=float32)

In [136]:
pred_label.shape

(25000, 1)

In [137]:
pred_y = []

for i in range(len(pred_label)):
    pred_y.append(round(pred_label[i][0]))

In [143]:
len(pred_y)

25000

In [144]:
actual_y = []
for tt, ll in test_data:
    for l in ll:
        actual_y.append(l.numpy())

In [148]:
correct = 0
for i in range(len(pred_y)):
    if pred_y[i] == actual_y[i]:
        correct+=1

In [150]:
correct/len(pred_y)*100

49.94

**Analyze my own review**

In [151]:
my_reviews =["The new movie is popular and awesome",
             "The background music is annoying and too loud",
             "We are very enjoy the movie",
             "Negative comment in internent is hurt people",
             "The smile is very sweat and cute!",
             "The view is so beautiful and attrative",
             ]

In [152]:
export_model.predict(my_reviews)

array([[0.61664504],
       [0.58398956],
       [0.63175166],
       [0.3898934 ],
       [0.6680643 ],
       [0.5461671 ]], dtype=float32)