<a href="https://colab.research.google.com/github/cschlicht/asg3/blob/main/1_2_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tensorflow-addons
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.test.is_gpu_available():
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current Matplotlib figure to ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension); also echoed to stdout.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    filename = ".".join((fig_id, fig_extension))
    path = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

[?25l[K     |▎                               | 10 kB 11.3 MB/s eta 0:00:01[K     |▋                               | 20 kB 16.4 MB/s eta 0:00:01[K     |▉                               | 30 kB 20.4 MB/s eta 0:00:01[K     |█▏                              | 40 kB 11.5 MB/s eta 0:00:01[K     |█▌                              | 51 kB 8.4 MB/s eta 0:00:01[K     |█▊                              | 61 kB 6.2 MB/s eta 0:00:01[K     |██                              | 71 kB 7.0 MB/s eta 0:00:01[K     |██▍                             | 81 kB 6.1 MB/s eta 0:00:01[K     |██▋                             | 92 kB 6.7 MB/s eta 0:00:01[K     |███                             | 102 kB 7.1 MB/s eta 0:00:01[K     |███▎                            | 112 kB 7.1 MB/s eta 0:00:01[K     |███▌                            | 122 kB 7.1 MB/s eta 0:00:01[K     |███▉                            | 133 kB 7.1 MB/s eta 0:00:01[K     |████▏                           | 143 kB 7.1 MB/s eta 0:00:01[K 

In [2]:
# Reset TensorFlow's global random seed to 42 before the experiments below.
tf.random.set_seed(42)

In [3]:
# Load the pre-tokenised IMDB reviews bundled with Keras. load_data() returns
# (train_inputs, train_labels), (test_inputs, test_labels).
# The training labels get their own name (`y_train`); previously both label
# arrays were unpacked into `y_test`, so the training labels were discarded.
# NOTE(review): `X_valid` holds the *test* reviews; the name is kept so that
# any cell relying on it keeps working.
(X_train, y_train), (X_valid, y_test) = keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [4]:
# Show the first ten integer token ids of the first training review.
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [5]:
# Build the reverse mapping (id -> word) to decode a Keras-encoded review.
word_index = keras.datasets.imdb.get_word_index()
# Word ids are shifted by 3 so that ids 0, 1 and 2 can hold the special tokens.
id_to_word = {index + 3: word for word, index in word_index.items()}
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
# Decode the first ten tokens of the first training review.
" ".join(id_to_word[token_id] for token_id in X_train[0][:10])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


'<sos> this film was just brilliant casting location scenery story'

In [6]:
import tensorflow_datasets as tfds

# Load the raw-text IMDB reviews. as_supervised=True yields (text, label)
# pairs; with_info=True additionally returns the dataset metadata object.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteD82XTY/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteD82XTY/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteD82XTY/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [7]:
# List the splits available in the loaded dataset dictionary.
datasets.keys()

dict_keys(['test', 'train', 'unsupervised'])

In [8]:
# Number of examples per split, read from the dataset metadata; used later to
# derive the number of steps per epoch.
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples

In [9]:
# Display both split sizes.
train_size, test_size

(25000, 25000)

In [10]:
# Preview the first two training reviews (first 200 characters) and their labels.
# X_batch / y_batch are deliberately left bound: the next cell reuses them.
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    texts, labels = X_batch.numpy(), y_batch.numpy()
    for review, label in zip(texts, labels):
        sentiment = "= Positive" if label else "= Negative"
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, sentiment)
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [11]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Steps: keep only the first 300 units of each review (as counted by
    ``tf.strings.substr``), replace ``<br>``-style tags with a space, replace
    every character other than ASCII letters and apostrophes with a space,
    split on whitespace, and pad the resulting ragged rows with ``b"<pad>"``.

    Args:
        X_batch: Batch of review strings.
        y_batch: Matching labels; returned unchanged.

    Returns:
        Tuple ``(tokens, y_batch)`` where ``tokens`` is the padded token tensor.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    ragged_tokens = tf.strings.split(letters_only)
    padded_tokens = ragged_tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [12]:
# Try preprocess() on the two-review batch left bound by the preview loop.
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [13]:
from collections import Counter

# Count how often each token occurs across the preprocessed training split.
# Padding tokens are counted as well, since preprocess() pads every batch.
vocabulary = Counter()
train_batches = datasets["train"].batch(32).map(preprocess)
for X_batch, y_batch in train_batches:
    for review in X_batch:
        tokens = list(review.numpy())
        vocabulary.update(tokens)

In [14]:
# Three most frequent tokens and their counts.
vocabulary.most_common(3)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [15]:
# Number of distinct tokens counted (the padding token included).
len(vocabulary)

53893

In [16]:
# Keep only the `vocab_size` most frequent tokens, ordered from most to least
# common, so that a token's list position can serve as its id.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [17]:
# Map every kept token to its position in the truncated vocabulary, then show
# the ids of a sample sentence; unknown words are reported as `vocab_size`.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was faaaaaantastic".split():
    # Use the dict default instead of `or`: id 0 is a valid but falsy id, so
    # `get(word) or vocab_size` would report the first vocabulary entry as unknown.
    print(word_to_id.get(word, vocab_size))

22
12
11
10000


In [18]:
# Static lookup table: each kept token maps to its index in the truncated
# vocabulary, and any other token is hashed into one of `num_oov_buckets`
# extra ids placed after the known ones.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(vocab_init,
                                        num_oov_buckets=num_oov_buckets)

In [19]:
# Look up a sample sentence; the made-up last word is not in the vocabulary,
# so it receives an out-of-vocabulary bucket id (>= vocab_size).
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [20]:
def encode_words(X_batch, y_batch):
    """Replace each token in ``X_batch`` with its id from the global ``table``.

    Args:
        X_batch: Tensor of word tokens, as produced by ``preprocess``.
        y_batch: Matching labels; returned unchanged.

    Returns:
        Tuple ``(token_ids, y_batch)``.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch

# Input pipelines: repeat indefinitely, batch by 32, tokenise, encode tokens
# as ids, and prefetch one batch. Because of repeat(), consumers must bound
# iteration themselves (e.g. with take() or steps_per_epoch).
train_set = (
    datasets["train"]
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

test_set = (
    datasets["test"]
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [21]:
# Inspect one encoded training batch: token ids (padded) and the labels.
# take(1) is required because train_set repeats forever.
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [22]:
# LSTM with 64 units and 5 epochs
# embed_size = 128
# model = keras.models.Sequential([
#     keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
#                            mask_zero=True, # not shown in the book
#                            input_shape=[None]),
#     keras.layers.LSTM(64, return_sequences=True),
#     keras.layers.LSTM(64),
#     keras.layers.Dense(1, activation="sigmoid")
# ])
# model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)

In [23]:
# LSTM with 64 units per layer, trained for 20 epochs on the training split.

embed_size = 128
model = keras.models.Sequential()
# Token ids -> dense vectors. The input dimension covers the known vocabulary
# plus the out-of-vocabulary buckets. mask_zero=True (not shown in the book)
# makes downstream layers ignore id 0.
model.add(keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
    mask_zero=True,
    input_shape=[None]))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(keras.layers.LSTM(64, return_sequences=True))
model.add(keras.layers.LSTM(64))
# Single sigmoid unit for the binary sentiment label.
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# train_set repeats forever, so the epoch length is set explicitly.
history = model.fit(train_set, epochs=20, steps_per_epoch=train_size // 32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [24]:
# -------------------------------------- TEST -----------------------------------------------------

# Preview the first two test reviews (first 200 characters) and their labels.
# X_batch / y_batch are deliberately left bound: the next cell reuses them.
for X_batch, y_batch in datasets["test"].batch(2).take(1):
    texts, labels = X_batch.numpy(), y_batch.numpy()
    for review, label in zip(texts, labels):
        sentiment = "= Positive" if label else "= Negative"
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, sentiment)
        print()

Review: There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING ...
Label: 1 = Positive

Review: A blackly comic tale of a down-trodden priest, Nazarin showcases the economy that Luis Bunuel was able to achieve in being able to tell a deeply humanist fable with a minimum of fuss. As an output fro ...
Label: 1 = Positive



In [25]:
# Try preprocess() on the two-review test batch left bound by the preview loop.
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 56), dtype=string, numpy=
 array([[b'There', b'are', b'films', b'that', b'make', b'careers', b'For',
         b'George', b'Romero', b'it', b'was', b'NIGHT', b'OF', b'THE',
         b'LIVING', b'DEAD', b'for', b'Kevin', b'Smith', b'CLERKS',
         b'for', b'Robert', b'Rodriguez', b'EL', b'MARIACHI', b'Add',
         b'to', b'that', b'list', b'Onur', b"Tukel's", b'absolutely',
         b'amazing', b'DING', b'A', b'LING', b'LESS', b'Flawless',
         b'film', b'making', b'and', b'as', b'assured', b'and', b'as',
         b'professional', b'as', b'any', b'of', b'the', b'aforementioned',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
        [b'A', b'blackly', b'comic', b'tale', b'of', b'a', b'down',
         b'trodden', b'priest', b'Nazarin', b'showcases', b'the',
         b'economy', b'that', b'Luis', b'Bunuel', b'was', b'able', b'to',
         b'achieve', b'in', b'being', b'able', b'to', b'tell', b'a',
         b'deeply', b'humanist', b'fable', b'wi

In [26]:
from collections import Counter

# Count token frequencies again, this time over the preprocessed test split.
# NOTE(review): this overwrites the `vocabulary` built from the training
# split. A vocabulary fitted on test data cannot be used to evaluate a model
# trained with the training vocabulary - confirm this is intended.
vocabulary = Counter()
test_batches = datasets["test"].batch(32).map(preprocess)
for X_batch, y_batch in test_batches:
    for review in X_batch:
        tokens = list(review.numpy())
        vocabulary.update(tokens)

In [27]:
# Three most frequent tokens and their counts.
vocabulary.most_common(3)

[(b'<pad>', 218061), (b'the', 61395), (b'a', 38751)]

In [28]:
# Number of distinct tokens counted (the padding token included).
len(vocabulary)

53853

In [29]:
# Keep only the `vocab_size` most frequent tokens, ordered from most to least
# common, so that a token's list position can serve as its id.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [30]:
# Map every kept token to its position in the truncated vocabulary, then show
# the ids of a sample sentence; unknown words are reported as `vocab_size`.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was faaaaaantastic".split():
    # Use the dict default instead of `or`: id 0 is a valid but falsy id, so
    # `get(word) or vocab_size` would report the first vocabulary entry as unknown.
    print(word_to_id.get(word, vocab_size))

22
11
12
10000


In [31]:
# Rebuild the static lookup table from the current truncated vocabulary.
# NOTE(review): this rebinds the global `table`, so any pipeline created
# through encode_words() after this point uses this vocabulary's ids.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(vocab_init,
                                        num_oov_buckets=num_oov_buckets)

In [32]:
# Look up a sample sentence; the made-up last word is not in the vocabulary,
# so it receives an out-of-vocabulary bucket id (>= vocab_size).
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    11,    12, 10053]])>

In [33]:
# Re-create the encoded test pipeline: repeat indefinitely, batch by 32,
# tokenise, encode tokens as ids, and prefetch one batch.
# NOTE(review): encode_words() reads the global `table`, which was last
# rebuilt from the test-split vocabulary - confirm this is intended.
test_set = (
    datasets["test"]
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [34]:
# Inspect one encoded test batch: token ids (padded) and the labels.
# take(1) is required because test_set repeats forever.
for X_batch, y_batch in test_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  139    27    78 ...     0     0     0]
 [   70 10791   839 ...     0     0     0]
 [ 6154   701  7274 ...     0     0     0]
 ...
 [ 5117  5849  4557 ...     0     0     0]
 [  276     6    21 ...     0     0     0]
 [    6    99     9 ...     0     0     0]], shape=(32, 64), dtype=int64)
tf.Tensor([1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 1 1], shape=(32,), dtype=int64)


In [35]:
# LSTM with 64 units and 5 epochs

# embed_size = 128
# model = keras.models.Sequential([
#     keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
#                            mask_zero=True, # not shown in the book
#                            input_shape=[None]),
#     keras.layers.LSTM(64, return_sequences=True),
#     keras.layers.LSTM(64),
#     keras.layers.Dense(1, activation="sigmoid")
# ])
# model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# history = model.fit(test_set, steps_per_epoch=test_size // 32, epochs=5)

In [36]:
# LSTM with 64 units per layer, trained for 20 epochs on the TEST split.
# NOTE(review): this builds a brand-new model and fits it on `test_set`, so
# the accuracy reported here is a training accuracy on test data, not a
# measure of how the model trained on the training split generalises. If the
# goal is evaluation, the trained model should be evaluated on the test
# split (encoded with the training vocabulary) instead - confirm the intent.
# It also rebinds `model` and `history`, discarding the earlier training run.

embed_size = 128
model = keras.models.Sequential([
    # Input dimension = known vocabulary plus the out-of-vocabulary buckets.
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    keras.layers.LSTM(64, return_sequences=True),
    keras.layers.LSTM(64),
    # Single sigmoid unit for the binary sentiment label.
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# test_set repeats forever, so the epoch length is set explicitly.
history = model.fit(test_set, steps_per_epoch=test_size // 32, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [37]:
# ------------------------------------------- 1.2 LSTM -------------------------------------------------------------

In [38]:
# Reset TensorFlow's global random seed to 42 before the next section.
tf.random.set_seed(42)