In [2]:
# Import packages
# DL Packages
import tensorflow as tf
import keras

# Others
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import sympy as sym
import seaborn as sns

from sklearn.metrics import confusion_matrix

# Examine the Data:

I'll use the suggested IMDB movie review sentiment classification [dataset](https://keras.io/api/datasets/imdb/). This dataset has 25,000 movie reviews that are rated as either positive or negative. The data is pre-processed so that words have been replaced with positive integers that correspond to the frequency of a word. For example the 19th most common word will be represented as 19 + an offset we specify.

In [46]:
# Reading instructions from https://keras.io/api/datasets/imdb/
# With extra parameters specified

# Start of a sequence is marked as this
start_char = 1

# Words that are skipped because they are too infrequent are replaced by this
oov_char = 2

# Actual words have this index or higher
index_from = 3

# Skip the top n most common words
skip_top = 0

# Only the num_words most frequent words are kept.
# Anything less frequent is treated as oov
num_words = 10000

# Maximum length of the review. Truncate past this point
maxlen = None

# Random seed to make loading deterministic
seed = 17

# Retrieve the training sequences.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(
    start_char=start_char, oov_char=oov_char, index_from=index_from, seed=seed,
    skip_top=skip_top, num_words=None, maxlen=maxlen
)
# Retrieve the word index file mapping words to indices
word_index = keras.datasets.imdb.get_word_index()

# Reverse the word index to obtain a dict mapping indices to words
# And add `index_from` to indices to sync with `x_train`
inverted_word_index = dict(
    (i + index_from, word) for (word, i) in word_index.items()
)

# Update `inverted_word_index` to include `start_char` and `oov_char`
inverted_word_index[start_char] = "[START]"
inverted_word_index[oov_char] = "[OOV]"

# Decode the first sequence in the dataset
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0])

print("x[0]", decoded_sequence)
print("y[0]", y_train[0])

x[0] [START] rudy does it again with this hot off the streets follow up to dolemite this entry is filled with the requisite rudy ray moore raunch humor and martial arts rudy eludes a crazy red neck sheriff in this movie that also features an infamous scene where rudy dives down a steep hill see it for laughs and for a brain blasting hit of blaxploitation magic
y[0] 1


In [47]:
np.unique(y_train)

array([0, 1])

This code looks like it gets the review text, and from this example it seems that y=1 is positive sentiment and y=0 is negative sentiment. Now let's make a function to load the data. For feeding it to our network we want to keep it in the integer encoding, but to examine the data we want the actual text.

In [48]:
def load_review(x_data: np.array, y_data: np.array, i: int, load_txt: bool = False, inverted_word_index:dict = inverted_word_index):
    """
    Loads the review at the specified index from the specified x, y data.

    Optionally converts to text.

    Args:
        x_data (np.array): dataset to load from (either x_train or x_test)
        y_data (np.array): labels to load from (either y_train or y_test)
        i (int): index of review to load
        load_txt (bool, optional): whether to convert integer encoding to text or not. Defaults to False.
        inverted_word_index (dict, optional): mapping of indices to words. Defaults to inverted_word_index.

    Returns:
        review_data, sentiment_label
    """

    review_data = x_data[i]

    if load_txt:
        review_data = " ".join(inverted_word_index[n] for n in x_train[i])

    return review_data, y_data[i]

Armed with this function, let's randomly select some positive and negative reviews to look at:

In [49]:
pos_idx = np.argwhere(y_train == 1).flatten()
neg_idx = np.argwhere(y_train == 0).flatten()

In [50]:
n_samples = 5

print("Positive Reviews...")
for n in range(n_samples):
    i = np.random.choice(pos_idx)
    print(load_review(x_train, y_train, i, load_txt=True))

print("\nNegative Reviews...")
for n in range(n_samples):
    i = np.random.choice(neg_idx)
    print(load_review(x_train, y_train, i, load_txt=True))


Positive Reviews...
("[START] frailty is a non gory horror film that achieves its chills by following the logic and impact of a man's delusion obsession straight into depravity dad we never learn his name is a gentle man and loving father who's raising his sons alone after mom died giving birth to the youngest son adam the family's world flips upside down late one night when dad rushes into the boys' room and tells them god has given him a vision and what a vision \x96 the entire family's job is to destroy demons who of course are disguised in human form br br proceeding from this premise the movie is unflinching in following it dad kidnaps people demons whom god has told him to destroy binds them lays his hand on them to see a vision of their evil then kills them \x96 while making his young sons watch fenton the older boy is horrified seeing only a father who's turned into a crazed murderer adam the younger is uncomfortable but trusts that dad is following god's will eventually dad ta

# Pre-Process Data:

For the assignment, I want to try two different approaches: the default frequency-based embeddings of the dataset, and using a GLOVE embedding to see if we're able to pick out more information. We're already good to go for the first one, but we need to do some word for the second one.

## Create GLOVE embedding

Followed the instructions from: https://keras.io/examples/nlp/pretrained_word_embeddings/

In [35]:
pos_idx

array([    0,     4,     5, ..., 24996, 24998, 24999])

# Make/Train a Network

## RNN

In [59]:
# 10000 most common words, plus the out of dictionary
model = keras.Sequential([
    keras.layers.Embedding(input_dim=10000, output_dim=16),
    keras.layers.SimpleRNN(64, return_sequences=True, activation="relu"),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(2, activation="sigmoid")
    ], name="Simple RNN")
model.build()
model.compile(optimizer="adam", loss="binary_crossentropy", 
              metrics=[keras.metrics.CategoricalAccuracy()])
model.summary()

Model: "Simple RNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, None, 16)          160000    
                                                                 
 simple_rnn_8 (SimpleRNN)    (None, None, 64)          5184      
                                                                 
 dense_16 (Dense)            (None, None, 16)          1040      
                                                                 
 dense_17 (Dense)            (None, None, 2)           34        
                                                                 
Total params: 166258 (649.45 KB)
Trainable params: 166258 (649.45 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## LSTM