# RNN

- An RNN takes the order of words into account.

In [1]:
import tensorflow as tf

# Fix TensorFlow's global random seed so that repeated runs start from the same seed.
SEED_NUM = 7
tf.random.set_seed(SEED_NUM)

## Import preprocessed data
- The RNN takes tokenized, integer-encoded word sequences as input

In [2]:
import numpy as np

In [3]:
# train inputs
# Pass the path straight to np.load so NumPy opens and closes the file itself;
# wrapping it in a bare open(...) left the file handle unclosed.
train_input = np.load("Preprocessed/train_inputs.npy")
train_input

array([[26113,   122,     1, ..., 18720,   317,  1358],
       [  234,   206,  3051, ...,     0,     0,     0],
       [ 6053,  4960,   460, ...,   702,  1190,  5314],
       ...,
       [  119,  3108,    16, ...,     0,     0,     0],
       [  640,   518, 16618, ...,     0,     0,     0],
       [  111,     1,   350, ...,     0,     0,     0]], dtype=int32)

In [4]:
# train labels
# Pass the path straight to np.load so NumPy opens and closes the file itself;
# wrapping it in a bare open(...) left the file handle unclosed.
train_label = np.load("Preprocessed/train_labels.npy")
train_label

array([1, 1, 0, ..., 0, 0, 1])

In [5]:
# word dictionary / preprocessing configuration (provides "vocab_size" used below)
import json  # not imported by any earlier cell, so import it here to avoid a NameError

# Use a context manager so the file handle is closed once the JSON is parsed.
with open("Preprocessed/data_configs.json", "r") as config_file:
    prepo_configs = json.load(config_file)

## Hyperparameters

In [6]:
# --- Training settings -------------------------------------------------------
model_name = "rnn_classifier_english"
BATCH_SIZE, NUM_EPOCHS, VALID_SPLIT = 128, 5, 0.2

# --- Layer settings (forwarded to RNNClassifier as keyword arguments) ---------
kwargs = dict(
    model_name=model_name,
    vocab_size=prepo_configs["vocab_size"],   # taken from the preprocessing config
    embedding_dimension=100,
    dropout_rate=0.2,
    lstm_dimension=150,
    dense_dimension=150,
    output_dimension=1,
)

## Define Model Structure

- Define the structure of the RNN as a subclass of `tf.keras.Model`.

In [7]:
class RNNClassifier(tf.keras.Model):
    """Binary text classifier: embedding -> two stacked LSTMs -> dense head.

    Expected keyword arguments:
        model_name: name given to the Keras model.
        vocab_size: input dimension of the embedding layer.
        embedding_dimension: output dimension of the embedding layer.
        lstm_dimension: number of units in each of the two LSTM layers.
        dropout_rate: rate of the shared dropout layer.
        dense_dimension: number of units in the hidden dense layer (tanh).
        output_dimension: number of units in the output layer (sigmoid).
    """

    def __init__(self, **kwargs):
        super().__init__(name=kwargs["model_name"])

        # NOTE: layers are created in the same order and under the same
        # attribute names as before; weight files that are matched by layer
        # order or attribute name may depend on this.
        self.embedding = tf.keras.layers.Embedding(
            input_dim=kwargs["vocab_size"],
            output_dim=kwargs["embedding_dimension"],
        )

        lstm_units = kwargs["lstm_dimension"]
        # The first LSTM returns the whole sequence so the second LSTM can consume it.
        self.lstm_1_layer = tf.keras.layers.LSTM(lstm_units, return_sequences=True)
        self.lstm_2_layer = tf.keras.layers.LSTM(lstm_units)

        # A single dropout layer instance, applied at two points in call().
        self.dropout = tf.keras.layers.Dropout(kwargs["dropout_rate"])

        self.fc1 = tf.keras.layers.Dense(
            units=kwargs["dense_dimension"],
            activation=tf.keras.activations.tanh,
        )
        self.fc2 = tf.keras.layers.Dense(
            units=kwargs["output_dimension"],
            activation=tf.keras.activations.sigmoid,
        )

    def call(self, x):
        """Run the forward pass on a batch of token-id sequences ``x``."""
        embedded = self.dropout(self.embedding(x))                # embed, then dropout
        encoded = self.lstm_2_layer(self.lstm_1_layer(embedded))  # stacked LSTMs
        hidden = self.fc1(self.dropout(encoded))                  # dropout, then tanh dense layer
        return self.fc2(hidden)                                   # sigmoid output (binary classification)

## Model Compiling

- We are going to use the Adam optimizer.

In [8]:
# Instantiate the RNN from the layer hyperparameters
rnn = RNNClassifier(**kwargs)

# Compile the model: Adam optimizer, binary cross-entropy loss, binary accuracy metric
rnn.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")],
)

## Model Training

---
**EarlyStopping** is used to prevent overfitting.
- `monitor`: the validation metric to watch (here, validation accuracy)
- `min_delta`: the minimum change in the monitored metric that counts as an improvement (accuracy must improve by at least this value)
- `patience`: the number of epochs without improvement to tolerate (training stops if accuracy does not improve for this many epochs)

--- 
**ModelCheckpoint** saves the model at the end of each epoch.
- `save_best_only`: only save the best performing model
- `save_weights_only`: only save model weights instead of entire model graph

In [None]:
import os

# EarlyStopping: watch validation accuracy, require at least a 0.0001 gain,
# and tolerate 2 epochs without improvement.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.0001,
    patience=2,
)

# ModelCheckpoint: weights are written to <SAVE_PATH><model_name>/weight.h5
SAVE_PATH = "Trained_model/"
checkpoint_path = SAVE_PATH + model_name + "/weight.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint folder on first use and report which case applied.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder newly created \n".format(checkpoint_dir))
else:
    print("{} -- Folder already exists \n".format(checkpoint_dir))

# Keep only the weights of the best epoch as judged by validation accuracy.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

# Train the model, holding out VALID_SPLIT of the data for validation.
history = rnn.fit(
    train_input,
    train_label,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_split=VALID_SPLIT,
    callbacks=[earlystop_callback, cp_callback],
)

- Since it takes a while to train the RNN, I used a GPU from Google Colab.
- We will skip the training process and load the trained weights.

In [9]:
# A subclassed model has no variables until it has been called once, and HDF5
# weights can only be loaded into a model whose variables exist. One forward
# pass on a single sample is enough to create them; a full evaluation over the
# whole training set is not needed for that.
rnn(train_input[:1])
rnn.load_weights("IMDB_rnn.h5")



## Model Evaluation

In [10]:
# Score the reloaded RNN. Note that this uses the training data itself,
# not a held-out set.
rnn_scores = rnn.evaluate(x=train_input, y=train_label, verbose=2)
loss, acc = rnn_scores
print("Accuracy of recovered model: {:5.2f}%".format(acc * 100))

782/782 - 35s - loss: 0.1873 - accuracy: 0.9328
Accuracy of recovered model: 93.28%


# CNN

- A CNN takes the local (regional) information of sentences into account.
- We use three Conv1D layers (kernel sizes 3, 4, and 5, respectively), each followed by max pooling, to extract representative features of a sentence.
- The `MaxNorm` constraint regularizes the magnitude of the weight vectors to help prevent the exploding gradient problem. Typical values are 3 and 4.

## Hyperparameters

In [11]:
# --- Training settings -------------------------------------------------------
model_name = "cnn_classifier_en"
BATCH_SIZE, NUM_EPOCHS, VALID_SPLIT = 128, 10, 0.1
MAX_LEN = train_input.shape[1]   # size of the second axis of the input array

# --- Layer settings (forwarded to CNNClassifier as keyword arguments) ---------
kwargs = dict(
    model_name=model_name,
    vocab_size=prepo_configs["vocab_size"],   # taken from the preprocessing config
    embedding_size=128,
    num_filters=100,
    dropout_rate=0.5,
    hidden_dimension=250,
    output_dimension=1,
)

## Define Model Structure
- Similar to the RNN model definition

In [12]:
class CNNClassifier(tf.keras.Model):
    """Binary text classifier built from parallel 1-D convolutions.

    Expected keyword arguments:
        model_name: name given to the Keras model.
        vocab_size: input dimension of the embedding layer.
        embedding_size: output dimension of the embedding layer.
        num_filters: number of filters in each Conv1D layer.
        dropout_rate: rate of the dropout applied after the embedding.
        hidden_dimension: number of units in the hidden dense layer (swish).
        output_dimension: number of units in the output layer (sigmoid).
    """

    def __init__(self, **kwargs):
        super().__init__(name=kwargs["model_name"])

        # NOTE: layers are created in the same order and under the same
        # attribute names as before; weight files that are matched by layer
        # order or attribute name may depend on this.
        self.embedding = tf.keras.layers.Embedding(
            input_dim=kwargs["vocab_size"],
            output_dim=kwargs["embedding_size"],
        )

        # One convolution per kernel width (3, 4, 5), each with its own
        # max-norm constraint on the kernel.
        conv_branches = []
        for width in (3, 4, 5):
            conv_branches.append(
                tf.keras.layers.Conv1D(
                    filters=kwargs["num_filters"],
                    kernel_size=width,
                    padding="valid",
                    activation=tf.keras.activations.swish,
                    kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.),
                )
            )
        self.conv_list = conv_branches

        self.pooling = tf.keras.layers.GlobalMaxPooling1D()
        self.dropout = tf.keras.layers.Dropout(kwargs["dropout_rate"])

        self.fc1 = tf.keras.layers.Dense(
            units=kwargs["hidden_dimension"],
            activation=tf.keras.activations.swish,
            kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.),
        )
        self.fc2 = tf.keras.layers.Dense(
            units=kwargs["output_dimension"],
            activation=tf.keras.activations.sigmoid,
            kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.),
        )

    def call(self, x):
        """Run the forward pass on a batch of token-id sequences ``x``."""
        embedded = self.dropout(self.embedding(x))
        # Convolve with every kernel width, global-max-pool each result,
        # then join the pooled features along axis 1.
        pooled = [self.pooling(conv(embedded)) for conv in self.conv_list]
        features = tf.concat(pooled, axis=1)
        return self.fc2(self.fc1(features))
    

## Model Compiling

In [13]:
# Instantiate the CNN from the layer hyperparameters
cnn = CNNClassifier(**kwargs)

# Compile the model: Adam optimizer, binary cross-entropy loss, binary accuracy metric
cnn.compile(
    optimizer=tf.keras.optimizers.Adam(0.0001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")],
)

## Model Training
- We use the same EarlyStopping and ModelCheckpoint techniques as we did for the RNN

In [None]:
import os

# EarlyStopping: watch validation accuracy, require at least a 0.0001 gain,
# and tolerate 2 epochs without improvement.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_accuracy", min_delta = 0.0001, patience = 2)


# ModelCheckpoint: weights are written to <SAVE_PATH><model_name>/weight.h5
SAVE_PATH = "Trained_model/"
checkpoint_path = SAVE_PATH + model_name + "/weight.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok = True)
    print("{} -- Folder newly created \n".format(checkpoint_dir))

# save_weights_only must be True here: `cnn` is a subclassed tf.keras.Model,
# and a subclassed model cannot be saved as a full model in HDF5 (.h5) format,
# so a full-model checkpoint would fail the first time it is written.
# Saving weights only also matches how the weights are restored later
# (via load_weights).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, monitor="val_accuracy", verbose=1, save_best_only=True, save_weights_only=True
)


# Train the model, holding out VALID_SPLIT of the data for validation.
history = cnn.fit(train_input, train_label, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                 validation_split=VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

- Same as RNN, training was done in Google Colab.

In [14]:
# A subclassed model has no variables until it has been called once, and HDF5
# weights can only be loaded into a model whose variables exist. One forward
# pass on a single sample is enough to create them; a full evaluation over the
# whole training set is not needed for that.
cnn(train_input[:1])
cnn.load_weights("IMDB_cnn.h5")



## Model Evaluation

In [15]:
# Score the reloaded CNN. Note that this uses the training data itself,
# not a held-out set.
cnn_scores = cnn.evaluate(x=train_input, y=train_label, verbose=2)
loss, acc = cnn_scores
print("Accuracy of recovered model: {:5.2f}%".format(acc * 100))

782/782 - 6s - loss: 0.0773 - accuracy: 0.9869
Accuracy of recovered model: 98.69%


# Conclusion

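- It certainly seems that DL models perform better than ML models on the movie review dataset, but keep in mind that this is not always the case. Note also that the accuracies reported above were measured on the training data, so they overstate how well the models would perform on unseen reviews.
- Roughly speaking, DL models tend to perform better when the dataset is sufficiently large.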