# Train a neural network model for sentiment analysis with the IMDB dataset and Keras

## Load properties

In [None]:
import yaml

# Load the project-wide settings and the settings of the "basic" model.
# The encoding is passed explicitly so that reading the YAML files does not
# depend on the platform's default locale encoding.
with open("../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

with open("../config/model/basic.yaml", "r", encoding="utf-8") as f:
    model_config = yaml.safe_load(f)

## Load the dataset

In [None]:
from keras.datasets import imdb
import numpy as np

# Vocabulary setting read from the project configuration and handed to the loader.
num_words = config["num_words"]

# Fetch both predefined splits, then unpack each into samples and labels.
train_split, test_split = imdb.load_data(num_words=num_words)
training_data, training_targets = train_split
testing_data, testing_targets = test_split

# Merge the two splits into one data set; it is split again further below.
data = np.concatenate([training_data, testing_data])
targets = np.concatenate([training_targets, testing_targets])

## Analyze the dataset

In [None]:
# Distinct label values, and distinct word indices across all reviews.
categories = np.unique(targets)
vocabulary = np.unique(np.hstack(data))

print(f"The output categories are {categories}")
print(f"The number of unique words is {len(vocabulary)}")

Visualize the mean and standard deviation of the review lengths

In [None]:
import matplotlib.pyplot as plt

# Number of entries in every review.
length = list(map(len, data))
mean, std = np.mean(length), np.std(length)
print(f"The Average Review length is {mean}")
print(f"The Standard Deviation is {std}")

# Histogram of the review lengths.
plt.hist(length, bins=10, alpha=0.5, color="g")

# Solid blue marker at the mean.
plt.axvline(mean, color="blue", linestyle="solid", linewidth=2)

# Dashed red markers one standard deviation above and below the mean.
for bound in (mean + std, mean - std):
    plt.axvline(x=bound, color="red", linestyle="dashed", linewidth=2)

plt.title("Histogram with Mean and Standard Deviation")
plt.show()

Decode the first review back into words to inspect the raw text (TODO: expand this description)

In [None]:
# The downloaded index maps word -> integer; invert it to look words up by integer.
index = imdb.get_word_index()
reverse_index = {value: key for key, value in index.items()}

# Decode the first review. Every index is shifted by 3 before the lookup, and
# any index without a matching word is rendered as "#".
# NOTE(review): the offset presumably accounts for indices reserved by
# `imdb.load_data` (padding / start / unknown) - confirm against the Keras docs.
decoded = " ".join(reverse_index.get(i - 3, "#") for i in data[0])
print(decoded)

## Prepare data for training

In [None]:
def vectorize(sequences: np.ndarray, dimension=None) -> np.ndarray:
    """Multi-hot encode sequences of integer indices.

    Args:
        sequences: Sized iterable of index sequences. Every index must be
            smaller than ``dimension``.
        dimension: Number of columns of the result. When omitted, the
            current value of the module-level ``num_words`` is used.

    Returns:
        An array of shape ``(len(sequences), dimension)`` in which entry
        ``[i, j]`` is 1 if index ``j`` occurs in ``sequences[i]`` and 0
        otherwise.
    """
    if dimension is None:
        # Resolve the default at call time: a default bound in the signature
        # would be frozen when this cell runs and go stale if `num_words`
        # is changed afterwards.
        dimension = num_words
    results = np.zeros((len(sequences), dimension))
    for row, sequence in enumerate(sequences):
        # Index-array assignment sets all listed columns of the row at once;
        # repeated indices simply set the same cell again.
        results[row, sequence] = 1
    return results


# Replace the raw index sequences by their multi-hot matrix and store the
# labels as float32.
data = vectorize(data)
targets = np.array(targets, dtype="float32")

Split data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Split parameters come from the "training" section of the model configuration.
training_config = model_config["model"]["training"]
test_size = training_config["test_size"]
random_state = training_config["random_state"]

train_x, test_x, train_Y, test_Y = train_test_split(
    data,
    targets,
    test_size=test_size,
    random_state=random_state,
)

## Build and train the neural network

Create sequential model

In [None]:
from keras import models

# Start from an empty Sequential container; the layers are added in the next cell.
model = models.Sequential()

In [None]:
from keras import layers

# Activation used by every Dense layer except the output layer.
activation = model_config["model"]["training"]["activation_function"]

# The whole stack, listed in the order in which it is appended to the model.
network_layers = [
    # Input - Layer
    layers.Dense(units=50, activation=activation, input_shape=(num_words,)),
    # Hidden - Layers
    layers.Dropout(rate=0.3, noise_shape=None, seed=None),
    layers.Dense(units=50, activation=activation),
    layers.Dropout(rate=0.2, noise_shape=None, seed=None),
    layers.Dense(units=50, activation=activation),
    # Output- Layer
    layers.Dense(units=1, activation="sigmoid"),
]
for layer in network_layers:
    model.add(layer)

model.summary()

Compile the model

In [None]:
# All compile settings live in the "compile" section of the model configuration.
compile_config = model_config["model"]["compile"]
optimizer = compile_config["optimizer"]
loss = compile_config["loss_function"]
metrics = compile_config["metrics"]

model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

Train the model

In [None]:
# Training hyper-parameters from the "training" section of the model configuration.
training_config = model_config["model"]["training"]
epochs = training_config["epochs"]
batch_size = training_config["batch_size"]

# NOTE(review): the held-out split doubles as validation data here and is
# also the data the model is evaluated on afterwards.
results = model.fit(
    x=train_x,
    y=train_Y,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(test_x, test_Y),
)

## Evaluate the model

In [None]:
# Score the trained model on the held-out split without progress output.
scores = model.evaluate(test_x, test_Y, verbose=0)

# NOTE(review): index 1 is presumably the first configured metric (index 0
# being the loss) - confirm that this metric is accuracy in the model config.
accuracy_percent = scores[1] * 100
print(f"Accuracy: {accuracy_percent:.2f}%")

## Persist the model

In [None]:
import os

# Target location: <parent directory>/<configured model path>/<model name>.
model_directory = config["model_path"]
model_name = model_config["model"]["name"]
filepath = os.path.join("..", model_directory, model_name)

# Replace any model that was saved to the same location before.
model.save(filepath, overwrite=True)