Imports

In [28]:
import tensorflow as tf
import keras
from keras.datasets import imdb
from keras import models
from keras import layers

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

Loading Data (IMDB)

In [2]:
input_num = 5000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = input_num, skip_top = 10, oov_char = 2)

Viewing Data

In [3]:
print("train_data[0]:", train_data[0])
print("shape: ", train_labels.shape)
print ("max: ", max([max(sequence) for sequence in train_data]))

train_data[0]: [2, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 2, 173, 36, 256, 2, 25, 100, 43, 838, 112, 50, 670, 2, 2, 35, 480, 284, 2, 150, 2, 172, 112, 167, 2, 336, 385, 39, 2, 172, 4536, 1111, 17, 546, 38, 13, 447, 2, 192, 50, 16, 2, 147, 2025, 19, 14, 22, 2, 1920, 4613, 469, 2, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 2, 22, 17, 515, 17, 12, 16, 626, 18, 2, 2, 62, 386, 12, 2, 316, 2, 106, 2, 2, 2223, 2, 16, 480, 66, 3785, 33, 2, 130, 12, 16, 38, 619, 2, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 2, 22, 12, 215, 28, 77, 52, 2, 14, 407, 16, 82, 2, 2, 2, 107, 117, 2, 15, 256, 2, 2, 2, 3766, 2, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 2, 2, 2, 1029, 13, 104, 88, 2, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 2, 194, 2, 18, 2, 226, 22, 21, 134, 476, 26, 480, 2, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 2, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 2, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]
shape:  (25000,)
max:  4999


Checking and Reversing Indicies

In [4]:
# mapping words to an integer index
word_index = imdb.get_word_index()

# Reversing index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
reverse_word_index[2]

'and'

Checking Encoding and Decoding

In [5]:
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
"decoded review:", decoded_review

('decoded review:',
 "? this film was just brilliant casting location scenery story direction everyone's really suited ? part they played ? you could just imagine being there robert ? ? an amazing actor ? now ? same being director ? father came from ? same scottish island as myself so i loved ? fact there was ? real connection with this film ? witty remarks throughout ? film were great it was just brilliant so much that i bought ? film as soon as it was released for ? ? would recommend it ? everyone ? watch ? ? fly ? was amazing really cried at ? end it was so sad ? you know what they say if you cry at ? film it must have been good ? this definitely was also ? ? ? two little ? that played ? ? ? norman ? paul they were just brilliant children are often left out ? ? ? list i think because ? stars that play them all grown up are such ? big ? for ? whole film but these children are amazing ? should be ? for what they have done don't you think ? whole story was so lovely because it was true

Reshaping/Engineering Input Data

In [6]:
# Finding whether or not words are within a review and adding that to input data
x_train = np.zeros((len(train_data), input_num))
for i, train_date in enumerate(train_data):
    x_train[i, train_date] = 1

x_test = np.zeros((len(test_data), input_num))
for i, train_date in enumerate(test_data):
    x_train[i, train_date] = 1

x_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

Splitting Training Set

In [7]:
x_val = x_train[:5000]
partial_x_train = x_train[5000:]

y_val = train_labels[:5000]
partial_y_train = train_labels[5000:]
y_test = test_labels

Initalizing Model

In [11]:
model = models.Sequential()

# Two linear layers
model.add(layers.Dense(16, activation = "linear"))
model.add(layers.Dense(8))

# Relu layer, neg inputs -> 0
model.add(layers.Dense(2, activation = "relu"))

# Sigmoid layer, turns data into 0 or 1
model.add(layers.Dense(1, activation = "sigmoid"))

# Optimizer of sgd, binary_crossentropy for binary categorical function
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

Training Model

In [12]:
history = model.fit(partial_x_train,
                    partial_y_train,
                    
                    # More increases accuracy but can lead to overfitting
                    epochs=5,
                    
                    # Less increases runtime and accuracy
                    batch_size=25,
                    verbose=1,
                    validation_data=(x_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Checking Model Performance

In [13]:
history.history

{'loss': [0.6527772545814514,
  0.5783050656318665,
  0.540793776512146,
  0.5161452293395996,
  0.49730855226516724],
 'accuracy': [0.6254000067710876,
  0.7135999798774719,
  0.7358999848365784,
  0.754800021648407,
  0.7671999931335449],
 'val_loss': [0.605255126953125,
  0.5772414803504944,
  0.567742109298706,
  0.5689533948898315,
  0.5708919763565063],
 'val_accuracy': [0.6958000063896179,
  0.7106000185012817,
  0.7121999859809875,
  0.7149999737739563,
  0.7121999859809875]}

In [15]:
results = model.evaluate(partial_x_train, partial_y_train)
print ("train:", results)
results = model.evaluate(x_val, y_val)
print ("validation:", results)
results = model.evaluate(x_test, y_test)
print ("all data", results)

history_dict = history.history
print("history dict.keys():", history_dict.keys())

train: [0.46137142181396484, 0.7935500144958496]
validation: [0.5708920359611511, 0.7121999859809875]
all data [0.7700405716896057, 0.5]
history dict.keys(): dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


Plotting Model Results

In [16]:
loss = history.history['loss']
val_loss = history.history['val_loss']

accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

epochs = [i for i in range(1, len(loss) + 1)]

fig = make_subplots(rows = 2, cols = 1)
fig.add_trace(go.Scatter(x = epochs, y = loss, mode = "markers+lines", name = "Training Loss"), row = 1, col = 1)
fig.add_trace(go.Scatter(x = epochs, y = val_loss, mode = "markers+lines", name = "Validation Loss"), row = 1, col = 1)

fig.update_xaxes(title = "Epochs")
fig.update_yaxes(title = "Loss")

fig.add_trace(go.Scatter(x = epochs, y = accuracy, mode = "markers+lines", name = "Training Accuracy"), row = 2, col = 1)
fig.add_trace(go.Scatter(x = epochs, y = val_accuracy, mode = "markers+lines", name = "Validation Accuracy"), row = 2, col = 1)

fig.update_xaxes(title = "Epochs")
fig['layout']['yaxis2'].update(title_text='Accuracy')

fig.update_layout(title = "Training and Validation Loss", height = 600, width = 1000)
fig.show()

Using model.predict and Numpy to Determine Accuracy

In [18]:
y_pred = model.predict(x_test)
y_pred

sum(abs((np.rint(y_pred).flatten()) - y_test))/len(y_test)



array([[0.3112249],
       [0.3112249],
       [0.3112249],
       ...,
       [0.3112249],
       [0.3112249],
       [0.3112249]], dtype=float32)