In [0]:
# Models
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import GlobalMaxPool1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Dataset
import tensorflow_datasets as tfds

# Others
from statistics import mean
import json
import glob
import random
import numpy as np
import matplotlib.pyplot as plt
from timeit import default_timer as timer

In [0]:
vocab_size = 20000
train_size = 14000
valid_size = 2000
test_size = 12500
oov_token = "<OOV>"
trunc_type = "post"
padding = "pre"
max_len = 512
embedding_dim = 50

batch_size = 32
epochs = 50
patience = 10
callbacks = [
    EarlyStopping(
        monitor="val_accuracy",
        patience=patience,
        restore_best_weights=False,
    )
]

In [0]:
imdb = tfds.load("imdb_reviews", as_supervised=True, shuffle_files=True)
train_dataset = imdb["train"]
test_dataset = imdb["test"]

train_text = []
train_labels = []
test_text = []
test_labels = []

# Put sentences and labels in lists
for s, l in train_dataset:
    train_text.append(str(s.numpy()))
    train_labels.append(l.numpy())

for s, l in test_dataset:
    test_text.append(str(s.numpy()))
    test_labels.append(l.numpy())


# Convert them into numpy arrays
train_text, train_labels, test_text, test_labels = (
    np.array(train_text),
    np.array(train_labels),
    np.array(test_text),
    np.array(test_labels),
)

# Shuffle the train/test set
train_rand = np.arange(len(train_text))
np.random.seed(42)
np.random.shuffle(train_rand)
train_text = train_text[train_rand]
train_labels = train_labels[train_rand]

test_rand = np.arange(len(test_text))
np.random.seed(42)
np.random.shuffle(test_rand)
test_text = test_text[test_rand]
test_labels = test_labels[test_rand]


# Take the subset of the data
train_reviews, valid_reviews = (
    train_text[: train_size],
    train_text[-valid_size :],
)

train_sentiments, valid_sentiments = (
    np.array(train_labels[: train_size]),
    np.array(train_labels[-valid_size :]),
)

test_reviews = test_text[: test_size]
test_sentiments = np.array(test_labels[: test_size])

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_reviews)

train_seq = tokenizer.texts_to_sequences(train_reviews)
train_padded = pad_sequences(
    train_seq,
    maxlen=max_len,
    truncating=trunc_type,
    padding=padding,
)
valid_seq = tokenizer.texts_to_sequences(valid_reviews)
valid_padded = pad_sequences(
    valid_seq,
    maxlen=max_len,
    truncating=trunc_type,
    padding=padding,
)
test_seq = tokenizer.texts_to_sequences(test_reviews)
test_padded = pad_sequences(
    test_seq,
    maxlen=max_len,
    truncating=trunc_type,
    padding=padding,
)

In [4]:
train_reviews[:5]

array(['b\'I watched "Elephant Walk" for the first time in about 30 years and was struck by how similar the story line is to the greatly superior "Rebecca." As others have said, you have the sweet young thing swept off her feet by the alternately charming and brooding lord of the manor, only to find her marriage threatened by the inescapable memory of a larger-than-life yet deeply flawed relative. You have the stern and disapproving servant, a crisis that will either bind the couple together or tear them irreparably apart, climaxed by the fiery destruction of the lavish homestead.<br /><br />Meanwhile, "Elephant Walk" also owes some of its creepy jungle atmosphere to "The Letter," the Bette Davis love triangle set on a Singapore rubber plantation rather than a Sri Lankan tea plantation.<br /><br />Maltin gives "Elephant Walk" just two stars, and IMDb readers aren\\\'t much kinder, but I enjoyed it despite its predictability. Elizabeth Taylor never looked lovelier, and Peter Finch does 

In [0]:
def build_model(lstm_hidden_size, dense_hidden_size, dropout_rate):
    model = Sequential()
    model.add(
        Embedding(vocab_size, embedding_dim, input_length=max_len)
    )
    model.add(Bidirectional(LSTM(lstm_hidden_size, return_sequences=True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(dense_hidden_size, activation="relu"))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=1e-3),
        metrics=["accuracy"],
    )
    return model

In [6]:
model = build_model(16, 64, 0.4)
model.summary()
history = model.fit(
    train_padded,
    train_sentiments,
    validation_data=(valid_padded, valid_sentiments),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks,
    verbose=1
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 50)           1000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 512, 32)           8576      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 64)                2112      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,010,753
Trainable params: 1,010,753
Non-trainable params: 0
______________________________________________

In [7]:
hist = {"train_acc_converge": history.history["accuracy"][-11],
        "train_acc_final": history.history["accuracy"][-1],
        "valid_acc_converge": history.history["val_accuracy"][-11],
        "valid_acc_final": history.history["val_accuracy"][-1],
        "max_valid_acc": max(history.history["val_accuracy"]),}

print(json.dumps(hist, indent=4))

{
    "train_acc_converge": 0.9705714583396912,
    "train_acc_final": 0.9993571639060974,
    "valid_acc_converge": 0.8840000033378601,
    "valid_acc_final": 0.8694999814033508,
    "max_valid_acc": 0.8840000033378601
}


In [0]:
oracle = model

In [9]:
train_reviews[:5]

array(['b\'I watched "Elephant Walk" for the first time in about 30 years and was struck by how similar the story line is to the greatly superior "Rebecca." As others have said, you have the sweet young thing swept off her feet by the alternately charming and brooding lord of the manor, only to find her marriage threatened by the inescapable memory of a larger-than-life yet deeply flawed relative. You have the stern and disapproving servant, a crisis that will either bind the couple together or tear them irreparably apart, climaxed by the fiery destruction of the lavish homestead.<br /><br />Meanwhile, "Elephant Walk" also owes some of its creepy jungle atmosphere to "The Letter," the Bette Davis love triangle set on a Singapore rubber plantation rather than a Sri Lankan tea plantation.<br /><br />Maltin gives "Elephant Walk" just two stars, and IMDb readers aren\\\'t much kinder, but I enjoyed it despite its predictability. Elizabeth Taylor never looked lovelier, and Peter Finch does 

In [0]:
import pandas as pd

train_sentences = list(train_reviews)
oracle_train_labels = [i[0] for i in oracle.predict(train_padded)]
oracle_train_labels = [1 if i >= 0.5 else 0 for i in oracle_train_labels]

df = pd.DataFrame({"reviews": train_sentences, "oracle_labels": oracle_train_labels})
df.to_csv("oracle.csv", index=False)

In [12]:
df = pd.read_csv("oracle.csv")
df.head()

Unnamed: 0,reviews,oracle_labels
0,"b'I watched ""Elephant Walk"" for the first time...",1
1,"b""I would put this at the top of my list of fi...",0
2,"b'Police, investigations, murder, suspicion: w...",1
3,b'I read Schneebaum\'s book (same title as thi...,1
4,"b""Well, you'd better if you plan on sitting th...",0


In [0]:
end = timer()
print((end - start) / 3600)