In [None]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Source archive for the IMDB review dataset.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the archive into the current working directory (cache_dir='.',
# no cache sub-directory) and unpack it.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The 'aclImdb' folder is looked up next to the path returned by get_file.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
# Remove the 'unsup' folder from the training directory so that only the
# remaining sub-folders are picked up when the dataset is loaded from
# '/content/aclImdb/train'.
#
# shutil.rmtree raises FileNotFoundError when the directory is missing, which
# made this cell fail whenever it was executed a second time. Guarding on the
# directory's existence keeps the cell safe to re-run.
# NOTE(review): the absolute '/content/...' path only matches the download
# location (cache_dir='.') when the working directory is /content — confirm.
unsup_dir = '/content/aclImdb/train/unsup'
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

In [None]:
# Input-pipeline settings shared by every split loaded in this notebook.
batch_size = 32
seed = 218

# Options common to the training and validation subsets. Both calls receive
# the same validation_split and seed; only `subset` differs.
split_options = dict(
    label_mode="binary",
    batch_size=batch_size,
    validation_split=0.2,
    seed=seed,
)

# 80% of the files under the training directory.
train_dataset = preprocessing.text_dataset_from_directory(
    '/content/aclImdb/train',
    subset='training',
    **split_options
)

# The remaining 20% of the same directory.
val_dataset = preprocessing.text_dataset_from_directory(
    '/content/aclImdb/train',
    subset='validation',
    **split_options
)



Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
# Disabled alternative: a custom standardization callable (lower-case, replace
# '<br />' with a space, strip punctuation). It is not passed to the layer
# below, so the layer's default standardization is what actually runs.
#
# def custom_standardization(input_data):
#   lowercase = tf.strings.lower(input_data)
#   stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
#   return tf.strings.regex_replace(stripped_html,
#                                   '[%s]' % re.escape(string.punctuation),
#                                   '')

# Text-to-sequence layer, configured with max_tokens=10000.
vectorize_layer = TextVectorization(max_tokens=10000)

# adapt() is fed the review text only, so the labels are dropped first.
train_text = train_dataset.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
  """Apply the module-level ``vectorize_layer`` to ``text``.

  Args:
    text: Input handed to ``vectorize_layer`` unchanged.
    label: Value passed through untouched.

  Returns:
    A ``(vectorize_layer(text), label)`` tuple.
  """
  vectorized = vectorize_layer(text)
  return vectorized, label

In [None]:
# Swap the raw-text splits for versions mapped through vectorize_text.
# NOTE: the names are rebound in place, so executing this cell again would
# map vectorize_text over datasets that were already mapped.
train_dataset, val_dataset = (
    split.map(vectorize_text) for split in (train_dataset, val_dataset)
)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE


def _cache_and_prefetch(split):
  """Return `split` with cache() followed by prefetch(buffer_size=AUTOTUNE)."""
  cached = split.cache()
  return cached.prefetch(buffer_size=AUTOTUNE)


# Apply the same cache + prefetch treatment to both splits.
train_dataset = _cache_and_prefetch(train_dataset)
val_dataset = _cache_and_prefetch(val_dataset)

In [None]:
# Size of the embedding table: the adapted vocabulary length plus one.
vocab_len = len(vectorize_layer.get_vocabulary()) + 1

# Embedding -> causal Conv1D -> bidirectional GRU -> dense head ending in a
# single sigmoid unit. Layers are added in the order the data flows.
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_len, 16))
model.add(
    layers.Conv1D(
        filters=64,
        kernel_size=5,
        strides=1,
        padding="causal",
        activation="relu",
    )
)
model.add(layers.Bidirectional(layers.GRU(64)))
model.add(layers.Dense(30, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

# Disabled alternative architecture (kept for reference, not executed):
#
# model = tf.keras.Sequential([
#   layers.Embedding(vocab_len, 16, mask_zero=True),
#   layers.Bidirectional(layers.LSTM(64)),
#   layers.Dropout(0.2),
#   layers.Dense(1, activation="sigmoid")])

model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train on the training split, evaluating on the validation split each epoch.
epochs = 10
history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=val_dataset,
)

In [16]:
# Load the test split as raw text, using the same batch size, seed and label
# mode as the training/validation splits. No vectorization is applied here.
test_dataset = preprocessing.text_dataset_from_directory(
    directory='/content/aclImdb/test',
    seed=seed,
    batch_size=batch_size,
    label_mode="binary",
)

Found 25000 files belonging to 2 classes.


In [20]:
# Chain the adapted vectorization layer and the trained model into a single
# Sequential model, so text can be passed in directly.
pipeline_stages = [vectorize_layer, model]
prod_model = tf.keras.Sequential(pipeline_stages)

# Compiled with the same loss, optimizer and metric as the inner model.
prod_model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [21]:
# Evaluate the text-in model on the raw-text test split; the cell output is
# the list returned by evaluate().
prod_model.evaluate(x=test_dataset)



[0.7234600782394409, 0.8509200215339661]

In [24]:
# Smoke test: run the text-in model on a single hand-written review.
sample_reviews = ["i love this movie"]
prod_model.predict(sample_reviews)

array([[0.9807505]], dtype=float32)