In [47]:
import tensorflow as tf
import numpy as np

import os

# Text Preprocessing
import re
import string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers

In [70]:
# Create the training, validation, and test datasets from the aclImdb folders.
batch_size = 32
validation_split = 0.2  # fraction of the 'train' folder held out for validation
split_seed = 1337       # shared by both subsets so the two splits are drawn consistently

# The dataset root can be overridden with the ACLIMDB_DIR environment variable;
# the default is the location this notebook was originally run with.
main_path = os.environ.get('ACLIMDB_DIR', 'D:/keras_datasets/aclImdb')
train_path = os.path.join(main_path, 'train')
test_path = os.path.join(main_path, 'test')

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_path,
    batch_size = batch_size,
    validation_split = validation_split,
    subset = 'training',
    seed = split_seed   # always set a seed for reproducibility
)

raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_path,
    batch_size = batch_size,
    validation_split = validation_split,
    subset = 'validation',
    seed = split_seed   # same seed as the training subset
)

# The test folder is used in full; no split is applied.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_path, batch_size = batch_size
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


### Preprocessing the data

In [72]:
def custom_standardization(input_data):
    """Normalize raw review text using TensorFlow string ops.

    The input is lowercased, every literal "<br />" is replaced by a single
    space, and every character listed in ``string.punctuation`` is removed.

    Args:
        input_data: string tensor to normalize.

    Returns:
        The normalized string tensor.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_pattern, "")
    return text

# Hyper-parameters shared by the vectorization layer and the model.
max_features = 20000     # passed to the layer as max_tokens
embedding_dim = 128      # not used in this cell; consumed by the Embedding layer
sequence_length = 500    # passed to the layer as output_sequence_length

vectorization = TextVectorization(
    max_tokens = max_features,
    standardize = custom_standardization,
    output_mode = 'int',
    output_sequence_length = sequence_length
)

# Keep only the text component of each (text, label) pair, then let the
# layer learn its vocabulary from the training text.
text_ds = raw_train_ds.map(lambda text, _label : text)
vectorization.adapt(text_ds)

# Before the vectorization layer can be used, adapt() must be called on the
# text data, for example: vectorization.adapt(text_ds)

In [73]:
def vectorize_text(text, label):
    """Run the adapted ``vectorization`` layer on ``text`` and pass ``label`` through.

    Args:
        text: string tensor of raw text.
        label: label tensor, returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    # Append a trailing axis before handing the text to the layer.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorization(text_column)
    return token_ids, label

# Vectorize every split, then cache the vectorized elements and prefetch
# with a buffer of 10 elements.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

## Build the model

In [78]:
# Input: variable-length sequences of int64 token ids.
text_input = tf.keras.Input(dtype = 'int64', shape = (None,))

# Embed each token id into an embedding_dim-sized vector, then apply dropout.
x = layers.Embedding(input_dim = max_features, output_dim = embedding_dim)(text_input)
x = layers.Dropout(rate = 0.5)(x)

# Two strided 1-D convolutions followed by a global max pool.
x = layers.Conv1D(filters = 128, kernel_size = 7, strides = 3, padding = 'valid', activation = 'relu')(x)
x = layers.Conv1D(filters = 128, kernel_size = 7, strides = 3, padding = 'valid', activation = 'relu')(x)
x = layers.GlobalMaxPooling1D()(x)

# Dense hidden layer with dropout, then a single sigmoid output unit.
x = layers.Dense(units = 128, activation = 'relu')(x)
x = layers.Dropout(rate = 0.5)(x)
predictions = layers.Dense(units = 1, activation = 'sigmoid', name = 'predictions')(x)

model = tf.keras.Model(inputs = text_input, outputs = predictions)

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [77]:
# Reset the global state kept by the Keras backend.
# NOTE(review): this cell carries execution count In [77], earlier than the
# model-building cell (In [78]), yet it sits below that cell in the notebook.
# On a top-to-bottom run it therefore executes after `model` has been created.
# It probably belongs above the model definition — TODO confirm intended order.
tf.keras.backend.clear_session()

In [79]:
# Number of full passes over the training dataset.
epochs = 3

# Train on train_ds and evaluate on val_ds after every epoch; the returned
# History object is left as the cell's last expression so it is displayed.
model.fit(
    train_ds,
    epochs = epochs,
    validation_data = val_ds,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1f6f60a97c0>