# Test functions

In [None]:
import sys

# Make the modules under ../pyimagesearch/ importable by bare name from this
# notebook (the path is relative to the notebook's working directory).
sys.path.append('../pyimagesearch/')

%load_ext autoreload
%autoreload 2

## Load dataset

In [None]:
from dataset import get_imdb_dataset
import tensorflow as tf
import config

In [None]:
# Load the IMDB data from ../data/ and unpack the two returned datasets as
# (training, validation). Batch and buffer sizes come from the project config.
# NOTE(review): `bufferSize` is presumably the shuffle-buffer size, and
# `test=False` presumably selects the train/validation pair rather than a
# test split -- confirm against `get_imdb_dataset` in dataset.py.
trainDs, valDs = get_imdb_dataset(
    folderName='../data/',
    batchSize=config.BATCH_SIZE,
    bufferSize=config.BUFFER_SIZE,
    autotune=tf.data.AUTOTUNE,
    test=False
)

In [None]:
# Echo both dataset objects (the notebook displays the cell's last expression).
trainDs, valDs

Retrieve the next batch of samples. Each batch is a tuple with
- `batch[0]`: tensor of samples.
- `batch[1]`: tensor of target values.

Both tensors have shape `(batch_size,)`, where `batch_size` is the value passed as `batchSize` to the `get_imdb_dataset` function.

In [None]:
# Take the first element yielded by a fresh iterator over the training
# dataset; it is indexed below as batch[0] (samples) and batch[1] (targets).
batch = next(iter(trainDs))

In [None]:
# Show the first sample of the batch, converted from a tensor to its NumPy
# value so the raw content is visible.
batch[0][0].numpy()

## Test standardization function

In [None]:
from standardization import custom_standardization

In [None]:
# Run the project's standardization function over every sample in the batch.
standardized_text_batch = custom_standardization(batch[0])

# Echo the first standardized sample for comparison with the raw one.
standardized_text_batch[0]

## Vectorization

The `TextVectorization` layer maps each sequence of words to a sequence of integers: every word in the vocabulary corresponds to exactly one integer index, while words outside the vocabulary all share a single out-of-vocabulary index.

In [None]:
from tensorflow.keras import layers

In [None]:
# Text-to-integer vectorization layer, configured from the project config and
# using the same custom standardization tested above.
# NOTE(review): `VOCAB_ZISE` looks like a misspelling of `VOCAB_SIZE` --
# confirm the attribute name in config.py before renaming anything.
# NOTE(review): the Keras docs describe `pad_to_max_tokens` as valid only for
# the "multi_hot", "count" and "tf_idf" output modes -- confirm whether it has
# any effect with `output_mode='int'`.
vectorizeLayer = layers.TextVectorization(
    max_tokens=config.VOCAB_ZISE,
    # Emit one integer index per token.
    output_mode='int',
    # Fix the length of every output sequence to the configured value.
    output_sequence_length=config.MAX_SEQUENCE_LENGTH,
    standardize=custom_standardization,
    pad_to_max_tokens=True
)

In [None]:
# Fit the layer's vocabulary on the training data only. The map drops the
# labels so that `adapt` receives just the text of each (text, label) pair.
vectorizeLayer.adapt(trainDs.map(lambda text, label: text))

In [None]:
# Apply the adapted layer to the raw samples of the batch fetched earlier.
vectorized_batch = vectorizeLayer(batch[0])

# Echo the resulting tensor of integer token indices.
vectorized_batch

## Test Keras' `SimpleRNN` layer

In [None]:
from tensorflow.keras.layers import Embedding, SimpleRNN

In [None]:
# Embedding layer producing a 128-dimensional vector for each integer token
# index of the vectorized batch.
# NOTE(review): `VOCAB_ZISE` looks like a misspelling of `VOCAB_SIZE` --
# confirm the attribute name in config.py before renaming anything.
embedding_layer = Embedding(
    input_dim=config.VOCAB_ZISE,
    output_dim=128,
    # No mask is generated for index 0, so later layers also process any
    # positions holding that index (presumably padding -- TODO confirm).
    mask_zero=False
)

# Shape: (batch_size, max_seq_len, embedding_dim).
batch_embedding = embedding_layer(vectorized_batch)

# Echo the embedded batch.
batch_embedding

In [None]:
# Simple recurrent layer with 32 units, configured to return its output at
# every position of the sequence rather than only the last one.
rnn_layer = SimpleRNN(
    units=32,
    return_sequences=True
)

# Output shape:
#   - If `return_sequences=False`: (batch_size, units) (i.e. one
#     output vector per sample).
#   - If `return_sequences=True`: (batch_size, max_seq_len, units)
#     (i.e. one full sequence with an element per token for each
#     sample in the batch).
# Run the layer on the embedded batch and echo the result.
rnn_layer(batch_embedding)