# Text Classification from Scratch
Reference https://keras.io/examples/nlp/text_classification_from_scratch/

In [None]:
import os
import re
import string
import typing

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import tensorflow.security.fuzzing.py.annotation_types as _atypes
import numpy as np

## Fetch Data

Data will come from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In the shell:
```
$ mkdir /tmp/text
$ pushd /tmp/text
$ wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
$ tar -zxf aclImdb_v1.tar.gz
$ ls aclImdb
README  imdb.vocab  imdbEr.txt  test  train
$ ls aclImdb/train
labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
$ ls aclImdb/test
labeledBow.feat  neg  pos  urls_neg.txt  urls_pos.txt
```

Text files are in folders for positive & negative reviews.
* positive `aclImdb/train/pos`
* negative `aclImdb/train/neg`

Use [`keras.utils.text_dataset_from_directory`](https://keras.io/api/data_loading/text/) to generate a `tf.data.Dataset` object from texts in classified folders.  Only supports `.txt` files.

In [None]:
DATAROOT = "/tmp/text/aclImdb"  # the data extraction dir of the above shell commands.
BATCH_SIZE = 32

# The training and validation subsets are both carved out of the same "train"
# directory. The two calls must therefore agree on the split fraction and the
# seed; naming the values once keeps them from drifting apart.
VALIDATION_SPLIT = 0.2
SPLIT_SEED = 1337
TRAIN_DIR = os.path.join(DATAROOT, "train")
TEST_DIR = os.path.join(DATAROOT, "test")

raw_train_ds = keras.utils.text_dataset_from_directory(
    TRAIN_DIR,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    subset="training",
    seed=SPLIT_SEED,
)
raw_val_ds = keras.utils.text_dataset_from_directory(
    TRAIN_DIR,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    subset="validation",
    seed=SPLIT_SEED,
)
# The held-out test set lives in its own directory and is not split.
raw_test_ds = keras.utils.text_dataset_from_directory(
    TEST_DIR,
    batch_size=BATCH_SIZE,
)

In [None]:
# Report how many batches each raw dataset contains.
for dataset_name, dataset in (
    ("raw_train_ds", raw_train_ds),
    ("raw_val_ds", raw_val_ds),
    ("raw_test_ds", raw_test_ds),
):
    print(f"Number of batches in {dataset_name}: {dataset.cardinality()}")

Data sample preview; the [`tf.data.Dataset.take(count)`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#take) method creates a `Dataset` with at most `count` elements from this dataset.

When iterated, the `raw_train_ds` and `raw_val_ds` datasets yield batches of (text, label) pairs.

In [None]:
# Look at a few raw training examples before designing the normalization and
# tokenization steps, to check they will behave as expected on real data.
# With eager execution we can read concrete values straight out of the tensors
# with .numpy(), without a Session/Graph context.
PREVIEW_COUNT = 5
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor once instead of on every printed example.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    # Slicing stops at the end of the batch, so a batch holding fewer than
    # PREVIEW_COUNT examples is printed in full instead of raising IndexError.
    for text, label in zip(texts[:PREVIEW_COUNT], labels[:PREVIEW_COUNT]):
        print(text)
        print(label)

## Data Preparation

Lowercase the text, replace `<br />` tags with a space, and strip punctuation.

In [None]:
def custom_standardization(input_data: typing.Annotated[typing.Any, _atypes.String]) -> tf.Tensor:
    """Normalise raw review text before it is tokenized.

    The raw reviews contain HTML break tags of the form '<br />', which the
    default standardizer leaves in place because it does not strip HTML. This
    function is used instead and applies, in order:

    1. lowercasing,
    2. replacement of every '<br />' with a single space,
    3. removal of every character listed in `string.punctuation`.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor holding the cleaned text.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    # https://www.tensorflow.org/api_docs/python/tf/strings/lower
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_class, "")
    return text

In [None]:
# Model constants
# Vocabulary cap: passed as `max_tokens` to the TextVectorization layer and as
# the input size of the Embedding layer.
MAX_FEATURES = 20000
# Embedding output width; also reused as the Conv1D filter count and the size
# of the hidden Dense layer.
EMBEDDING_DIM = 128
# Passed as `output_sequence_length` to the TextVectorization layer.
SEQUENCE_LENGTH = 500

In [None]:
# Build the text vectorization layer, which normalizes, splits, and maps
# strings to integers:
#   * standardize  -> the custom standardization function defined above
#   * split        -> left at the layer's default
#   * output_mode  -> "int", so every token becomes an integer index
#   * output_sequence_length -> fixed explicitly, because the CNN layers used
#     later in the model do not support ragged sequences
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=MAX_FEATURES,
    standardize=custom_standardization,
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode="int",
)

In [None]:
# Build the vocabulary by calling `adapt` on a text-only view of the training
# data. Batching is not required here, but for very large datasets it avoids
# keeping spare copies of the dataset in memory.

# Drop the labels so that only the review text remains:
text_ds = raw_train_ds.map(lambda text, _label: text)
# Learn the vocabulary from that text:
vectorize_layer.adapt(text_ds)

## Two Ways to Vectorise Data

1. **Part of the model** - the model processes raw strings.
   ```
   text_input = keras.Input(shape=(1,), dtype=tf.string, name='text')
   x = vectorize_layer(text_input)
   x = keras.layers.Embedding(MAX_FEATURES, EMBEDDING_DIM)(x)
   ```

2. **Apply Vectoriser to the Dataset** -- generates dataset of word indices, and model takes integer sequences.
   Enables asynchronous CPU processing and buffering when training on a GPU.
   ```
   def vectorize_text(text, label):
       text = tf.expand_dims(text, -1)
       return vectorize_layer(text), label

   # Vectorize the data.
   train_ds = raw_train_ds.map(vectorize_text)
   val_ds = raw_val_ds.map(vectorize_text)
   test_ds = raw_test_ds.map(vectorize_text)

   # Do async prefetching / buffering of the data for best performance on GPU.
   train_ds = train_ds.cache().prefetch(buffer_size=10)
   val_ds = val_ds.cache().prefetch(buffer_size=10)
   test_ds = test_ds.cache().prefetch(buffer_size=10)
   ```

In [None]:
def vectorize_text(text, label):
    """Turn a (text, label) element into (token indices, label).

    Args:
        text: Raw string tensor taken from one of the raw datasets.
        label: Label tensor belonging to `text`; passed through untouched.

    Returns:
        A tuple of the `vectorize_layer` output for `text` and the original `label`.
    """
    # Append a trailing axis before handing the strings to the vectorizer.
    text_column = tf.expand_dims(text, -1)
    token_indices = vectorize_layer(text_column)
    return token_indices, label

# Vectorize every raw split, then cache the result and prefetch up to 10
# elements so that input processing is buffered asynchronously for best
# performance on GPU.
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

## Model Definition
Using an `Embedding` layer followed by two 1-D convolution layers.

In [None]:
# An integer input holding vocab indices; the sequence length is left open.
inputs = keras.Input(shape=(None,), dtype="int64")

# Map each vocab index into a space of dimensionality EMBEDDING_DIM, then
# apply dropout.
embedded = keras.layers.Embedding(MAX_FEATURES, EMBEDDING_DIM)(inputs)
embedded = keras.layers.Dropout(0.5)(embedded)

# Two identically configured Conv1D layers followed by global max pooling.
conv_options = dict(padding="valid", activation="relu", strides=3)
conv_features = keras.layers.Conv1D(EMBEDDING_DIM, 7, **conv_options)(embedded)
conv_features = keras.layers.Conv1D(EMBEDDING_DIM, 7, **conv_options)(conv_features)
pooled = keras.layers.GlobalMaxPooling1D()(conv_features)

# A vanilla hidden layer with dropout:
hidden = keras.layers.Dense(EMBEDDING_DIM, activation="relu")(pooled)
hidden = keras.layers.Dropout(0.5)(hidden)

# Project onto a single output unit and squash it with a sigmoid:
predictions = keras.layers.Dense(1, activation="sigmoid", name="predictions")(hidden)

model = keras.Model(inputs, predictions)

# Compile with the adam optimizer and binary crossentropy loss, tracking accuracy.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

## Model Training

In [None]:
# Number of passes over the training data.
EPOCHS = 3
# Train on the vectorized training split, validating against the validation split.
model.fit(x=train_ds, epochs=EPOCHS, validation_data=val_ds)

## Model Evaluation on Test Dataset

In [None]:
# Evaluate the trained model on the vectorized test split.
model.evaluate(x=test_ds)

## Creating an End-to-End Model
A model to process raw strings.

In [None]:
# The end-to-end model accepts raw strings...
inputs = keras.Input(shape=(1,), dtype="string")
# ...turns them into vocab indices with the adapted vectorizer, and feeds those
# indices to the trained model to obtain predictions.
outputs = model(vectorize_layer(inputs))

# Wrap both stages into a single model and compile it like the inner model.
end_to_end_model = keras.Model(inputs, outputs)
end_to_end_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Evaluate on `raw_test_ds`, which yields raw strings.
end_to_end_model.evaluate(raw_test_ds)