(word-encodings=)
# Encoding Text

## Download Movie Reviews Dataset


In [None]:
#!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 
#!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  5506k      0  0:00:14  0:00:14 --:--:-- 6935k02  0:00:51 1523k


In [2]:
!rm -r aclImdb/train/unsup

In [3]:
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb") 
val_dir = base_dir / "val" 
train_dir = base_dir / "train" 

for category in ("neg", "pos"):
    os.makedirs(val_dir / category) 
    files = os.listdir(train_dir / category) 
    random.Random(1337).shuffle(files) 
    num_val_samples = int(0.2 * len(files))

val_files = files[-num_val_samples:]
for fname in val_files:
    shutil.move(train_dir / category / fname, val_dir / category / fname)

In [36]:
from tensorflow import keras 

batch_size = 32

train_ds = keras.utils.text_dataset_from_directory( "aclImdb/train", batch_size=batch_size ) 

val_ds = keras.utils.text_dataset_from_directory("aclImdb/val", batch_size=batch_size ) 

test_ds = keras.utils.text_dataset_from_directory("aclImdb/test", batch_size=batch_size )

Found 22500 files belonging to 2 classes.
Found 2500 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [25]:
for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'While "Santa Claus Conquers the Martians" is usually cited as one of the worse films ever made, this Mexican-made film from 1959 is so bad it makes "SCCM" look like "It\'s a Wonderful Life." You have to wonder what the people who made this film were thinking; perhaps they meant it as a third-world allegory about capitalist greed and conspicuous consumption. Nah . . . They just weren\'t very good. The same production company made an even more disturbing version of "Little Red Riding Hood" in which the wolf\'s obsession with our heroine has unmistakable hints of pedophilia. (Perhaps this was the inspiration for "Freeway.") Back to "Santa Claus": instead of the North Pole, Jolly Old Saint Nicholas resides in a satellite in geosynchronous earth orbit (shades of "MST3K"); instead of elves his toys are made by children chosen from around the world; and he had sophis

## Encoding Text - Bag of Words

In [26]:
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization( 
    max_tokens=20000,
    output_mode="multi_hot",
) 
text_only_train_ds = train_ds.map(lambda x, y: x) 
text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds = train_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4) 
binary_1gram_val_ds = val_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4) 
binary_1gram_test_ds = test_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [27]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


In [28]:
from tensorflow import keras 
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"])
    
    return model

In [29]:
model = get_model() 
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint(
        "binary_1gram.weights.h5",   # any filename; .h5 is conventional
        save_best_only=True,
        save_weights_only=True,      # <—  key line
        verbose=1
    )
]

# --------------------------------------------------
# 3.  Fit
# --------------------------------------------------
model.fit(
    binary_1gram_train_ds.cache(),
    validation_data=binary_1gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks
)

# --------------------------------------------------
# 4.  Reload the best weights into an *identical* model
# --------------------------------------------------
best_model = keras.models.clone_model(model)      # architecture only
best_model.compile(                              # ← compile it
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
best_model.load_weights("binary_1gram.weights.h5")  # load the saved weights

# Now it's ready for evaluation
test_loss, test_acc = best_model.evaluate(binary_1gram_test_ds)
print(f"Test acc: {test_acc:.3f}")


Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_15 (Dense)            (None, 16)                320016    
                                                                 
 dropout_8 (Dropout)         (None, 16)                0         
                                                                 
 dense_16 (Dense)            (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Epoch 1/10
Epoch 1: val_loss improved from inf to 0.34251, saving model to binary_1gram.weights.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.34251 to 0.31518, saving model to binary_1gram.weights.h5
Epoch 3/10
Epoch 3: val_loss did not improve from 0.31518
Epoch 4/10
Epoch 4: val_loss did not improve from 0.31518
Epoch 5/10
Epoch 5: val_loss did not improve from 0.31518
Epoch 6/10
Epoch 6: val_loss did not improve from 0.31518
Epoch 7/10
Epoch 7: val_loss did not improve from 0.31518
Epoch 8/10
Epoch 8: val_loss did not improve from 0.31518
Epoch 9/10
Epoch 9: val_loss did not improve from 0.31518
Epoch 10/10
Epoch 10: val_loss did not improve from 0.31518
Test acc: 0.885


## Bigram Encoding

In [30]:
text_vectorization = TextVectorization( 
    ngrams=2, 
    max_tokens=20000,
    output_mode="multi_hot",
)

In [31]:
text_vectorization.adapt(text_only_train_ds) 
binary_2gram_train_ds = train_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4) 
binary_2gram_val_ds = val_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4) 
binary_2gram_test_ds = test_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

model = get_model() 
model.summary() 

callbacks = [
    keras.callbacks.ModelCheckpoint(
        "binary_2gram.weights.h5",   # any filename; .h5 is conventional
        save_best_only=True,
        save_weights_only=True,      # <—  key line
        verbose=1
    )
]

# --------------------------------------------------
# 3.  Fit
# --------------------------------------------------
model.fit(
    binary_2gram_train_ds.cache(),
    validation_data=binary_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks
)

# --------------------------------------------------
# 4.  Reload the best weights into an *identical* model
# --------------------------------------------------
best_model = keras.models.clone_model(model)      # architecture only
best_model.compile(                              # ← compile it
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
best_model.load_weights("binary_2gram.weights.h5")  # load the saved weights

# Now it's ready for evaluation
test_loss, test_acc = best_model.evaluate(binary_2gram_test_ds)
print(f"Test acc: {test_acc:.3f}")


Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 20000)]           0         
                                                                 
 dense_17 (Dense)            (None, 16)                320016    
                                                                 
 dropout_9 (Dropout)         (None, 16)                0         
                                                                 
 dense_18 (Dense)            (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 1: val_loss improved from inf to 0.27960, saving model to binary_2gram.weights.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.27960 to 0.

In [None]:
text_vectorization = TextVectorization( 
    ngrams=2,
      max_tokens=20000,
      output_mode="tf_idf",
)


In [33]:
text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4) 
tfidf_2gram_val_ds = val_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4) 
tfidf_2gram_test_ds = test_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)


model = get_model() 
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint(
        "tfidf_2gram.weights.h5",   # any filename; .h5 is conventional
        save_best_only=True,
        save_weights_only=True,      # <—  key line
        verbose=1
    )
]

# --------------------------------------------------
# 3.  Fit
# --------------------------------------------------
model.fit(
    tfidf_2gram_train_ds.cache(),
    validation_data=tfidf_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks
)

# --------------------------------------------------
# 4.  Reload the best weights into an *identical* model
# --------------------------------------------------
best_model = keras.models.clone_model(model)      # architecture only
best_model.compile(                              # ← compile it
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
best_model.load_weights("tfidf_2gram.weights.h5")  # load the saved weights

# Now it's ready for evaluation
test_loss, test_acc = best_model.evaluate(tfidf_2gram_test_ds)
print(f"Test acc: {test_acc:.3f}")


Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 20000)]           0         
                                                                 
 dense_19 (Dense)            (None, 16)                320016    
                                                                 
 dropout_10 (Dropout)        (None, 16)                0         
                                                                 
 dense_20 (Dense)            (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 1: val_loss improved from inf to 0.25396, saving model to tfidf_2gram.weights.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.25396 to 0.

In [49]:
from tensorflow.keras import layers

max_length = 600 
max_tokens = 20000 
text_vectorization = layers.TextVectorization( max_tokens=max_tokens,
                                              output_mode="int",
                                              output_sequence_length=max_length,

) 

text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

int_val_ds = val_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4) 
int_test_ds = test_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [50]:

inputs = keras.Input(shape=(None,), dtype="int64") 
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs) 
x = layers.Bidirectional(layers.LSTM(32))(embedded) 
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()


callbacks = [
    keras.callbacks.ModelCheckpoint(
        "embeddings_bidir_gru.weights.h5",   # any filename; .h5 is conventional
        save_best_only=True,
        save_weights_only=True,      # <—  key line
        verbose=1
    )
]

model.fit(
    int_train_ds, 
    validation_data=int_val_ds, 
    epochs=10, 
    callbacks=callbacks) 


best_model = keras.models.clone_model(model)      # architecture only
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
best_model.load_weights("embeddings_bidir_gru.weights.h5")  # load the saved weights


print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Model: "model_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_3 (Bidirecti  (None, 64)                73984     
 onal)                                                           
                                                                 
 dropout_13 (Dropout)        (None, 64)                0         
                                                                 
 dense_23 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
__________________