<a href="https://colab.research.google.com/github/dominiksakic/bagOfWords/blob/main/bagOfWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download and unzip data
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  24.7M      0  0:00:03  0:00:03 --:--:-- 24.7M


In [13]:
!rm -r aclImdb/train/unsup
!pip install keras-tuner -q

rm: cannot remove 'aclImdb/train/unsup': No such file or directory
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

- Goal is to produce a ensambled model where each model is is optimized by the keras tuner.
  - Transformer, Sequence to Seq, and Bag of Words

In [15]:
import os, pathlib, shutil, random

# Create a Validation set
base_dir = pathlib.Path('aclImdb')
val_dir = base_dir / 'val' # No need for string concat, thanks to pathlib
train_dir = base_dir / 'train'
for category in ("neg", "pos"):
  os.makedirs(val_dir / category, exist_ok=True)
  files = os.listdir(train_dir / category)
  random.Random(1337).shuffle(files)
  num_val_samples = int(0.2 *len(files))
  val_files = files[-num_val_samples:]
  for fname in val_files:
    shutil.move(train_dir / category /fname, val_dir / category / fname)

In [16]:
from tensorflow import keras
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size)

val_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/val', batch_size=batch_size)

test_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/test', batch_size=batch_size)


Found 16000 files belonging to 2 classes.
Found 9000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [17]:
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
)
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [18]:
from tensorflow import keras
from tensorflow.keras import layers

def build_model(hp):
  max_tokens = 20000

  # Tune the numer of units in the first layer
  hidden_dim = hp.Int("hidden_dim", min_value=16, max_value=128, step=16)

  # Tune the dropout
  dropout_rate = hp.Float("float", min_value=0.1, max_value=0.6, step=0.1)

  # Tune the optimizer
  optimizer = hp.Choice("optimizer", values=["adam", "rmsprop", "sgd"])

  inputs = keras.Input(shape=(max_tokens,))
  x = layers.Dense(hidden_dim, activation="relu")(inputs)
  x = layers.Dropout(dropout_rate)(x)
  outputs = layers.Dense(1, activation="sigmoid")(x)

  model = keras.Model(inputs, outputs)
  model.compile(optimizer=optimizer,
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
  return model

In [None]:
import keras_tuner as kt
tuner = kt.BayesianOptimization(
    build_model,
    objective="val_accuracy",
    max_trials=10,
    directory="my_dir",
    project_name="binary_2gram_bayes"
)

tuner.search(binary_2gram_train_ds.cache(),
             validation_data=binary_2gram_val_ds.cache(),
             epochs=10,
             callbacks=[
                 keras.callbacks.EarlyStopping(patience=3)
             ])

Trial 2 Complete [00h 00m 34s]
val_accuracy: 0.8980000019073486

Best val_accuracy So Far: 0.8980000019073486
Total elapsed time: 00h 01m 28s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
48                |16                |hidden_dim
0.5               |0.1               |float
sgd               |rmsprop           |optimizer

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.6297 - loss: 0.6456 - val_accuracy: 0.8301 - val_loss: 0.4740
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8136 - loss: 0.4600 - val_accuracy: 0.8499 - val_loss: 0.3823
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.8466 - loss: 0.3799 - val_accuracy: 0.8598 - val_loss: 0.3425
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.8693 - loss: 0.3279 - val_accuracy: 0.8681 -

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]

callbacks = [
    keras.callbacks.ModelCheckpoint("binary_2gram.keras", save_best_only=True)
]

best_model.fit(binary_2gram_train_ds.cache(),
               validation_data=binary_2gram_val_ds.cache(),
               epochs=10,
               callbacks=callbacks)

# Load and evaluate
model = keras.models.load_model("binary_2gram.keras")
print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")


In [None]:
# Get the best hyperparameters
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]


print("\nBest hyperparameters:")
for param in best_hp.values:
    print(f"{param}: {best_hp.get(param)}")
