In [6]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0 80.2M    0 49152    0     0  22444      0  1:02:28  0:00:02  1:02:26 22464
  0 80.2M    0  240k    0     0  79756      0  0:17:34  0:00:03  0:17:31 79792
  0 80.2M    0  816k    0     0   196k      0  0:06:57  0:00:04  0:06:53  197k
  1 80.2M    1 1552k    0     0   305k      0  0:04:29  0:00:05  0:04:24  310k
  2 80.2M    2 2448k    0     0   399k      0  0:03:25  0:00:06  0:03:19  501k
  4 80.2M    4 3600k    0     0   508k      0  0:02:41  0:00:07  0:02:34  726k
  6 80.2M    6 5232k    0     0   643k      0  0:02:07  0:00:08  0:01:59  989k
  8 80.2M    8 7104k    0     0   781k      0  0:01

In [9]:
import os, pathlib, shutil, random
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
 os.makedirs(val_dir / category)
 files = os.listdir(train_dir / category)
 random.Random(1337).shuffle(files)
 num_val_samples = int(0.2 * len(files))
 val_files = files[-num_val_samples:]
 for fname in val_files:
    shutil.move(train_dir / category / fname,
        val_dir / category / fname)

In [11]:
from tensorflow import keras
batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
 "aclImdb/train", batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
 "aclImdb/val", batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
 "aclImdb/test", batch_size=batch_size)


Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [12]:
for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'The absolute worst of the trilogy, not even comedy could save it ... Unrealistic, unbelievable, loud sudden sounds the only scary component. Really should have quit after one ... No. two was bad but not in the ball park compared to this disaster. Nev Campbell the only saving grace with the four votes given placed solely on her performance having to work under atrocious conditions.', shape=(), dtype=string)
targets[0]: tf.Tensor(2, shape=(), dtype=int32)


In [14]:
from tensorflow.keras.layers import TextVectorization
import re
import string
import tensorflow as tf
def custom_standardization_fn(string_tensor):
 lowercase_string = tf.strings.lower(string_tensor)
 return tf.strings.regex_replace(
     lowercase_string, f"[{re.escape(string.punctuation)}]", "")

def custom_split_fn(string_tensor):
 return tf.strings.split(string_tensor)

text_vectorization = TextVectorization(
 output_mode="int",
 standardize=custom_standardization_fn,
 split=custom_split_fn,
)


dataset = [
 "I write, erase, rewrite",
 "Erase again, and then",
 "A poppy blooms.",
]
text_vectorization.adapt(dataset)

In [15]:
text_vectorization = TextVectorization(
 max_tokens=20000,
 output_mode="multi_hot",
)
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)


In [19]:
binary_1gram_train_ds = train_ds.map(
 lambda x, y: (text_vectorization(x), y),
 num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(
 lambda x, y: (text_vectorization(x), y),
 num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(
 lambda x, y: (text_vectorization(x), y),
 num_parallel_calls=4)

In [20]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(2, shape=(), dtype=int32)


In [23]:
from tensorflow import keras
from tensorflow.keras import layers
def get_model(max_tokens=20000, hidden_dim=16):
 inputs = keras.Input(shape=(max_tokens,))
 x = layers.Dense(hidden_dim, activation="relu")(inputs)
 x = layers.Dropout(0.5)(x)
 outputs = layers.Dense(1, activation="sigmoid")(x)
 model = keras.Model(inputs, outputs)
 model.compile(optimizer="rmsprop",
 loss="binary_crossentropy",
 metrics=["accuracy"])
 return model

In [25]:
model = get_model()
model.summary()
callbacks = [
 keras.callbacks.ModelCheckpoint("binary_1gram.keras",
 save_best_only=True)
]
model.fit(binary_1gram_train_ds.cache(),
validation_data=binary_1gram_val_ds.cache(),
 epochs=10,
 callbacks=callbacks)
model = keras.models.load_model("binary_1gram.keras")

print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.500


# TF and IDF
![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)