In [7]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    "aclImdb_v1",url,
    untar=True,cache_dir='.',
    cache_subdir=''
)
dataset_dir = os.path.join(os.path.dirname(dataset),'aclImdb')

In [8]:
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [9]:
train_dir = os.path.join(dataset_dir,"train")
os.listdir(train_dir)


['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [10]:
shutil.rmtree(os.path.join(train_dir,"unsup"))

In [15]:
batch_size = 32
seed = 42
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [17]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [18]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
 'aclImdb/test',
 batch_size=batch_size)

Found 25000 files belonging to 2 classes.


In [27]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase,'<br />','')
    return tf.strings.regex_replace(
    stripped_html,
    f'[{re.escape(string.punctuation)}]',
    ''
    )

In [28]:
max_features = 10000
sequence_length = 250
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
    )

In [29]:
train_text = raw_train_ds.map(lambda x,y : x)
vectorize_layer.adapt(train_text)

In [31]:
def vectorize_text(text,label):
    text = tf.expand_dims(text,-1)
    return vectorize_layer(text),label

In [35]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


In [36]:
embedding_dim = 16
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1,embedding_dim),
    layers.Conv1D(8,7,activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(8,activation="relu"),
    layers.Dense(1)]
)
model.summary()


In [40]:
model.compile(
optimizer='adam',
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy']
)
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.TensorBoard(log_dir="logs")
    ]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.5460 - loss: 0.6353 - val_accuracy: 0.8336 - val_loss: 0.3648
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8417 - loss: 0.3644 - val_accuracy: 0.8642 - val_loss: 0.3072
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8787 - loss: 0.2938 - val_accuracy: 0.8738 - val_loss: 0.2917
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8988 - loss: 0.2465 - val_accuracy: 0.8722 - val_loss: 0.2882
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9163 - loss: 0.2132 - val_accuracy: 0.8744 - val_loss: 0.2877
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9265 - loss: 0.1881 - val_accuracy: 0.8760 - val_loss: 0.2940
Epoch 7/10
[1m625/625[0m

<keras.src.callbacks.history.History at 0x1fe9f6ad090>

In [41]:
loss,accuracy = model.evaluate(test_ds)
print("Loss:",loss)
print("Accuracy:",accuracy)


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - accuracy: 0.8494 - loss: 0.3984
Loss: 0.4096298813819885
Accuracy: 0.8469600081443787


In [43]:
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])
export_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer="adam",metrics=['accuracy']
)



In [44]:
reviews = tf.constant(['The movie is very boring','A Good Movie','very bad','worst movie',
'Worst movie, boring' ])
print("## Inference")
res = export_model(reviews)
for review, val in zip(reviews, res):
    review = review.numpy().decode( )
    val = val.numpy().squeeze()
    print(f"{review:<30}:{val:>.3f}")

## Inference
The movie is very boring      :0.674
A Good Movie                  :0.892
very bad                      :0.891
worst movie                   :0.855
Worst movie, boring           :0.768
