In [1]:
import os
import re
import string
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

import zipfile

# Record the TensorFlow version in the notebook output next to the results.
print(f"TF version: {tf.__version__}")

TF version: 2.14.0


In [2]:
# def archive(dir: Path):
#     with zipfile.ZipFile(f"{dir}.zip", "w", zipfile.ZIP_STORED) as zip_file:
#         for entry in dir.rglob("*"):
#             zip_file.write(entry, entry.relative_to(dir))
# 
# 
# def unarchive(file: Path):
#     with zipfile.ZipFile(file, "r") as zip_file:
#         zip_file.extractall(file.with_suffix(""))

In [3]:
def download_file(url, dataset_file_path):
    """Download ``url`` to ``dataset_file_path`` unless that file already exists.

    The payload is streamed into a sibling ``<name>.part`` file and moved to
    the final name only after the transfer has completed, so an interrupted
    download never leaves a truncated file that a later run would mistake for
    a finished one.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        dataset_file_path: Destination path (``str`` or ``Path``). Missing
            parent directories are created.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the destination cannot be written.
    """
    path = Path(dataset_file_path)
    os.makedirs(path.parent, exist_ok=True)
    if path.exists():
        print(f"File {path} exists")
        return

    print(f"Downloading {path}")
    partial_path = path.with_name(path.name + ".part")
    try:
        with urlopen(url) as fsrc, open(partial_path, "wb") as fdst:
            copyfileobj(fsrc, fdst)
        # The final name only ever refers to a completely written file.
        os.replace(partial_path, path)
    finally:
        # Clean up after a failed transfer; nothing is left after a successful move.
        if partial_path.exists():
            partial_path.unlink()

In [33]:
# Where the downloaded reviews CSV is cached on disk.
dataset_path = "tmp/text_classification.csv"
# Mini-batch size, used both when reading the CSV and when fitting the model.
batch_size = 32
# Seed for the CSV shuffle and, through set_random_seed below, for the
# Python, NumPy and TensorFlow random generators.
seed = 123
# Number of token ids per model input (window width and padding length).
max_seq_length = 100
# Vocabulary limit handed to the tokenizer as num_words.
vocab_size = 20000

# Without this only the CSV shuffle is seeded; weight initialisation and the
# per-epoch shuffling done by model.fit would differ from run to run.
tf.keras.utils.set_random_seed(seed)

In [5]:
# Fetch the reviews CSV; download_file skips the transfer when the file is already on disk.
reviews_csv_url = "https://raw.githubusercontent.com/sealuzh/user_quality/master/csv_files/reviews.csv"
download_file(reviews_csv_url, dataset_path)

Downloading tmp\text_classification.csv


In [6]:
# Read the CSV as shuffled batches, keeping only the review text and its star
# rating; every make_csv_dataset option not listed here is left at its default.
dataset = tf.data.experimental.make_csv_dataset(
    dataset_path,
    batch_size=batch_size,
    label_name=None,  # "star" stays a regular column; it is split out in map() below
    select_columns=["review", "star"],
    num_epochs=1,  # one pass over the file
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=seed,
).map(
    # Explicit cast instead of float(): the mapped function is traced into a
    # graph, where float() on a tensor only works via AutoGraph's rewriting.
    lambda row: (row["review"], tf.cast(row["star"], tf.float32))
)

In [7]:
# Peek at the first batch, converted to NumPy, to sanity-check the (review, rating) pairs.
next(dataset.as_numpy_iterator())

(array([b'fine', b'good',
        b'keeps crashing i have the bios flash and startup and game keeps crashing',
        b'cake calling it a roguelike is a bit of exaggeration  altough its definitely the best rpg on droid  too bad its so short',
        b'file manager very very very good',
        b'fairly well optimized im either underestimating hardware advancements or simply pleased with the optimization of this emulator running sonic adventure 2 on a cheap 30 amazon fire a little bit of frame rate and audio issues but still great',
        b'hate it', b'solved my issue nicely done good job guys',
        b'heavy ram usage and battery drain this is a great extention for dashclock thanks for your effort really appreciate if you could kindly tweak the heavy battery drain and ram usage',
        b'kind of good in sonic adventure 1 is all great and good but for sonic adventure 2  with knuckles i cant defeat king boom boo because the emulator doesnt show the shadow when he hides from the l

In [8]:
# Materialise the batched tf.data pipeline as a flat Python list of
# (review, rating) pairs so that it can be sliced into splits by index.
# The type check keeps this cell re-runnable: once `dataset` is a list it has
# no .unbatch(), so running the bare assignment twice raises AttributeError.
if not isinstance(dataset, list):
    dataset = list(dataset.unbatch())

In [9]:
# Keep the first 90% of the (already shuffled) examples for training and
# hold out the remainder for validation.
split_size = 0.9
train_count = int(split_size * len(dataset))
train_dataset, validation_dataset = dataset[:train_count], dataset[train_count:]
print(len(train_dataset), len(validation_dataset))

259258 28807


In [10]:
# The TextVectorization layer is not supported by the h5 model format, so the Keras Tokenizer is used below instead.

In [11]:
# Decode every review text from UTF-8 bytes to str.
# NOTE(review): this covers the whole dataset, i.e. training and validation examples alike.
reviews = [text.numpy().decode("utf-8") for text, _ in dataset]

In [12]:
# Build the word index from the decoded reviews. `num_words` caps the
# vocabulary used when texts are later converted to id sequences, and
# "<OOV>" is the placeholder token for words outside of it.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(reviews)

In [31]:
def sliding_window(lst, width, step):
    """Cut ``lst`` into windows of ``width`` items whose starts are ``step`` apart.

    The first window, ``lst[:width]``, is always returned, even when ``lst`` is
    shorter than ``width`` (or empty). Further windows are produced only while
    a full ``width`` items remain, so items near the end may not appear in any
    window.

    Args:
        lst: Sequence supporting ``len`` and slicing.
        width: Window length; must be positive.
        step: Distance between the starts of consecutive windows; must be positive.

    Returns:
        A list of slices of ``lst``.
    """
    assert width > 0
    assert step > 0
    # Start offsets after the first window that still leave room for `width` items.
    later_starts = range(step, len(lst) - width + 1, step)
    return [lst[:width]] + [lst[start:start + width] for start in later_starts]


# Quick check: width 3, step 4 -> [[1, 2, 3], [5, 6, 7]]
sliding_window([1, 2, 3, 4, 5, 6, 7], 3, 4)

In [14]:
def texts_to_sequences_with_ratings(dataset, step=10):
    """Tokenise reviews, cut them into fixed-width windows and pad the result.

    Each review is converted to token ids with the notebook-level ``tokenizer``
    and split by ``sliding_window`` into windows of ``max_seq_length`` ids.
    Every window inherits the rating and the text of the review it came from.

    Args:
        dataset: Iterable of ``(review, rating)`` pairs, where ``review``
            provides ``.numpy()`` returning UTF-8 encoded bytes.
        step: Distance between the starts of consecutive windows of one review.
            Defaults to 10, the value that used to be hard-coded here.

    Returns:
        A ``(reviews, padded, ratings)`` tuple whose three elements all have one
        entry per window, in the same order: the decoded text of the source
        review, the padded id matrix returned by ``pad_sequences``, and the
        rating. Earlier, ``reviews`` held one entry per review instead, so it
        stopped lining up with the other two as soon as any review produced
        more than one window.
    """
    # Single pass over the dataset to collect texts and ratings together.
    texts, ratings = [], []
    for review, rating in dataset:
        texts.append(review.numpy().decode("utf-8"))
        ratings.append(rating)

    sequences = tokenizer.texts_to_sequences(texts)

    window_texts, windows, window_ratings = [], [], []
    for text, sequence, rating in zip(texts, sequences, ratings):
        for window in sliding_window(sequence, max_seq_length, step):
            window_texts.append(text)
            windows.append(window)
            window_ratings.append(rating)

    # padding=post is required for cuDNN LSTM implementation
    padded = tf.keras.utils.pad_sequences(
        windows,
        maxlen=max_seq_length,
        padding="post",
        truncating="post",
    )
    return window_texts, padded, window_ratings


# Tokenise, window and pad each split. Every call returns a
# (decoded reviews, padded token-id windows, ratings) tuple.
train_reviews, train_reviews_padded, train_ratings = texts_to_sequences_with_ratings(train_dataset)
validation_reviews, validation_reviews_padded, validation_ratings = texts_to_sequences_with_ratings(validation_dataset)

In [14]:
# Read the vocabulary limit back from the tokenizer (its num_words setting) and display it.
vocab_size = tokenizer.num_words
vocab_size

10000

In [15]:
# Embedding table size: the vocabulary limit plus two extra rows, intended for
# the OOV token and for index 0, which represents padding and is masked.
# NOTE(review): Keras' Tokenizer appears to emit only ids below num_words (the
# OOV id counts toward that limit), so the +2 may merely over-allocate two
# unused rows. Harmless, but confirm before relying on it.
input_dim = vocab_size + 2

In [16]:
# The TextVectorization layer is not supported by the h5 model format that is needed for the exam
# text_vectorization_layer = tf.keras.layers.TextVectorization(
#     # ngrams=3,
#     # output_sequence_length=5,
# )

In [17]:
# text_vectorization_layer.adapt(
#     train_dataset.concatenate(validation_dataset).map(lambda review, label: review)
# )

In [18]:
# text_vectorization_layer.get_vocabulary()

In [19]:
# text_vectorization_layer(
#     np.array(["one two three four five six seven", "two", "three"])
# )

In [20]:
# vocab_size = len(text_vectorization_layer.get_vocabulary())
# vocab_size

In [21]:
# input_dim = vocab_size

In [22]:
# Regression network assembled layer by layer:
# token ids -> 16-d embeddings -> bidirectional LSTM -> dense head -> one rating value.
model = tf.keras.Sequential()
# mask_zero=True marks id 0 (the padding value) as masked for the following layers.
model.add(layers.Embedding(input_dim, 16, mask_zero=True, input_length=max_seq_length))
model.add(layers.Bidirectional(layers.LSTM(32)))
model.add(layers.Dense(16, activation="relu"))
# A single unit without activation: the rating is predicted as a real number.
model.add(layers.Dense(1))

In [23]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160032    
                                                                 
 bidirectional (Bidirection  (None, 64)                12544     
 al)                                                             
                                                                 
 dense (Dense)               (None, 16)                1040      
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 173633 (678.25 KB)
Trainable params: 173633 (678.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
# AdamW at a 1e-3 learning rate, minimising mean squared error on the rating;
# mean absolute error is reported alongside as a metric.
optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])

In [25]:
# checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=f"tmp/checkpoints",
#     save_weights_only=False,
#     monitor=f"val_acc",
#     mode="max",
#     save_best_only=True,
# )
# Lower the learning rate once the validation loss stops improving.
# The monitored quantity is a loss, so improvement means a decrease and the
# mode has to be "min"; mode="max" would count a falling val_loss as a lack of
# progress and reduce the learning rate while the model is still getting better.
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
)

In [None]:
epochs = 3

# Convert inputs and targets to NumPy arrays up front so the fit call stays readable.
train_inputs = np.array(train_reviews_padded)
train_targets = np.array(train_ratings)
validation_inputs = np.array(validation_reviews_padded)
validation_targets = np.array(validation_ratings)

history = model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(validation_inputs, validation_targets),
    callbacks=[reduce_lr_callback],
)

Epoch 1/3

```
Original

Epoch 1/3
8102/8102 [==============================] - 148s 17ms/step - loss: 1.2404 - mae: 0.7918 - val_loss: 1.0588 - val_mae: 0.7061 - lr: 0.0010
Epoch 2/3
8102/8102 [==============================] - 97s 12ms/step - loss: 0.9636 - mae: 0.6815 - val_loss: 1.0458 - val_mae: 0.7251 - lr: 0.0010
Epoch 3/3
8102/8102 [==============================] - 93s 11ms/step - loss: 0.9083 - mae: 0.6534 - val_loss: 1.0417 - val_mae: 0.7045 - lr: 0.0010



With sliding_window of width=30

Epoch 1/3
26639/26639 [==============================] - 284s 10ms/step - loss: 1.0025 - mae: 0.7239 - val_loss: 1.7576 - val_mae: 0.9909 - lr: 0.0010
Epoch 2/3
26639/26639 [==============================] - 242s 9ms/step - loss: 0.5891 - mae: 0.5121 - val_loss: 1.9097 - val_mae: 1.0091 - lr: 0.0010
Epoch 3/3
26639/26639 [==============================] - 249s 9ms/step - loss: 0.4590 - mae: 0.4290 - val_loss: 1.8785 - val_mae: 0.9827 - lr: 0.0010



With sliding_window of width=100, step=10

Epoch 1/3
8178/8178 [==============================] - 165s 19ms/step - loss: 1.2933 - mae: 0.8060 - val_loss: 1.0728 - val_mae: 0.7669 - lr: 0.0010
Epoch 2/3
8178/8178 [==============================] - 106s 13ms/step - loss: 0.9541 - mae: 0.6773 - val_loss: 1.0577 - val_mae: 0.7357 - lr: 0.0010
Epoch 3/3
8178/8178 [==============================] - 105s 13ms/step - loss: 0.8883 - mae: 0.6438 - val_loss: 1.0535 - val_mae: 0.6935 - lr: 0.0010

```

In [None]:
# Show the first ten entries of each validation list side by side:
# (review text, model output, rating).
preview_count = 10
preview_predictions = model(validation_reviews_padded[:preview_count])
list(zip(validation_reviews[:preview_count], preview_predictions, validation_ratings[:preview_count]))

In [None]:
# epochs = 1
# history = model.fit(
#     train_dataset,
#     validation_data=validation_dataset,
#     epochs=epochs,
#     callbacks=[reduce_lr_callback],
# )

In [None]:
# Training curves: MAE in the top panel, loss in the bottom one.
mae = history.history["mae"]
val_mae = history.history["val_mae"]

loss = history.history["loss"]
val_loss = history.history["val_loss"]

# Derive the x axis from the recorded history rather than the requested
# `epochs`, so it always matches the curves (e.g. after an interrupted or
# early-stopped fit, where fewer epochs were logged).
epochs_range = range(len(loss))

fig, (mae_ax, loss_ax) = plt.subplots(2, 1, figsize=(8, 8))

mae_ax.plot(epochs_range, mae, label="Training MAE")
mae_ax.plot(epochs_range, val_mae, label="Validation MAE")
mae_ax.set(title="Training and Validation MAE", xlabel="Epoch", ylabel="MAE")
mae_ax.legend(loc="lower left")

loss_ax.plot(epochs_range, loss, label="Training Loss")
loss_ax.plot(epochs_range, val_loss, label="Validation Loss")
loss_ax.set(title="Training and Validation Loss", xlabel="Epoch", ylabel="Loss")
loss_ax.legend(loc="lower left")

# Keep the axis labels of the top panel clear of the bottom panel's title.
fig.tight_layout()
plt.show()

In [None]:
# Persist the trained model as a single HDF5 file.
model_path = "tmp/model.h5"
model.save(model_path, save_format="h5")