In [1]:
import os
from shutil import copyfileobj
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from pathlib import Path
from urllib.request import urlopen
import string
import re

# Show which TensorFlow release this kernel is running.
print(f"TF version: {tf.__version__}")

TF version: 2.14.0


In [2]:
# def archive(dir: Path):
#     with zipfile.ZipFile(f"{dir}.zip", "w", zipfile.ZIP_STORED) as zip_file:
#         for entry in dir.rglob("*"):
#             zip_file.write(entry, entry.relative_to(dir))
#
#
# def unarchive(file: Path):
#     with zipfile.ZipFile(file, "r") as zip_file:
#         zip_file.extractall(file.with_suffix(""))

In [3]:
def download_file(url, dataset_file_path):
    """Download ``url`` to ``dataset_file_path`` unless that file already exists.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to the final path once the copy has finished. An interrupted or
    failed download therefore never leaves a truncated file behind that a
    later run would mistake for a complete one.

    Args:
        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
        dataset_file_path: Destination path (``str`` or ``Path``). Missing
            parent directories are created.

    Raises:
        OSError: If the URL cannot be opened or the file cannot be written
            (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    path = Path(dataset_file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        print(f"File {path} exists")
        return

    print(f"Downloading {path}")
    partial_path = path.with_name(path.name + ".part")
    try:
        with urlopen(url) as fsrc, open(partial_path, "wb") as fdst:
            copyfileobj(fsrc, fdst)
        # Rename only after the copy completed, so `path` is always whole.
        os.replace(partial_path, path)
    except BaseException:
        # Drop the incomplete download (also on KeyboardInterrupt) and re-raise.
        partial_path.unlink(missing_ok=True)
        raise

In [26]:
# Local path the reviews CSV is downloaded to and read from.
dataset_path = "tmp/text_classification.csv"
# Batch size used both when reading the CSV and when fitting the model.
batch_size = 2048
# Seed passed to the CSV reader's shuffle.
seed = 123
# Window width and padded length of every token sequence fed to the model.
max_seq_length = 100
# Passed to the tokenizer as num_words.
vocab_size = 20000

In [27]:
# Fetch the reviews CSV; download_file skips the request when the file is already present.
reviews_csv_url = "https://raw.githubusercontent.com/sealuzh/user_quality/master/csv_files/reviews.csv"
download_file(reviews_csv_url, dataset_path)

File tmp/text_classification.csv exists


In [28]:
# Read only the review text and its star rating from the CSV, shuffled with a
# fixed seed, and turn each batch into a (review, rating) pair.
# The rating is converted with an explicit tf.cast: calling the Python builtin
# float() on a symbolic tensor only works when AutoGraph manages to rewrite
# the lambda, which is an implicit dependency that is easy to break.
dataset = tf.data.experimental.make_csv_dataset(
    dataset_path,
    batch_size=batch_size,
    label_name=None,
    select_columns=["review", "star"],
    num_epochs=1,
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=seed,
).map(lambda x: (x["review"], tf.cast(x["star"], tf.float32)))

In [29]:
# Peek at the first batch as NumPy arrays to sanity-check the (review, rating) pairs.
next(dataset.as_numpy_iterator())

(array([b'Fine', b'Good',
        b'Keeps Crashing i have the bios flash and startup and game keeps crashing',
        ...,
        b"like it but I'm trying to work the BIOS and make Dreamcast games work. Give me some help with this.",
        b"Good but the mic emulation no work good enough on nexus 5 seaman can rarely understand me an usually thinks I'm yelling at him",
        b'Nice Patelnavin'], dtype=object),
 array([5., 5., 1., ..., 4., 4., 5.], dtype=float32))

In [30]:
# Materialize the pipeline into a Python list of (review, rating) examples so
# it can be sliced for the train/validation split.
# The guard keeps this cell re-runnable: once `dataset` has been rebound to a
# list it no longer has `.unbatch()`, and an unguarded second run would raise
# AttributeError.
if not isinstance(dataset, list):
    dataset = list(dataset.unbatch())

In [31]:
# Hold out the trailing (1 - split_size) share of the examples for validation.
split_size = 0.9
train_count = int(len(dataset) * split_size)
train_dataset, validation_dataset = dataset[:train_count], dataset[train_count:]
print(len(train_dataset), len(validation_dataset))

259258 28807


In [32]:
# TextVectorization layer is not supported by h5 model format

In [33]:
# Texts used to fit the tokenizer vocabulary. Only the training split is used:
# fitting on the full dataset would let validation reviews influence which
# words make it into the vocabulary (leakage into the evaluation).
reviews = [review.numpy().decode("utf-8") for review, rating in train_dataset]

In [34]:
# Tokenizer configured with num_words=vocab_size and an explicit "<OOV>" token,
# then fitted on the decoded review texts collected in `reviews`.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)

In [35]:
def sliding_window(lst, width, step):
    """Split ``lst`` into windows of ``width`` items taken every ``step`` items.

    The first window ``lst[:width]`` is always returned, even when ``lst`` is
    shorter than ``width`` (it is then simply shorter, possibly empty). Every
    following window is full-width; trailing items that do not fill a complete
    window are not included.

    Args:
        lst: Sliceable sequence to split.
        width: Window length; must be positive.
        step: Offset between the starts of consecutive windows; must be positive.

    Returns:
        List of slices of ``lst``.
    """
    assert width > 0
    assert step > 0
    # Start offsets of every full-width window after the first one.
    later_starts = range(step, len(lst) - width + 1, step)
    return [lst[:width]] + [lst[start:start + width] for start in later_starts]

In [36]:
def texts_to_sequences_with_ratings(dataset, window_step=10):
    """Turn (review, rating) examples into padded token windows with ratings.

    Each review is tokenized with the module-level ``tokenizer`` and cut into
    windows of ``max_seq_length`` tokens by ``sliding_window``. A review that
    yields several windows contributes several rows, all carrying its rating.

    The three returned collections are row-aligned: entry ``i`` of each one
    belongs to the same window. Returning one review per *review* instead
    (as this function used to) breaks any positional pairing with the padded
    rows as soon as one review produces more than one window.

    Args:
        dataset: Iterable of ``(review, rating)`` pairs where ``review``
            exposes ``.numpy()`` returning UTF-8 encoded bytes.
        window_step: Offset in tokens between consecutive windows of one review.

    Returns:
        Tuple ``(reviews, padded, ratings)``: the decoded review text for each
        window, the result of ``pad_sequences`` over all windows (post-padded
        and post-truncated to ``max_seq_length``), and the rating for each window.
    """
    # Single pass over the examples instead of one pass per column.
    texts = []
    ratings = []
    for review, rating in dataset:
        texts.append(review.numpy().decode("utf-8"))
        ratings.append(rating)

    sequences = tokenizer.texts_to_sequences(texts)

    window_reviews = []
    windows = []
    window_ratings = []
    for text, sequence, rating in zip(texts, sequences, ratings):
        for window in sliding_window(sequence, max_seq_length, window_step):
            window_reviews.append(text)
            windows.append(window)
            window_ratings.append(rating)

    # padding=post is required for cuDNN LSTM implementation
    padded = tf.keras.utils.pad_sequences(
        windows,
        maxlen=max_seq_length,
        padding="post",
        truncating="post",
    )
    return window_reviews, padded, window_ratings


train_reviews, train_reviews_padded, train_ratings = texts_to_sequences_with_ratings(train_dataset)
validation_reviews, validation_reviews_padded, validation_ratings = texts_to_sequences_with_ratings(validation_dataset)

In [37]:
# Number of padded training rows; one review can contribute several windows,
# so this can exceed the number of training reviews.
len(train_reviews_padded)

261788

In [38]:
# Re-read the vocabulary limit from the tokenizer itself and display it.
vocab_size = tokenizer.num_words
vocab_size

20000

In [39]:
# Size of the embedding table: the vocabulary limit plus two spare rows.
# NOTE(review): the Keras Tokenizer appears to emit only indices below
# num_words (0 = padding/mask, 1 = OOV), so the "+ 2" looks like headroom
# rather than a requirement — confirm before relying on it.
input_dim = vocab_size + 2

In [40]:
# Regression network: masked token embedding -> bidirectional LSTM ->
# small dense head with a single linear output.
model = tf.keras.Sequential()
model.add(layers.Embedding(input_dim, 16, mask_zero=True, input_length=max_seq_length))
model.add(layers.Bidirectional(layers.LSTM(32)))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(1))

In [41]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 16)           320032    
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                12544     
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 16)                1040      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 333633 (1.27 MB)
Trainable params: 333633 (1.27 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [42]:
# Train as a regression on the rating: optimize MSE, report MAE.
optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])

In [43]:
# checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=f"tmp/checkpoints",
#     save_weights_only=False,
#     monitor=f"val_acc",
#     mode="max",
#     save_best_only=True,
# )
# Lower the learning rate when the validation loss stops improving.
# val_loss is an error measure, so "improving" means decreasing: the mode must
# be "min". With "max" the callback treats a falling loss as a plateau and
# cuts the learning rate while the model is still getting better.
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
)

In [None]:
epochs = 10

# Convert the prepared windows and ratings to NumPy arrays once, up front.
train_inputs = np.array(train_reviews_padded)
train_targets = np.array(train_ratings)
validation_arrays = (
    np.array(validation_reviews_padded),
    np.array(validation_ratings),
)

history = model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=validation_arrays,
    callbacks=[reduce_lr_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
 23/128 [====>.........................] - ETA: 6s - loss: 1.0179 - mae: 0.7339

In [None]:
# Show a few validation rows as (review text, model output, true rating).
# NOTE(review): the rows only line up if validation_reviews has exactly one
# entry per row of validation_reviews_padded.
preview_count = 10
preview_predictions = model(validation_reviews_padded[:preview_count])
list(zip(
    validation_reviews[:preview_count],
    preview_predictions,
    validation_ratings[:preview_count],
))

In [None]:
# Despite the names, `acc` / `val_acc` hold the mean absolute error (the
# compiled metric), not an accuracy; the names are kept for compatibility.
acc = history.history["mae"]
val_acc = history.history["val_mae"]

loss = history.history["loss"]
val_loss = history.history["val_loss"]

# Take the x-axis from the epochs actually recorded. Using the requested
# `epochs` raises a length mismatch in plot() whenever the history is shorter.
epochs_range = range(len(loss))

fig, (mae_ax, loss_ax) = plt.subplots(2, 1, figsize=(8, 8))

mae_ax.plot(epochs_range, acc, label="Training MAE")
mae_ax.plot(epochs_range, val_acc, label="Validation MAE")
mae_ax.set_xlabel("Epoch")
mae_ax.set_ylabel("MAE")
mae_ax.legend(loc="lower left")
mae_ax.set_title("Training and Validation MAE")

loss_ax.plot(epochs_range, loss, label="Training Loss")
loss_ax.plot(epochs_range, val_loss, label="Validation Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss (MSE)")
loss_ax.legend(loc="lower left")
loss_ax.set_title("Training and Validation Loss")

# Keep the upper panel's x-label from overlapping the lower panel's title.
fig.tight_layout()
plt.show()

In [None]:
# Persist the trained model to tmp/model.h5 using the "h5" save format.
model.save("tmp/model.h5", save_format="h5")